tesseract 4.1.1
Loading...
Searching...
No Matches
tesstrain.py
Go to the documentation of this file.
#!/usr/bin/env python3

# (C) Copyright 2014, Google Inc.
# (C) Copyright 2018, James R Barlow
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script provides an easy way to execute various phases of training
# Tesseract. For a detailed description of the phases, see
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
18
import logging
import os
import sys

# This script uses f-strings, so refuse to run on interpreters older than 3.6.
if sys.version_info < (3, 6):
    raise Exception("Must be using Python minimum version 3.6!")
25
# Put this script's own directory first on the module search path before
# importing the helper modules below.
sys.path.insert(0, os.path.dirname(__file__))
from tesstrain_utils import (
    parse_flags,
    initialize_fontconfig,
    phase_I_generate_image,
    phase_UP_generate_unicharset,
    phase_E_extract_features,
    make_lstmdata,
    cleanup,
)
import language_specific

# Root logger; the setup_logging_* helpers attach their handlers to it.
log = logging.getLogger()
39
40
def setup_logging_console():
    """Attach a console handler to the module-level logger ``log``.

    The logger itself is set to DEBUG so that each handler decides what it
    keeps; the console handler only emits INFO and above, formatted as
    ``[HH:MM:SS] LEVEL - message``.
    """
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(
        logging.Formatter(
            "[%(asctime)s] %(levelname)s - %(message)s", datefmt="%H:%M:%S"
        )
    )
    log.addHandler(console)
50
51
def setup_logging_logfile(logfile):
    """Attach a file handler to the module-level logger ``log``.

    The handler records DEBUG and above and includes the logger name in
    every record.

    Args:
        logfile: Path of the log file, passed to ``logging.FileHandler``.
    """
    # Use a separate name for the handler instead of rebinding the
    # ``logfile`` parameter, so the path stays available and readable.
    file_handler = logging.FileHandler(logfile)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(
        logging.Formatter("[%(asctime)s] - %(levelname)s - %(name)s - %(message)s")
    )
    log.addHandler(file_handler)
60
61
def main():
    """Run the LSTM training-data pipeline for one language.

    Parses the command-line flags, sets up logging, and then runs the phases
    in order: fontconfig initialisation, image generation, unicharset
    generation, feature extraction, lstmdata creation and cleanup.

    Returns:
        0 on success. The process exits with status 1 instead when
        ``ctx.linedata`` is not set.
    """
    # NOTE(review): this call was missing from the listing (gap at line 63).
    # setup_logging_console is defined in this file and is otherwise never
    # invoked, so it is restored here ahead of any log output.
    setup_logging_console()
    ctx = parse_flags()
    setup_logging_logfile(ctx.log_file)
    if not ctx.linedata:
        log.error("--linedata_only is required since only LSTM is supported")
        sys.exit(1)

    log.info(f"=== Starting training for language {ctx.lang_code}")
    # NOTE(review): this call was missing from the listing (gap at line 71).
    # Reconstructed from the language_specific import and the legacy
    # "set_lang_specific_parameters LANG_CODE" step; confirm that the helper
    # returns the updated context.
    ctx = language_specific.set_lang_specific_parameters(ctx, ctx.lang_code)

    initialize_fontconfig(ctx)
    phase_I_generate_image(ctx, par_factor=8)
    phase_UP_generate_unicharset(ctx)

    if ctx.linedata:
        phase_E_extract_features(ctx, ["--psm", "6", "lstm.train"], "lstmf")
        make_lstmdata(ctx)

    cleanup(ctx)
    log.info("All done!")
    return 0
84
85
if __name__ == "__main__":
    # Pass main()'s return value to the shell as the process exit status.
    sys.exit(main())
88
# _rc0 = subprocess.call(["tlog","\n=== Starting training for language '"+str(LANG_CODE.val)+"'"],shell=True)
# _rc0 = subprocess.call(["source",os.popen("dirname "+__file__).read().rstrip("\n")+"/language-specific.sh"],shell=True)
# _rc0 = subprocess.call(["set_lang_specific_parameters",str(LANG_CODE.val)],shell=True)
# _rc0 = subprocess.call(["initialize_fontconfig"],shell=True)
# _rc0 = subprocess.call(["phase_I_generate_image","8"],shell=True)
# _rc0 = subprocess.call(["phase_UP_generate_unicharset"],shell=True)
# if (LINEDATA ):
# subprocess.call(["phase_E_extract_features"," --psm 6 lstm.train ","8","lstmf"],shell=True)
# subprocess.call(["make__lstmdata"],shell=True)
# subprocess.call(["tlog","\nCreated starter traineddata for language '"+str(LANG_CODE.val)+"'\n"],shell=True)
# subprocess.call(["tlog","\nRun lstmtraining to do the LSTM training for language '"+str(LANG_CODE.val)+"'\n"],shell=True)
# else:
# subprocess.call(["phase_D_generate_dawg"],shell=True)
# subprocess.call(["phase_E_extract_features","box.train","8","tr"],shell=True)
# subprocess.call(["phase_C_cluster_prototypes",str(TRAINING_DIR.val)+"/"+str(LANG_CODE.val)+".normproto"],shell=True)
# if (str(ENABLE_SHAPE_CLUSTERING.val) == "y" ):
# subprocess.call(["phase_S_cluster_shapes"],shell=True)
# subprocess.call(["phase_M_cluster_microfeatures"],shell=True)
# subprocess.call(["phase_B_generate_ambiguities"],shell=True)
# subprocess.call(["make__traineddata"],shell=True)
# subprocess.call(["tlog","\nCompleted training for language '"+str(LANG_CODE.val)+"'\n"],shell=True)
def set_lang_specific_parameters(ctx, lang)
def main()
Definition: tesstrain.py:62
def setup_logging_logfile(logfile)
Definition: tesstrain.py:52
def setup_logging_console()
Definition: tesstrain.py:41