tesseract 4.1.1
Loading...
Searching...
No Matches
classifier_tester.cpp
Go to the documentation of this file.
1// Copyright 2011 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
3
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7// http://www.apache.org/licenses/LICENSE-2.0
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14// Filename: classifier_tester.cpp
15// Purpose: Tests a character classifier on data as formatted for training,
16// but doesn't have to be the same as the training data.
17// Author: Ray Smith
18
19#include <algorithm>
20#include <cstdio>
21#ifdef GOOGLE_TESSERACT
22#include "base/commandlineflags.h"
23#endif // GOOGLE_TESSERACT
24#include "baseapi.h"
25#include "commontraining.h"
26#include "mastertrainer.h"
27#include "params.h"
28#include "strngs.h"
29#include "tessclassifier.h"
30#include "tesseractclass.h"
31
// Command-line flags for this tool. Each STRING_PARAM_FLAG(name, val, comment)
// declaration is read elsewhere in this file as FLAGS_<name>.c_str().
//   classifier   : name of the classifier to test; the default is empty.
//   lang         : language passed to TessBaseAPI::Init; the default is "eng".
//   tessdata_dir : data path passed to TessBaseAPI::Init; the default is empty.
32static STRING_PARAM_FLAG(classifier, "", "Classifier to test");
33static STRING_PARAM_FLAG(lang, "eng", "Language to test");
34static STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
35
40};
41
// Classifier names accepted on the command line. InitializeClassifier compares
// its name argument against names[c] for c in [0, CN_COUNT) and casts the
// matching index to ClassifierName, so the order here must match the enum.
// NOTE(review): the enum body is not visible in this listing; presumably
// CN_PRUNER is index 0 and CN_FULL is index 1 -- confirm.
42static const char* names[] = {"pruner", "full"};
43
// Creates the shape classifier selected by classifer_name.
//
// classifer_name is matched with strcmp against the entries of names[]; the
// index of the match is converted to a ClassifierName. A new TessBaseAPI is
// stored in *api and initialized from FLAGS_tessdata_dir and FLAGS_lang, and
// the returned TessClassifier is built on that instance's Classify object.
//
// Returns a newly allocated tesseract::TessClassifier on success. Returns
// nullptr, after printing a message to stderr, when:
//   - classifer_name matches no entry of names[],
//   - TessBaseAPI::Init returns a negative value, or
//   - the initialized classifier has no shape table.
//
// unicharset, argc and argv are not referenced in the visible body.
//
// NOTE(review): the end of the parameter list and the declarations of
// engine_mode and tesseract are missing from this listing (listing numbers
// 47, 63 and 64 are absent). Presumably the last parameter is the api
// out-pointer that the body dereferences -- confirm against the full file.
// NOTE(review): "classifer_name" is spelled without the second 'i'.
44static tesseract::ShapeClassifier* InitializeClassifier(
45 const char* classifer_name, const UNICHARSET& unicharset,
46 int argc, char **argv,
48 // Decode the classifier string.
   // CN_COUNT serves as the "no match found" value until a name matches.
49 ClassifierName classifier = CN_COUNT;
50 for (int c = 0; c < CN_COUNT; ++c) {
51 if (strcmp(classifer_name, names[c]) == 0) {
52 classifier = static_cast<ClassifierName>(c);
53 break;
54 }
55 }
56 if (classifier == CN_COUNT) {
   // NOTE(review): the message prints FLAGS_classifier, not the
   // classifer_name argument that was actually compared above.
57 fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
58 return nullptr;
59 }
60
61 // We need to initialize tesseract to test.
   // *api is assigned before the failure returns below, so it is left pointing
   // at an allocated TessBaseAPI even when this function returns nullptr;
   // nothing in this function deletes it.
62 *api = new tesseract::TessBaseAPI;
65 tesseract::Classify* classify = nullptr;
66 if (
67 classifier == CN_PRUNER || classifier == CN_FULL) {
68 if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
69 engine_mode) < 0) {
70 fprintf(stderr, "Tesseract initialization failed!\n");
71 return nullptr;
72 }
   // The const_cast strips constness from the pointer returned by
   // (*api)->tesseract() so it can be converted to a non-const Classify*.
73 tesseract = const_cast<tesseract::Tesseract*>((*api)->tesseract());
74 classify = static_cast<tesseract::Classify*>(tesseract);
75 if (classify->shape_table() == nullptr) {
76 fprintf(stderr, "Tesseract must contain a ShapeTable!\n");
77 return nullptr;
78 }
79 }
80 tesseract::ShapeClassifier* shape_classifier = nullptr;
81
   // Both branches build a TessClassifier on the same classify object; only the
   // first constructor argument differs (true for CN_PRUNER, false for CN_FULL).
   // Presumably that flag selects pruner-only operation -- confirm against
   // the TessClassifier declaration.
82 if (classifier == CN_PRUNER) {
83 shape_classifier = new tesseract::TessClassifier(true, classify);
84 } else if (classifier == CN_FULL) {
85 shape_classifier = new tesseract::TessClassifier(false, classify);
86 }
87 tprintf("Testing classifier %s:\n", classifer_name);
88 return shape_classifier;
89}
90
91// This program has complex setup requirements, so here is some help:
92// Two different modes, tr files and serialized mastertrainer.
93// From tr files:
94// classifier_tester -U unicharset -F font_properties -X xheights
95// -classifier x -lang lang [-output_trainer trainer] *.tr
96// From a serialized trainer:
97// classifier_tester -input_trainer trainer [-lang lang] -classifier x
98//
99// In the first case, the unicharset must be the unicharset from within
100// the classifier under test, and the font_properties and xheights files must
101// match the files used during training.
102// In the second case, the trainer file must have been prepared from
103// some previous run of shapeclustering, mftraining, or classifier_tester
104// using the same conditions as above, ie matching unicharset/font_properties.
105//
106// Available values of classifier (x above) are:
107// pruner : Tesseract class pruner only.
108// full : Tesseract full classifier.
109// Any other value is rejected as an invalid classifier name.
// Program entry point. Checks the shared library version, parses the
// command-line flags, loads the training data into a MasterTrainer, builds the
// classifier named by FLAGS_classifier, runs the test, and then deletes the
// classifier, the API object and the trainer.
//
// Returns 0 on completion, or 1 when InitializeClassifier returns nullptr.
//
// NOTE(review): listing numbers 116, 128 and 130 are missing from this view.
// Presumably they hold the declaration of api, a call to
// trainer->ReplicateAndRandomizeSamplesIfRequired(), and the start of the
// trainer->TestClassifierOnSamples(...) call -- confirm against the full file.
110int main(int argc, char **argv) {
111 tesseract::CheckSharedLibraryVersion();
112 ParseArguments(&argc, &argv);
113 STRING file_prefix;
   // The third and fourth arguments (false, nullptr) are passed for the
   // replication and shape_table parameters of LoadTrainingData.
114 tesseract::MasterTrainer* trainer =
115 tesseract::LoadTrainingData(argc, argv, false, nullptr, &file_prefix);
117 // Decode the classifier string.
   // NOTE(review): trainer is dereferenced here without a null check; confirm
   // that LoadTrainingData cannot return nullptr.
118 tesseract::ShapeClassifier* shape_classifier = InitializeClassifier(
119 FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api);
120 if (shape_classifier == nullptr) {
   // This early exit skips the delete statements at the end of main, so
   // trainer and api are not deleted on this path.
121 fprintf(stderr, "Classifier init failed!:%s\n", FLAGS_classifier.c_str());
122 return 1;
123 }
124
125 // We want to test junk as well if it is available.
126 // trainer->IncludeJunk();
127 // We want to test with replicated samples too.
129
   // std::max(3, ...) makes the value passed here at least 3, whatever
   // FLAGS_debug_level is. Presumably these are the report_level,
   // replicate_samples, test_classifier and report_string arguments of
   // TestClassifierOnSamples -- the head of the call is not visible.
131 std::max(3, static_cast<int>(FLAGS_debug_level)), false,
132 shape_classifier, nullptr);
133 delete shape_classifier;
134 delete api;
135 delete trainer;
136
137 return 0;
138} /* main */
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int main(int argc, char **argv)
ClassifierName
@ CN_FULL
@ CN_PRUNER
@ CN_COUNT
#define STRING_PARAM_FLAG(name, val, comment)
void ParseArguments(int *argc, char ***argv)
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:269
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
@ CT_UNICHAR_TOP1_ERR
Definition: errorcounter.h:74
Definition: strngs.h:45
const ShapeTable * shape_table() const
Definition: classify.h:111
void ReplicateAndRandomizeSamplesIfRequired()
const UNICHARSET & unicharset() const
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)