tesseract 4.1.1
Loading...
Searching...
No Matches
blobclass.cpp
Go to the documentation of this file.
1/******************************************************************************
2 ** Filename: blobclass.cpp
3 ** Purpose: High level blob classification and training routines.
4 ** Author: Dan Johnson
5 **
6 ** (c) Copyright Hewlett-Packard Company, 1988.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 ******************************************************************************/
17
21#include "blobclass.h"
22
23#include <cstdio>
24
25#include "classify.h"
26#include "featdefs.h"
27#include "mf.h"
28#include "normfeat.h"
29
30static const char kUnknownFontName[] = "UnknownFont";
31
32static STRING_VAR(classify_font_name, kUnknownFontName,
33 "Default font name to be used in training");
34
35namespace tesseract {
40// Finds the name of the training font and returns it in fontname, by cutting
41// it out based on the expectation that the filename is of the form:
42// /path/to/dir/[lang].[fontname].exp[num]
43// The [lang], [fontname] and [num] fields should not have '.' characters.
44// If the global parameter classify_font_name is set, its value is used instead.
45void ExtractFontName(const STRING& filename, STRING* fontname) {
46 *fontname = classify_font_name;
47 if (*fontname == kUnknownFontName) {
48 // filename is expected to be of the form [lang].[fontname].exp[num]
49 // The [lang], [fontname] and [num] fields should not have '.' characters.
50 const char *basename = strrchr(filename.string(), '/');
51 const char *firstdot = strchr(basename ? basename : filename.string(), '.');
52 const char *lastdot = strrchr(filename.string(), '.');
53 if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) {
54 ++firstdot;
55 *fontname = firstdot;
56 fontname->truncate_at(lastdot - firstdot);
57 }
58 }
59}
60
61
62/*---------------------------------------------------------------------------*/
63
64// Extracts features from the given blob and saves them in the tr_file_data_
65// member variable.
66// fontname: Name of font that this blob was printed in.
67// cn_denorm: Character normalization transformation to apply to the blob.
68// fx_info: Character normalization parameters computed with cn_denorm.
69// blob_text: Ground truth text for the blob.
70void Classify::LearnBlob(const STRING& fontname, TBLOB* blob,
71 const DENORM& cn_denorm,
72 const INT_FX_RESULT_STRUCT& fx_info,
73 const char* blob_text) {
75 CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
76 CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
77 CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
78 CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
79
80 if (ValidCharDescription(feature_defs_, CharDesc)) {
81 // Label the features with a class name and font name.
82 tr_file_data_ += "\n";
83 tr_file_data_ += fontname;
84 tr_file_data_ += " ";
85 tr_file_data_ += blob_text;
86 tr_file_data_ += "\n";
87
88 // write micro-features to file and clean up
89 WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
90 } else {
91 tprintf("Blob learned was invalid!\n");
92 }
93 FreeCharDescription(CharDesc);
94} // LearnBlob
95
96// Writes stored training data to a .tr file based on the given filename.
97// Returns false on error.
98bool Classify::WriteTRFile(const STRING& filename) {
99 bool result = false;
100 STRING tr_filename = filename + ".tr";
101 FILE* fp = fopen(tr_filename.string(), "wb");
102 if (fp) {
103 result =
104 tesseract::Serialize(fp, &tr_file_data_[0], tr_file_data_.length());
105 fclose(fp);
106 }
107 tr_file_data_.truncate_at(0);
108 return result;
109}
110
111} // namespace tesseract.
#define STRING_VAR(name, val, comment)
Definition: params.h:309
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc, STRING *str)
Definition: featdefs.cpp:174
CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs)
Definition: featdefs.cpp:148
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:129
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc)
Definition: featdefs.cpp:195
FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM &cn_denorm)
Definition: mf.cpp:43
FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info)
Definition: normfeat.cpp:61
void ExtractFontName(const STRING &filename, STRING *fontname)
Definition: blobclass.cpp:45
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:60
Definition: blobs.h:284
Definition: strngs.h:45
void truncate_at(int32_t index)
Definition: strngs.cpp:265
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:541
bool WriteTRFile(const STRING &filename)
Definition: blobclass.cpp:98
FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:217
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:70
FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:247
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:41