tesseract 4.1.1
Loading...
Searching...
No Matches
adaptmatch.cpp
Go to the documentation of this file.
1/******************************************************************************
2 ** Filename: adaptmatch.cpp
3 ** Purpose: High level adaptive matcher.
4 ** Author: Dan Johnson
5 **
6 ** (c) Copyright Hewlett-Packard Company, 1988.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 ******************************************************************************/
17
18/*-----------------------------------------------------------------------------
19 Include Files and Type Defines
20-----------------------------------------------------------------------------*/
21#ifdef HAVE_CONFIG_H
22#include "config_auto.h"
23#endif
24
25#include <algorithm> // for max, min
26#include <cassert> // for assert
27#include <cmath> // for fabs
28#include <cstdint> // for INT32_MAX, UINT8_MAX
29#include <cstdio> // for fflush, fclose, fopen, stdout, FILE
30#include <cstdlib> // for malloc
31#include <cstring> // for strstr, memset, strcmp
32#include "adaptive.h" // for ADAPT_CLASS, free_adapted_templates
33#include "ambigs.h" // for UnicharIdVector, UnicharAmbigs
34#include "bitvec.h" // for FreeBitVector, NewBitVector, BIT_VECTOR
35#include "blobs.h" // for TBLOB, TWERD
36#include "callcpp.h" // for cprintf, window_wait
37#include "classify.h" // for Classify, CST_FRAGMENT, CST_WHOLE
38#include "dict.h" // for Dict
39#include "errcode.h" // for ASSERT_HOST
40#include "featdefs.h" // for CharNormDesc
41#include "float2int.h" // for BASELINE_Y_SHIFT
42#include "fontinfo.h" // for ScoredFont, FontSet
43#include "genericvector.h" // for GenericVector
44#include "helpers.h" // for IntCastRounded, ClipToRange
45#include "intfx.h" // for BlobToTrainingSample, INT_FX_RESULT_S...
46#include "intmatcher.h" // for CP_RESULT_STRUCT, IntegerMatcher
47#include "intproto.h" // for INT_FEATURE_STRUCT, (anonymous), Clas...
48#include "matchdefs.h" // for CLASS_ID, FEATURE_ID, PROTO_ID, NO_PROTO
49#include "mfoutline.h" // for baseline, character, MF_SCALE_FACTOR
50#include "normalis.h" // for DENORM, kBlnBaselineOffset, kBlnXHeight
51#include "normfeat.h" // for ActualOutlineLength, CharNormLength
52#include "ocrfeatures.h" // for FEATURE_STRUCT, FreeFeatureSet, FEATURE
53#include "oldlist.h" // for push, delete_d
54#include "outfeat.h" // for OutlineFeatDir, OutlineFeatLength
55#include "pageres.h" // for WERD_RES
56#include "params.h" // for IntParam, BoolParam, DoubleParam, Str...
57#include "picofeat.h" // for PicoFeatDir, PicoFeatX, PicoFeatY
58#include "protos.h" // for PROTO_STRUCT, FillABC, PROTO
59#include "ratngs.h" // for BLOB_CHOICE_IT, BLOB_CHOICE_LIST, BLO...
60#include "rect.h" // for TBOX
61#include "scrollview.h" // for ScrollView, ScrollView::BROWN, Scroll...
62#include "seam.h" // for SEAM
63#include "serialis.h" // for TFile
64#include "shapeclassifier.h" // for ShapeClassifier
65#include "shapetable.h" // for UnicharRating, ShapeTable, Shape, Uni...
66#include "strngs.h" // for STRING
67#include "tessclassifier.h" // for TessClassifier
68#include "tessdatamanager.h" // for TessdataManager, TESSDATA_INTTEMP
69#include "tprintf.h" // for tprintf
70#include "trainingsample.h" // for TrainingSample
71#include "unichar.h" // for UNICHAR_ID, INVALID_UNICHAR_ID
72#include "unicharset.h" // for UNICHARSET, CHAR_FRAGMENT, UNICHAR_SPACE
73#include "unicity_table.h" // for UnicityTable
74
75#define ADAPT_TEMPLATE_SUFFIX ".a"
76
77#define MAX_MATCHES 10
78#define UNLIKELY_NUM_FEAT 200
79#define NO_DEBUG 0
80#define MAX_ADAPTABLE_WERD_SIZE 40
81
82#define ADAPTABLE_WERD_ADJUSTMENT (0.05)
83
84#define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT)
85
86#define WORST_POSSIBLE_RATING (0.0f)
87
90
92 int32_t BlobLength;
99
102 inline void Initialize() {
103 BlobLength = INT32_MAX;
104 HasNonfragment = false;
105 ComputeBest();
106 }
107 // Computes best_unichar_id, best_match_index and best_rating.
108 void ComputeBest() {
109 best_unichar_id = INVALID_UNICHAR_ID;
110 best_match_index = -1;
112 for (int i = 0; i < match.size(); ++i) {
113 if (match[i].rating > best_rating) {
114 best_rating = match[i].rating;
115 best_unichar_id = match[i].unichar_id;
117 }
118 }
119 }
120};
121
122struct PROTO_KEY {
126};
127
128/*-----------------------------------------------------------------------------
129 Private Macros
130-----------------------------------------------------------------------------*/
// Tells whether a match is only marginal: true when the distance implied by
// confidence (i.e. 1 - confidence) is strictly greater than
// matcher_great_threshold, false otherwise (including equality).
inline bool MarginalMatch(float confidence, float matcher_great_threshold) {
  const float distance = 1.0f - confidence;
  return distance > matcher_great_threshold;
}
134
135/*-----------------------------------------------------------------------------
136 Private Function Prototypes
137-----------------------------------------------------------------------------*/
138// Returns the index of the given id in results, if present, or the size of the
139// vector (index it will go at) if not present.
140static int FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) {
141 for (int i = 0; i < results.match.size(); i++) {
142 if (results.match[i].unichar_id == id)
143 return i;
144 }
145 return results.match.size();
146}
147
148// Returns the current rating for a unichar id if we have rated it, defaulting
149// to WORST_POSSIBLE_RATING.
150static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) {
151 int index = FindScoredUnichar(id, results);
152 if (index >= results.match.size()) return WORST_POSSIBLE_RATING;
153 return results.match[index].rating;
154}
155
156void InitMatcherRatings(float *Rating);
157
158int MakeTempProtoPerm(void *item1, void *item2);
159
160void SetAdaptiveThreshold(float Threshold);
161
162
163/*-----------------------------------------------------------------------------
164 Public Code
165-----------------------------------------------------------------------------*/
166/*---------------------------------------------------------------------------*/
167namespace tesseract {
// Classifies Blob and fills Choices with the outcome.
// A scratch ADAPT_RESULTS is allocated, populated by DoAdaptiveMatch, pruned,
// sorted and converted into Choices, then deleted before returning.
// Choices must not be nullptr (assert) and AdaptedTemplates must already
// exist (ASSERT_HOST).
191void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) {
192 assert(Choices != nullptr);
 // Heap-allocated scratch results; released by the delete at the end of this
 // function. No early return is visible between the two.
193 auto *Results = new ADAPT_RESULTS;
194 Results->Initialize();
195
196 ASSERT_HOST(AdaptedTemplates != nullptr);
197
198 DoAdaptiveMatch(Blob, Results);
199
 // Post-process the raw matches: prune, order with the
 // UnicharRating::SortDescendingRating comparator, prune punctuation, then
 // recompute the cached best_* fields from what is left in match.
200 RemoveBadMatches(Results);
201 Results->match.sort(&UnicharRating::SortDescendingRating);
202 RemoveExtraPuncs(Results);
203 Results->ComputeBest();
204 ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results,
205 Choices);
206
207 // TODO(rays) Move to before ConvertMatchesToChoices!
 // Also taken when no choice was produced at all, so Choices receives the
 // large-speckle entry rather than staying empty.
208 if (LargeSpeckle(*Blob) || Choices->length() == 0)
209 AddLargeSpeckleTo(Results->BlobLength, Choices);
210
211 if (matcher_debug_level >= 1) {
212 tprintf("AD Matches = ");
214 }
215
216#ifndef GRAPHICS_DISABLED
218 DebugAdaptiveClassifier(Blob, Results);
219#endif
220
221 delete Results;
222} /* AdaptiveClassifier */
223
224// If *win is nullptr, sets it to a new ScrollView() object with title msg.
225// Clears the window and draws baselines.
226void Classify::RefreshDebugWindow(ScrollView **win, const char *msg,
227 int y_offset, const TBOX &wbox) {
228 #ifndef GRAPHICS_DISABLED
229 const int kSampleSpaceWidth = 500;
230 if (*win == nullptr) {
231 *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200,
232 kSampleSpaceWidth * 2, 200, true);
233 }
234 (*win)->Clear();
235 (*win)->Pen(64, 64, 64);
236 (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
237 kSampleSpaceWidth, kBlnBaselineOffset);
238 (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
239 kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
240 (*win)->ZoomToRectangle(wbox.left(), wbox.top(),
241 wbox.right(), wbox.bottom());
242 #endif // GRAPHICS_DISABLED
243}
244
245// Learns the given word using its chopped_word, seam_array, denorm,
246// box_word, best_state, and correct_text to learn both correctly and
247// incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob
248// is called and the data will be saved in an internal buffer.
249// Otherwise AdaptToChar is called (via LearnPieces) for adaption within a
// document.
// Returns without learning anything when correct_text is empty, or, in
// adaption mode, when EnableLearning is off or the word has no best_choice.
250void Classify::LearnWord(const char* fontname, WERD_RES* word) {
251 int word_len = word->correct_text.size();
252 if (word_len == 0) return;
253
 // Per-character thresholds. Only allocated in adaption mode
 // (fontname == nullptr); stays nullptr otherwise. Released by the
 // delete [] at the end of the function.
254 float* thresholds = nullptr;
255 if (fontname == nullptr) {
256 // Adaption mode.
257 if (!EnableLearning || word->best_choice == nullptr)
258 return; // Can't or won't adapt.
259
261 tprintf("\n\nAdapting to word = %s\n",
262 word->best_choice->debug_string().string());
263 thresholds = new float[word_len];
267 matcher_rating_margin, thresholds);
268 }
 // Index into word->chopped_word->blobs of the first blob of the current
 // character; advanced by best_state[ch] after each character.
269 int start_blob = 0;
270
271 #ifndef GRAPHICS_DISABLED
273 if (learn_fragmented_word_debug_win_ != nullptr) {
274 window_wait(learn_fragmented_word_debug_win_);
275 }
276 RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
277 word->chopped_word->bounding_box());
278 RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
279 word->chopped_word->bounding_box());
280 word->chopped_word->plot(learn_fragmented_word_debug_win_);
282 }
283 #endif // GRAPHICS_DISABLED
284
285 for (int ch = 0; ch < word_len; ++ch) {
287 tprintf("\nLearning %s\n", word->correct_text[ch].string());
288 }
 // Characters with empty correct_text are not learned, but their blobs are
 // still skipped over by the start_blob update below.
289 if (word->correct_text[ch].length() > 0) {
 // 0.0f is used when no thresholds were computed (training mode).
290 float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;
291
 // Learn the whole character, built from best_state[ch] consecutive blobs.
292 LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
293 CST_WHOLE, word->correct_text[ch].string(), word);
294
 // Characters made of more than one blob may also be learned piecewise.
295 if (word->best_state[ch] > 1 && !disable_character_fragments) {
296 // Check that the character breaks into meaningful fragments
297 // that each match a whole character with at least
298 // classify_character_fragments_garbage_certainty_threshold
299 bool garbage = false;
300 int frag;
301 for (frag = 0; frag < word->best_state[ch]; ++frag) {
302 TBLOB* frag_blob = word->chopped_word->blobs[start_blob + frag];
304 garbage |= LooksLikeGarbage(frag_blob);
305 }
306 }
307 // Learn the fragments.
308 if (!garbage) {
309 bool pieces_all_natural = word->PiecesAllNatural(start_blob,
310 word->best_state[ch]);
311 if (pieces_all_natural || !prioritize_division) {
312 for (frag = 0; frag < word->best_state[ch]; ++frag) {
314 word->correct_text[ch].split(' ', &tokens);
315
 // Only the first space-separated token is rewritten into its
 // fragment form; the remaining tokens are kept as they are.
316 tokens[0] = CHAR_FRAGMENT::to_string(
317 tokens[0].string(), frag, word->best_state[ch],
318 pieces_all_natural);
319
 // Re-join the tokens with single spaces (no trailing space).
320 STRING full_string;
321 for (int i = 0; i < tokens.size(); i++) {
322 full_string += tokens[i];
323 if (i != tokens.size() - 1)
324 full_string += ' ';
325 }
326 LearnPieces(fontname, start_blob + frag, 1, threshold,
327 CST_FRAGMENT, full_string.string(), word);
328 }
329 }
330 }
331 }
332
333 // TODO(rays): re-enable this part of the code when we switch to the
334 // new classifier that needs to see examples of garbage.
335 /*
336 if (word->best_state[ch] > 1) {
337 // If the next blob is good, make junk with the rightmost fragment.
338 if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
339 LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
340 word->best_state[ch + 1] + 1,
341 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
342 }
343 // If the previous blob is good, make junk with the leftmost fragment.
344 if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
345 LearnPieces(fontname, start_blob - word->best_state[ch - 1],
346 word->best_state[ch - 1] + 1,
347 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
348 }
349 }
350 // If the next blob is good, make a join with it.
351 if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
352 STRING joined_text = word->correct_text[ch];
353 joined_text += word->correct_text[ch + 1];
354 LearnPieces(fontname, start_blob,
355 word->best_state[ch] + word->best_state[ch + 1],
356 threshold, CST_NGRAM, joined_text.string(), word);
357 }
358 */
359 }
360 start_blob += word->best_state[ch];
361 }
 // Deleting nullptr is a no-op, so this is safe in training mode as well.
362 delete [] thresholds;
363} // LearnWord.
364
365// Builds a blob of length fragments, from the word, starting at start,
366// and then learns it, as having the given correct_text.
367// If fontname is not nullptr, then LearnBlob is called and the data will be
368// saved in an internal buffer for static training.
369// Otherwise AdaptToChar is called for adaption within a document, provided
// correct_text is present in unicharset.
370// threshold is a magic number required by AdaptToChar and generated by
371// ComputeAdaptionThresholds.
372// Although it can be partly inferred from the string, segmentation is
373// provided to explicitly clarify the character segmentation.
374void Classify::LearnPieces(const char* fontname, int start, int length,
375 float threshold, CharSegmentationType segmentation,
376 const char* correct_text, WERD_RES* word) {
377 // TODO(daria) Remove/modify this if/when we want
378 // to train and/or adapt to n-grams.
 // Only CST_WHOLE is always accepted; CST_FRAGMENT is accepted unless
 // disable_character_fragments is set. Every other segmentation type is
 // ignored. This return happens before any pieces are joined.
379 if (segmentation != CST_WHOLE &&
380 (segmentation != CST_FRAGMENT || disable_character_fragments))
381 return;
382
 // Temporarily merge blobs [start, start + length - 1] into blobs[start].
 // Undone by the BreakPieces call at the end of the function.
383 if (length > 1) {
384 SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start,
385 start + length - 1);
386 }
387 TBLOB* blob = word->chopped_word->blobs[start];
388 // Rotate the blob if needed for classification.
 // A nullptr result means blob itself is used; a non-null result is a
 // separate object that is deleted below.
389 TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded();
390 if (rotated_blob == nullptr)
391 rotated_blob = blob;
392
393 #ifndef GRAPHICS_DISABLED
394 // Draw debug windows showing the blob that is being learned if needed.
395 if (strcmp(classify_learn_debug_str.string(), correct_text) == 0) {
396 RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600,
397 word->chopped_word->bounding_box());
398 rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
399 learn_debug_win_->Update();
400 window_wait(learn_debug_win_);
401 }
402 if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
403 ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord
404 blob->plot(learn_fragments_debug_win_,
406 learn_fragments_debug_win_->Update();
407 }
408 #endif // GRAPHICS_DISABLED
409
410 if (fontname != nullptr) {
 // Training path. Note that these set_value calls change classifier-wide
 // parameters, not just state local to this call.
411 classify_norm_method.set_value(character); // force char norm spc 30/11/93
412 tess_bn_matching.set_value(false); // turn it off
413 tess_cn_matching.set_value(false);
414 DENORM bl_denorm, cn_denorm;
415 INT_FX_RESULT_STRUCT fx_info;
417 &bl_denorm, &cn_denorm, &fx_info);
418 LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
419 } else if (unicharset.contains_unichar(correct_text)) {
420 UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
 // Font id 0 is used when the word carries no fontinfo.
421 int font_id = word->fontinfo != nullptr
422 ? fontinfo_table_.get_id(*word->fontinfo)
423 : 0;
425 tprintf("Adapting to char = %s, thr= %g font_id= %d\n",
426 unicharset.id_to_unichar(class_id), threshold, font_id);
427 // fontname is nullptr on this branch, so we are doing recognition
428 // (as opposed to training), so we must have already set word fonts.
429 AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
430 if (BackupAdaptedTemplates != nullptr) {
431 // Adapt the backup templates too. They will be used if the primary gets
432 // too full.
433 AdaptToChar(rotated_blob, class_id, font_id, threshold,
435 }
436 } else if (classify_debug_level >= 1) {
437 tprintf("Can't adapt to %s not in unicharset\n", correct_text);
438 }
439 if (rotated_blob != blob) {
440 delete rotated_blob;
441 }
442
 // Called unconditionally, i.e. also when length == 1 and no JoinPieces call
 // was made above.
443 SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start,
444 start + length - 1);
445} // LearnPieces.
446
447/*---------------------------------------------------------------------------*/
460 STRING Filename;
461 FILE *File;
462
463 if (AdaptedTemplates != nullptr &&
465 Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
466 File = fopen (Filename.string(), "wb");
467 if (File == nullptr)
468 cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
469 else {
470 cprintf ("\nSaving adapted templates to %s ...", Filename.string());
471 fflush(stdout);
473 cprintf ("\n");
474 fclose(File);
475 }
476 }
477
478 if (AdaptedTemplates != nullptr) {
480 AdaptedTemplates = nullptr;
481 }
482 if (BackupAdaptedTemplates != nullptr) {
484 BackupAdaptedTemplates = nullptr;
485 }
486
487 if (PreTrainedTemplates != nullptr) {
489 PreTrainedTemplates = nullptr;
490 }
493 if (AllProtosOn != nullptr) {
494 FreeBitVector(AllProtosOn);
495 FreeBitVector(AllConfigsOn);
496 FreeBitVector(AllConfigsOff);
497 FreeBitVector(TempProtoMask);
498 AllProtosOn = nullptr;
499 AllConfigsOn = nullptr;
500 AllConfigsOff = nullptr;
501 TempProtoMask = nullptr;
502 }
503 delete shape_table_;
504 shape_table_ = nullptr;
505 delete static_classifier_;
506 static_classifier_ = nullptr;
507} /* EndAdaptiveClassifier */
508
509
510/*---------------------------------------------------------------------------*/
529 return;
530 if (AllProtosOn != nullptr)
531 EndAdaptiveClassifier(); // Don't leak with multiple inits.
532
533 // If there is no language_data_path_prefix, the classifier will be
534 // adaptive only.
535 if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
536 TFile fp;
539
540 if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
542 if (!shape_table_->DeSerialize(&fp)) {
543 tprintf("Error loading shape table!\n");
544 delete shape_table_;
545 shape_table_ = nullptr;
546 }
547 }
548
550 ReadNewCutoffs(&fp, CharNormCutoffs);
551
554 static_classifier_ = new TessClassifier(false, this);
555 }
556
558
559 AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
560 AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
561 AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
562 TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
563 set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
564 set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
565 zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));
566
567 for (uint16_t& BaselineCutoff : BaselineCutoffs) {
568 BaselineCutoff = 0;
569 }
570
572 TFile fp;
573 STRING Filename;
574
575 Filename = imagefile;
576 Filename += ADAPT_TEMPLATE_SUFFIX;
577 if (!fp.Open(Filename.string(), nullptr)) {
579 } else {
580 cprintf("\nReading pre-adapted templates from %s ...\n",
581 Filename.string());
582 fflush(stdout);
584 cprintf("\n");
586
587 for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
588 BaselineCutoffs[i] = CharNormCutoffs[i];
589 }
590 }
591 } else {
592 if (AdaptedTemplates != nullptr)
595 }
596} /* InitAdaptiveClassifier */
597
600 tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
601 NumAdaptationsFailed);
602 }
605 if (BackupAdaptedTemplates != nullptr)
607 BackupAdaptedTemplates = nullptr;
608 NumAdaptationsFailed = 0;
609}
610
611// If there are backup adapted templates, switches to those, otherwise resets
612// the main adaptive classifier (because it is full.)
614 if (BackupAdaptedTemplates == nullptr) {
616 return;
617 }
619 tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
620 NumAdaptationsFailed);
621 }
624 BackupAdaptedTemplates = nullptr;
625 NumAdaptationsFailed = 0;
626}
627
628// Resets the backup adaptive classifier to empty.
630 if (BackupAdaptedTemplates != nullptr)
633}
634
635/*---------------------------------------------------------------------------*/
654
656
657} /* SettupPass1 */
658
659
660/*---------------------------------------------------------------------------*/
670 EnableLearning = false;
672
673} /* SettupPass2 */
674
675
676/*---------------------------------------------------------------------------*/
694 CLASS_ID ClassId,
695 int FontinfoId,
696 ADAPT_CLASS Class,
697 ADAPT_TEMPLATES Templates) {
698 FEATURE_SET Features;
699 int Fid, Pid;
700 FEATURE Feature;
701 int NumFeatures;
702 TEMP_PROTO TempProto;
703 PROTO Proto;
704 INT_CLASS IClass;
706
708 Features = ExtractOutlineFeatures(Blob);
709 NumFeatures = Features->NumFeatures;
710 if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
711 FreeFeatureSet(Features);
712 return;
713 }
714
715 Config = NewTempConfig(NumFeatures - 1, FontinfoId);
716 TempConfigFor(Class, 0) = Config;
717
718 /* this is a kludge to construct cutoffs for adapted templates */
719 if (Templates == AdaptedTemplates)
720 BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
721
722 IClass = ClassForClassId (Templates->Templates, ClassId);
723
724 for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
725 Pid = AddIntProto (IClass);
726 assert (Pid != NO_PROTO);
727
728 Feature = Features->Features[Fid];
729 TempProto = NewTempProto ();
730 Proto = &(TempProto->Proto);
731
732 /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
733 ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
734 instead of the -0.25 to 0.75 used in baseline normalization */
735 Proto->Angle = Feature->Params[OutlineFeatDir];
736 Proto->X = Feature->Params[OutlineFeatX];
737 Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
738 Proto->Length = Feature->Params[OutlineFeatLength];
739 FillABC(Proto);
740
741 TempProto->ProtoId = Pid;
742 SET_BIT (Config->Protos, Pid);
743
744 ConvertProto(Proto, Pid, IClass);
745 AddProtoToProtoPruner(Proto, Pid, IClass,
747
748 Class->TempProtos = push (Class->TempProtos, TempProto);
749 }
750 FreeFeatureSet(Features);
751
752 AddIntConfig(IClass);
753 ConvertConfig (AllProtosOn, 0, IClass);
754
756 tprintf("Added new class '%s' with class id %d and %d protos.\n",
757 unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
759 DisplayAdaptedChar(Blob, IClass);
760 }
761
762 if (IsEmptyAdaptedClass(Class))
763 (Templates->NumNonEmptyClasses)++;
764} /* InitAdaptedClass */
765
766
767/*---------------------------------------------------------------------------*/
787 INT_FEATURE_ARRAY IntFeatures,
788 FEATURE_SET *FloatFeatures) {
789 FEATURE_SET Features;
790 int NumFeatures;
791
793 Features = ExtractPicoFeatures(Blob);
794
795 NumFeatures = Features->NumFeatures;
796 if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) {
797 FreeFeatureSet(Features);
798 return 0;
799 }
800
801 ComputeIntFeatures(Features, IntFeatures);
802 *FloatFeatures = Features;
803
804 return NumFeatures;
805} /* GetAdaptiveFeatures */
806
807
808/*-----------------------------------------------------------------------------
809 Private Code
810-----------------------------------------------------------------------------*/
811/*---------------------------------------------------------------------------*/
822 if (word->best_choice == nullptr) return false;
823 int BestChoiceLength = word->best_choice->length();
824 float adaptable_score =
826 return // rules that apply in general - simplest to compute first
827 BestChoiceLength > 0 &&
828 BestChoiceLength == word->rebuild_word->NumBlobs() &&
829 BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
830 // This basically ensures that the word is at least a dictionary match
831 // (freq word, user word, system dawg word, etc).
832 // Since all the other adjustments will make adjust factor higher
833 // than adaptable_score=1.1+0.05=1.15
834 // Since these are other flags that ensure that the word is dict word,
835 // this check could be at times redundant.
836 word->best_choice->adjust_factor() <= adaptable_score &&
837 // Make sure that alternative choices are not dictionary words.
838 word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
839}
840
841/*---------------------------------------------------------------------------*/
// Adapts adaptive_templates to Blob as an example of class ClassId in font
// FontinfoId.
//  - Illegal class ids are ignored.
//  - If the adapted class is still empty it is initialised from Blob via
//    InitAdaptedClass.
//  - Otherwise Blob is matched against the class's configs for the same
//    font. A match whose distance (1 - rating) is within Threshold
//    reinforces the matched temporary config (or just returns if that config
//    is already permanent); a worse match creates a new temporary config.
//    A temporary config judged reliable by TempConfigReliable is made
//    permanent.
853void Classify::AdaptToChar(TBLOB* Blob, CLASS_ID ClassId, int FontinfoId,
854 float Threshold,
855 ADAPT_TEMPLATES adaptive_templates) {
856 int NumFeatures;
857 INT_FEATURE_ARRAY IntFeatures;
858 UnicharRating int_result;
859 INT_CLASS IClass;
860 ADAPT_CLASS Class;
861 TEMP_CONFIG TempConfig;
862 FEATURE_SET FloatFeatures;
863 int NewTempConfigId;
864
865 if (!LegalClassId (ClassId))
866 return;
867
868 int_result.unichar_id = ClassId;
869 Class = adaptive_templates->Class[ClassId];
870 assert(Class != nullptr);
871 if (IsEmptyAdaptedClass(Class)) {
872 InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
873 } else {
874 IClass = ClassForClassId(adaptive_templates->Templates, ClassId);
875
 // On success FloatFeatures holds a feature set that this function must
 // free; every later exit below calls FreeFeatureSet(FloatFeatures).
876 NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
877 if (NumFeatures <= 0) {
878 return; // Features already freed by GetAdaptiveFeatures.
879 }
880
881 // Only match configs with the matching font.
 // NOTE(review): the vector is sized with MAX_NUM_PROTOS although it is
 // indexed by config number below - confirm this is intended.
882 BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
 // Every bit in [0, NumConfigs) is written explicitly, set or reset.
883 for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
884 if (GetFontinfoId(Class, cfg) == FontinfoId) {
885 SET_BIT(MatchingFontConfigs, cfg);
886 } else {
887 reset_bit(MatchingFontConfigs, cfg);
888 }
889 }
890 im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
891 NumFeatures, IntFeatures,
894 FreeBitVector(MatchingFontConfigs);
895
896 SetAdaptiveThreshold(Threshold);
897
 // Good match: distance (1 - rating) is within Threshold.
898 if (1.0f - int_result.rating <= Threshold) {
 // A permanent config needs no further adaption.
899 if (ConfigIsPermanent(Class, int_result.config)) {
901 tprintf("Found good match to perm config %d = %4.1f%%.\n",
902 int_result.config, int_result.rating * 100.0);
903 FreeFeatureSet(FloatFeatures);
904 return;
905 }
906
 // Matched a temporary config: count the sighting and keep the class's
 // MaxNumTimesSeen at the maximum over its temporary configs seen here.
907 TempConfig = TempConfigFor(Class, int_result.config);
908 IncreaseConfidence(TempConfig);
909 if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
910 Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
911 }
913 tprintf("Increasing reliability of temp config %d to %d.\n",
914 int_result.config, TempConfig->NumTimesSeen);
915
916 if (TempConfigReliable(ClassId, TempConfig)) {
917 MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
918 UpdateAmbigsGroup(ClassId, Blob);
919 }
920 } else {
 // Poor match: build a new temporary config from this sample.
922 tprintf("Found poor match to temp config %d = %4.1f%%.\n",
923 int_result.config, int_result.rating * 100.0);
925 DisplayAdaptedChar(Blob, IClass);
926 }
927 NewTempConfigId =
928 MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId,
929 NumFeatures, IntFeatures, FloatFeatures);
 // A negative id is treated as "no config was created". A new config that
 // already qualifies as reliable is promoted straight away.
930 if (NewTempConfigId >= 0 &&
931 TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
932 MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
933 UpdateAmbigsGroup(ClassId, Blob);
934 }
935
936#ifndef GRAPHICS_DISABLED
938 DisplayAdaptedChar(Blob, IClass);
939 }
940#endif
941 }
942 FreeFeatureSet(FloatFeatures);
943 }
944} /* AdaptToChar */
945
947#ifndef GRAPHICS_DISABLED
948 INT_FX_RESULT_STRUCT fx_info;
952 &bl_features);
953 if (sample == nullptr) return;
954
955 UnicharRating int_result;
956 im_.Match(int_class, AllProtosOn, AllConfigsOn,
957 bl_features.size(), &bl_features[0],
960 tprintf("Best match to temp config %d = %4.1f%%.\n",
961 int_result.config, int_result.rating * 100.0);
963 uint32_t ConfigMask;
964 ConfigMask = 1 << int_result.config;
966 im_.Match(int_class, AllProtosOn, static_cast<BIT_VECTOR>(&ConfigMask),
967 bl_features.size(), &bl_features[0],
971 }
972
973 delete sample;
974#endif
975}
976
995 ADAPT_RESULTS *results) {
996 int old_match = FindScoredUnichar(new_result.unichar_id, *results);
997
998 if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
999 (old_match < results->match.size() &&
1000 new_result.rating <= results->match[old_match].rating))
1001 return; // New one not good enough.
1002
1003 if (!unicharset.get_fragment(new_result.unichar_id))
1004 results->HasNonfragment = true;
1005
1006 if (old_match < results->match.size()) {
1007 results->match[old_match].rating = new_result.rating;
1008 } else {
1009 results->match.push_back(new_result);
1010 }
1011
1012 if (new_result.rating > results->best_rating &&
1013 // Ensure that fragments do not affect best rating, class and config.
1014 // This is needed so that at least one non-fragmented character is
1015 // always present in the results.
1016 // TODO(daria): verify that this helps accuracy and does not
1017 // hurt performance.
1018 !unicharset.get_fragment(new_result.unichar_id)) {
1019 results->best_match_index = old_match;
1020 results->best_rating = new_result.rating;
1021 results->best_unichar_id = new_result.unichar_id;
1022 }
1023} /* AddNewResult */
1024
1025
1026/*---------------------------------------------------------------------------*/
1046 const GenericVector<INT_FEATURE_STRUCT>& int_features,
1047 const INT_FX_RESULT_STRUCT& fx_info,
1048 const TBLOB *blob,
1049 INT_TEMPLATES templates,
1050 ADAPT_CLASS *classes,
1051 UNICHAR_ID *ambiguities,
1052 ADAPT_RESULTS *results) {
1053 if (int_features.empty()) return;
1054 auto* CharNormArray = new uint8_t[unicharset.size()];
1055 UnicharRating int_result;
1056
1057 results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr,
1058 CharNormArray);
1059 bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1060 if (debug)
1061 tprintf("AM Matches = ");
1062
1063 int top = blob->bounding_box().top();
1064 int bottom = blob->bounding_box().bottom();
1065 while (*ambiguities >= 0) {
1066 CLASS_ID class_id = *ambiguities;
1067
1068 int_result.unichar_id = class_id;
1069 im_.Match(ClassForClassId(templates, class_id),
1071 int_features.size(), &int_features[0],
1072 &int_result,
1075
1076 ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0,
1077 results->BlobLength,
1079 CharNormArray, &int_result, results);
1080 ambiguities++;
1081 }
1082 delete [] CharNormArray;
1083} /* AmbigClassifier */
1084
1085/*---------------------------------------------------------------------------*/
1089 int16_t num_features,
1090 const INT_FEATURE_STRUCT* features,
1091 const uint8_t* norm_factors,
1092 ADAPT_CLASS* classes,
1093 int debug,
1094 int matcher_multiplier,
1095 const TBOX& blob_box,
1096 const GenericVector<CP_RESULT_STRUCT>& results,
1097 ADAPT_RESULTS* final_results) {
1098 int top = blob_box.top();
1099 int bottom = blob_box.bottom();
1100 UnicharRating int_result;
1101 for (int c = 0; c < results.size(); c++) {
1102 CLASS_ID class_id = results[c].Class;
1103 BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos
1104 : AllProtosOn;
1105 BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs
1106 : AllConfigsOn;
1107
1108 int_result.unichar_id = class_id;
1109 im_.Match(ClassForClassId(templates, class_id),
1110 protos, configs,
1111 num_features, features,
1112 &int_result, classify_adapt_feature_threshold, debug,
1114 bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1115 ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top,
1116 results[c].Rating,
1117 final_results->BlobLength,
1118 matcher_multiplier, norm_factors,
1119 &int_result, final_results);
1120 }
1121}
1122
1123// Converts configs to fonts, and if the result is not adapted, and a
1124// shape_table_ is present, the shape is expanded to include all
1125// unichar_ids represented, before applying a set of corrections to the
1126// distance rating in int_result, (see ComputeCorrectedRating.)
1127// The results are added to the final_results output.
1129 ADAPT_CLASS* classes, bool debug, int class_id, int bottom, int top,
1130 float cp_rating, int blob_length, int matcher_multiplier,
1131 const uint8_t* cn_factors,
1132 UnicharRating* int_result, ADAPT_RESULTS* final_results) {
1133 if (classes != nullptr) {
1134 // Adapted result. Convert configs to fontinfo_ids.
1135 int_result->adapted = true;
1136 for (int f = 0; f < int_result->fonts.size(); ++f) {
1137 int_result->fonts[f].fontinfo_id =
1138 GetFontinfoId(classes[class_id], int_result->fonts[f].fontinfo_id);
1139 }
1140 } else {
1141 // Pre-trained result. Map fonts using font_sets_.
1142 int_result->adapted = false;
1143 for (int f = 0; f < int_result->fonts.size(); ++f) {
1144 int_result->fonts[f].fontinfo_id =
1146 int_result->fonts[f].fontinfo_id);
1147 }
1148 if (shape_table_ != nullptr) {
1149 // Two possible cases:
1150 // 1. Flat shapetable. All unichar-ids of the shapes referenced by
1151 // int_result->fonts are the same. In this case build a new vector of
1152 // mapped fonts and replace the fonts in int_result.
1153 // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
1154 // by int_result. In this case, build a vector of UnicharRating to
1155 // gather together different font-ids for each unichar. Also covers case1.
1156 GenericVector<UnicharRating> mapped_results;
1157 for (int f = 0; f < int_result->fonts.size(); ++f) {
1158 int shape_id = int_result->fonts[f].fontinfo_id;
1159 const Shape& shape = shape_table_->GetShape(shape_id);
1160 for (int c = 0; c < shape.size(); ++c) {
1161 int unichar_id = shape[c].unichar_id;
1162 if (!unicharset.get_enabled(unichar_id)) continue;
1163 // Find the mapped_result for unichar_id.
1164 int r = 0;
1165 for (r = 0; r < mapped_results.size() &&
1166 mapped_results[r].unichar_id != unichar_id; ++r) {}
1167 if (r == mapped_results.size()) {
1168 mapped_results.push_back(*int_result);
1169 mapped_results[r].unichar_id = unichar_id;
1170 mapped_results[r].fonts.truncate(0);
1171 }
1172 for (int i = 0; i < shape[c].font_ids.size(); ++i) {
1173 mapped_results[r].fonts.push_back(
1174 ScoredFont(shape[c].font_ids[i], int_result->fonts[f].score));
1175 }
1176 }
1177 }
1178 for (int m = 0; m < mapped_results.size(); ++m) {
1179 mapped_results[m].rating =
1180 ComputeCorrectedRating(debug, mapped_results[m].unichar_id,
1181 cp_rating, int_result->rating,
1182 int_result->feature_misses, bottom, top,
1183 blob_length, matcher_multiplier, cn_factors);
1184 AddNewResult(mapped_results[m], final_results);
1185 }
1186 return;
1187 }
1188 }
1189 if (unicharset.get_enabled(class_id)) {
1190 int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating,
1191 int_result->rating,
1192 int_result->feature_misses,
1193 bottom, top, blob_length,
1194 matcher_multiplier, cn_factors);
1195 AddNewResult(*int_result, final_results);
1196 }
1197}
1198
1199// Applies a set of corrections to the confidence im_rating,
1200// including the cn_correction, miss penalty and additional penalty
1201// for non-alnums being vertical misfits. Returns the corrected confidence.
1202double Classify::ComputeCorrectedRating(bool debug, int unichar_id,
1203 double cp_rating, double im_rating,
1204 int feature_misses,
1205 int bottom, int top,
1206 int blob_length, int matcher_multiplier,
1207 const uint8_t* cn_factors) {
1208 // Compute class feature corrections.
1209 double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length,
1210 cn_factors[unichar_id],
1211 matcher_multiplier);
1212 double miss_penalty = tessedit_class_miss_scale * feature_misses;
1213 double vertical_penalty = 0.0;
1214 // Penalize non-alnums for being vertical misfits.
1215 if (!unicharset.get_isalpha(unichar_id) &&
1216 !unicharset.get_isdigit(unichar_id) &&
1217 cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
1218 int min_bottom, max_bottom, min_top, max_top;
1219 unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
1220 &min_top, &max_top);
1221 if (debug) {
1222 tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
1223 top, min_top, max_top, bottom, min_bottom, max_bottom);
1224 }
1225 if (top < min_top || top > max_top ||
1226 bottom < min_bottom || bottom > max_bottom) {
1227 vertical_penalty = classify_misfit_junk_penalty;
1228 }
1229 }
1230 double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
1231 if (result < WORST_POSSIBLE_RATING)
1232 result = WORST_POSSIBLE_RATING;
1233 if (debug) {
1234 tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
1235 unicharset.id_to_unichar(unichar_id),
1236 result * 100.0,
1237 cp_rating * 100.0,
1238 (1.0 - im_rating) * 100.0,
1239 (cn_corrected - (1.0 - im_rating)) * 100.0,
1240 cn_factors[unichar_id],
1241 miss_penalty * 100.0,
1242 vertical_penalty * 100.0);
1243 }
1244 return result;
1245}
1246
1247/*---------------------------------------------------------------------------*/
1266 TBLOB *Blob, const GenericVector<INT_FEATURE_STRUCT>& int_features,
1267 const INT_FX_RESULT_STRUCT& fx_info,
1268 ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results) {
1269 if (int_features.empty()) return nullptr;
1270 auto* CharNormArray = new uint8_t[unicharset.size()];
1271 ClearCharNormArray(CharNormArray);
1272
1274 PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0],
1275 CharNormArray, BaselineCutoffs, &Results->CPResults);
1276
1278 tprintf("BL Matches = ");
1279
1280 MasterMatcher(Templates->Templates, int_features.size(), &int_features[0],
1281 CharNormArray,
1282 Templates->Class, matcher_debug_flags, 0,
1283 Blob->bounding_box(), Results->CPResults, Results);
1284
1285 delete [] CharNormArray;
1286 CLASS_ID ClassId = Results->best_unichar_id;
1287 if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0)
1288 return nullptr;
1289
1290 return Templates->Class[ClassId]->
1291 Config[Results->match[Results->best_match_index].config].Perm->Ambigs;
1292} /* BaselineClassifier */
1293
1294
1295/*---------------------------------------------------------------------------*/
1312 const TrainingSample& sample,
1313 ADAPT_RESULTS *adapt_results) {
1314 // This is the length that is used for scaling ratings vs certainty.
1315 adapt_results->BlobLength =
1316 IntCastRounded(sample.outline_length() / kStandardFeatureLength);
1317 GenericVector<UnicharRating> unichar_results;
1318 static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0,
1319 -1, &unichar_results);
1320 // Convert results to the format used internally by AdaptiveClassifier.
1321 for (int r = 0; r < unichar_results.size(); ++r) {
1322 AddNewResult(unichar_results[r], adapt_results);
1323 }
1324 return sample.num_features();
1325} /* CharNormClassifier */
1326
1327// As CharNormClassifier, but operates on a TrainingSample and outputs to
1328// a GenericVector of ShapeRating without conversion to classes.
1330 int keep_this,
1331 const TrainingSample& sample,
1333 results->clear();
1334 auto* adapt_results = new ADAPT_RESULTS();
1335 adapt_results->Initialize();
1336 // Compute the bounding box of the features.
1337 uint32_t num_features = sample.num_features();
1338 // Only the top and bottom of the blob_box are used by MasterMatcher, so
1339 // fabricate right and left using top and bottom.
1340 TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
1341 sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
1342 // Compute the char_norm_array from the saved cn_feature.
1343 FEATURE norm_feature = sample.GetCNFeature();
1344 auto* char_norm_array = new uint8_t[unicharset.size()];
1345 int num_pruner_classes = std::max(unicharset.size(),
1347 auto* pruner_norm_array = new uint8_t[num_pruner_classes];
1348 adapt_results->BlobLength =
1349 static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
1350 ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
1351 pruner_norm_array);
1352
1353 PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(),
1354 pruner_norm_array,
1355 shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
1356 &adapt_results->CPResults);
1357 delete [] pruner_norm_array;
1358 if (keep_this >= 0) {
1359 adapt_results->CPResults[0].Class = keep_this;
1360 adapt_results->CPResults.truncate(1);
1361 }
1362 if (pruner_only) {
1363 // Convert pruner results to output format.
1364 for (int i = 0; i < adapt_results->CPResults.size(); ++i) {
1365 int class_id = adapt_results->CPResults[i].Class;
1366 results->push_back(
1367 UnicharRating(class_id, 1.0f - adapt_results->CPResults[i].Rating));
1368 }
1369 } else {
1370 MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
1371 char_norm_array,
1372 nullptr, matcher_debug_flags,
1374 blob_box, adapt_results->CPResults, adapt_results);
1375 // Convert master matcher results to output format.
1376 for (int i = 0; i < adapt_results->match.size(); i++) {
1377 results->push_back(adapt_results->match[i]);
1378 }
1380 }
1381 delete [] char_norm_array;
1382 delete adapt_results;
1383 return num_features;
1384} /* CharNormTrainingSample */
1385
1386
1387/*---------------------------------------------------------------------------*/
1400 float rating = results->BlobLength / matcher_avg_noise_size;
1401 rating *= rating;
1402 rating /= 1.0 + rating;
1403
1404 AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
1405} /* ClassifyAsNoise */
1406
1413void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
1414 ADAPT_RESULTS *Results,
1415 BLOB_CHOICE_LIST *Choices) {
1416 assert(Choices != nullptr);
1417 float Rating;
1418 float Certainty;
1419 BLOB_CHOICE_IT temp_it;
1420 bool contains_nonfrag = false;
1421 temp_it.set_to_list(Choices);
1422 int choices_length = 0;
1423 // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
1424 // number of returned results, but with a shape_table_ we want to have room
1425 // for at least the biggest shape (which might contain hundreds of Indic
1426 // grapheme fragments) and more, so use double the size of the biggest shape
1427 // if that is more than the default.
1428 int max_matches = MAX_MATCHES;
1429 if (shape_table_ != nullptr) {
1430 max_matches = shape_table_->MaxNumUnichars() * 2;
1431 if (max_matches < MAX_MATCHES)
1432 max_matches = MAX_MATCHES;
1433 }
1434
1435 float best_certainty = -FLT_MAX;
1436 for (int i = 0; i < Results->match.size(); i++) {
1437 const UnicharRating& result = Results->match[i];
1438 bool adapted = result.adapted;
1439 bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);
1440 if (temp_it.length()+1 == max_matches &&
1441 !contains_nonfrag && current_is_frag) {
1442 continue; // look for a non-fragmented character to fill the
1443 // last spot in Choices if only fragments are present
1444 }
1445 // BlobLength can never be legally 0, this means recognition failed.
1446 // But we must return a classification result because some invoking
1447 // functions (chopper/permuter) do not anticipate a null blob choice.
1448 // So we need to assign a poor, but not infinitely bad score.
1449 if (Results->BlobLength == 0) {
1450 Certainty = -20;
1451 Rating = 100; // should be -certainty * real_blob_length
1452 } else {
1453 Rating = Certainty = (1.0f - result.rating);
1454 Rating *= rating_scale * Results->BlobLength;
1455 Certainty *= -(getDict().certainty_scale);
1456 }
1457 // Adapted results, by their very nature, should have good certainty.
1458 // Those that don't are at best misleading, and often lead to errors,
1459 // so don't accept adapted results that are too far behind the best result,
1460 // whether adapted or static.
1461 // TODO(rays) find some way of automatically tuning these constants.
1462 if (Certainty > best_certainty) {
1463 best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));
1464 } else if (adapted &&
1465 Certainty / classify_adapted_pruning_factor < best_certainty) {
1466 continue; // Don't accept bad adapted results.
1467 }
1468
1469 float min_xheight, max_xheight, yshift;
1470 denorm.XHeightRange(result.unichar_id, unicharset, box,
1471 &min_xheight, &max_xheight, &yshift);
1472 auto* choice =
1473 new BLOB_CHOICE(result.unichar_id, Rating, Certainty,
1475 min_xheight, max_xheight, yshift,
1476 adapted ? BCC_ADAPTED_CLASSIFIER
1478 choice->set_fonts(result.fonts);
1479 temp_it.add_to_end(choice);
1480 contains_nonfrag |= !current_is_frag; // update contains_nonfrag
1481 choices_length++;
1482 if (choices_length >= max_matches) break;
1483 }
1484 Results->match.truncate(choices_length);
1485} // ConvertMatchesToChoices
1486
1487
1488/*---------------------------------------------------------------------------*/
1489#ifndef GRAPHICS_DISABLED
1498 ADAPT_RESULTS *Results) {
1499 if (static_classifier_ == nullptr) return;
1500 INT_FX_RESULT_STRUCT fx_info;
1503 BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1504 if (sample == nullptr) return;
1505 static_classifier_->DebugDisplay(*sample, blob->denorm().pix(),
1506 Results->best_unichar_id);
1507} /* DebugAdaptiveClassifier */
1508#endif
1509
1510/*---------------------------------------------------------------------------*/
1531 UNICHAR_ID *Ambiguities;
1532
1533 INT_FX_RESULT_STRUCT fx_info;
1537 &bl_features);
1538 if (sample == nullptr) return;
1539
1540 // TODO: With LSTM, static_classifier_ is nullptr.
1541 // Return to avoid crash in CharNormClassifier.
1542 if (static_classifier_ == nullptr) {
1543 delete sample;
1544 return;
1545 }
1546
1549 CharNormClassifier(Blob, *sample, Results);
1550 } else {
1551 Ambiguities = BaselineClassifier(Blob, bl_features, fx_info,
1552 AdaptedTemplates, Results);
1553 if ((!Results->match.empty() &&
1554 MarginalMatch(Results->best_rating,
1556 !tess_bn_matching) ||
1557 Results->match.empty()) {
1558 CharNormClassifier(Blob, *sample, Results);
1559 } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
1560 AmbigClassifier(bl_features, fx_info, Blob,
1563 Ambiguities,
1564 Results);
1565 }
1566 }
1567
1568 // Force the blob to be classified as noise
1569 // if the results contain only fragments.
1570 // TODO(daria): verify that this is better than
1571 // just adding a nullptr classification.
1572 if (!Results->HasNonfragment || Results->match.empty())
1573 ClassifyAsNoise(Results);
1574 delete sample;
1575} /* DoAdaptiveMatch */
1576
1577/*---------------------------------------------------------------------------*/
1593 CLASS_ID CorrectClass) {
1594 auto *Results = new ADAPT_RESULTS();
1595 UNICHAR_ID *Ambiguities;
1596 int i;
1597
1598 Results->Initialize();
1599 INT_FX_RESULT_STRUCT fx_info;
1603 &bl_features);
1604 if (sample == nullptr) {
1605 delete Results;
1606 return nullptr;
1607 }
1608
1609 CharNormClassifier(Blob, *sample, Results);
1610 delete sample;
1611 RemoveBadMatches(Results);
1612 Results->match.sort(&UnicharRating::SortDescendingRating);
1613
1614 /* copy the class id's into an string of ambiguities - don't copy if
1615 the correct class is the only class id matched */
1616 Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1617 if (Results->match.size() > 1 ||
1618 (Results->match.size() == 1 &&
1619 Results->match[0].unichar_id != CorrectClass)) {
1620 for (i = 0; i < Results->match.size(); i++)
1621 Ambiguities[i] = Results->match[i].unichar_id;
1622 Ambiguities[i] = -1;
1623 } else {
1624 Ambiguities[0] = -1;
1625 }
1626
1627 delete Results;
1628 return Ambiguities;
1629} /* GetAmbiguities */
1630
1631// Returns true if the given blob looks too dissimilar to any character
1632// present in the classifier templates.
1634 auto *ratings = new BLOB_CHOICE_LIST();
1635 AdaptiveClassifier(blob, ratings);
1636 BLOB_CHOICE_IT ratings_it(ratings);
1639 print_ratings_list("======================\nLooksLikeGarbage() got ",
1640 ratings, unicharset);
1641 }
1642 for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list();
1643 ratings_it.forward()) {
1644 if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {
1645 continue;
1646 }
1647 float certainty = ratings_it.data()->certainty();
1648 delete ratings;
1649 return certainty <
1651 }
1652 delete ratings;
1653 return true; // no whole characters in ratings
1654}
1655
1656/*---------------------------------------------------------------------------*/
1679 INT_TEMPLATES templates,
1680 uint8_t* pruner_norm_array,
1681 uint8_t* char_norm_array) {
1682 FEATURE norm_feature = NewFeature(&CharNormDesc);
1684 float scale = MF_SCALE_FACTOR;
1685 norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1686 norm_feature->Params[CharNormLength] =
1687 fx_info.Length * scale / LENGTH_COMPRESSION;
1688 norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1689 norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1690 // Deletes norm_feature.
1691 ComputeCharNormArrays(norm_feature, templates, char_norm_array,
1692 pruner_norm_array);
1694} /* GetCharNormFeature */
1695
1696// Computes the char_norm_array for the unicharset and, if not nullptr, the
1697// pruner_array as appropriate according to the existence of the shape_table.
1699 INT_TEMPLATES_STRUCT* templates,
1700 uint8_t* char_norm_array,
1701 uint8_t* pruner_array) {
1702 ComputeIntCharNormArray(*norm_feature, char_norm_array);
1703 if (pruner_array != nullptr) {
1704 if (shape_table_ == nullptr) {
1705 ComputeIntCharNormArray(*norm_feature, pruner_array);
1706 } else {
1707 memset(pruner_array, UINT8_MAX,
1708 templates->NumClasses * sizeof(pruner_array[0]));
1709 // Each entry in the pruner norm array is the MIN of all the entries of
1710 // the corresponding unichars in the CharNormArray.
1711 for (int id = 0; id < templates->NumClasses; ++id) {
1712 int font_set_id = templates->Class[id]->font_set_id;
1713 const FontSet &fs = fontset_table_.get(font_set_id);
1714 for (int config = 0; config < fs.size; ++config) {
1715 const Shape& shape = shape_table_->GetShape(fs.configs[config]);
1716 for (int c = 0; c < shape.size(); ++c) {
1717 if (char_norm_array[shape[c].unichar_id] < pruner_array[id])
1718 pruner_array[id] = char_norm_array[shape[c].unichar_id];
1719 }
1720 }
1721 }
1722 }
1723 }
1724 FreeFeature(norm_feature);
1725}
1726
1727/*---------------------------------------------------------------------------*/
1741 CLASS_ID ClassId,
1742 int FontinfoId,
1743 int NumFeatures,
1744 INT_FEATURE_ARRAY Features,
1745 FEATURE_SET FloatFeatures) {
1746 INT_CLASS IClass;
1747 ADAPT_CLASS Class;
1748 PROTO_ID OldProtos[MAX_NUM_PROTOS];
1749 FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
1750 int NumOldProtos;
1751 int NumBadFeatures;
1752 int MaxProtoId, OldMaxProtoId;
1753 int MaskSize;
1754 int ConfigId;
1756 int i;
1757 int debug_level = NO_DEBUG;
1758
1760 debug_level =
1762
1763 IClass = ClassForClassId(Templates->Templates, ClassId);
1764 Class = Templates->Class[ClassId];
1765
1766 if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
1767 ++NumAdaptationsFailed;
1769 cprintf("Cannot make new temporary config: maximum number exceeded.\n");
1770 return -1;
1771 }
1772
1773 OldMaxProtoId = IClass->NumProtos - 1;
1774
1775 NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff,
1776 NumFeatures, Features,
1778 debug_level);
1779
1780 MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
1781 zero_all_bits(TempProtoMask, MaskSize);
1782 for (i = 0; i < NumOldProtos; i++)
1783 SET_BIT(TempProtoMask, OldProtos[i]);
1784
1785 NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn,
1786 NumFeatures, Features,
1787 BadFeatures,
1789 debug_level);
1790
1791 MaxProtoId = MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures,
1792 IClass, Class, TempProtoMask);
1793 if (MaxProtoId == NO_PROTO) {
1794 ++NumAdaptationsFailed;
1796 cprintf("Cannot make new temp protos: maximum number exceeded.\n");
1797 return -1;
1798 }
1799
1800 ConfigId = AddIntConfig(IClass);
1801 ConvertConfig(TempProtoMask, ConfigId, IClass);
1802 Config = NewTempConfig(MaxProtoId, FontinfoId);
1803 TempConfigFor(Class, ConfigId) = Config;
1804 copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
1805
1807 cprintf("Making new temp config %d fontinfo id %d"
1808 " using %d old and %d new protos.\n",
1809 ConfigId, Config->FontinfoId,
1810 NumOldProtos, MaxProtoId - OldMaxProtoId);
1811
1812 return ConfigId;
1813} /* MakeNewTemporaryConfig */
1814
1815/*---------------------------------------------------------------------------*/
1835 int NumBadFeat,
1836 FEATURE_ID BadFeat[],
1837 INT_CLASS IClass,
1838 ADAPT_CLASS Class,
1839 BIT_VECTOR TempProtoMask) {
1840 FEATURE_ID *ProtoStart;
1841 FEATURE_ID *ProtoEnd;
1842 FEATURE_ID *LastBad;
1843 TEMP_PROTO TempProto;
1844 PROTO Proto;
1845 FEATURE F1, F2;
1846 float X1, X2, Y1, Y2;
1847 float A1, A2, AngleDelta;
1848 float SegmentLength;
1849 PROTO_ID Pid;
1850
1851 for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat;
1852 ProtoStart < LastBad; ProtoStart = ProtoEnd) {
1853 F1 = Features->Features[*ProtoStart];
1854 X1 = F1->Params[PicoFeatX];
1855 Y1 = F1->Params[PicoFeatY];
1856 A1 = F1->Params[PicoFeatDir];
1857
1858 for (ProtoEnd = ProtoStart + 1,
1859 SegmentLength = GetPicoFeatureLength();
1860 ProtoEnd < LastBad;
1861 ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
1862 F2 = Features->Features[*ProtoEnd];
1863 X2 = F2->Params[PicoFeatX];
1864 Y2 = F2->Params[PicoFeatY];
1865 A2 = F2->Params[PicoFeatDir];
1866
1867 AngleDelta = fabs(A1 - A2);
1868 if (AngleDelta > 0.5)
1869 AngleDelta = 1.0 - AngleDelta;
1870
1871 if (AngleDelta > matcher_clustering_max_angle_delta ||
1872 fabs(X1 - X2) > SegmentLength ||
1873 fabs(Y1 - Y2) > SegmentLength)
1874 break;
1875 }
1876
1877 F2 = Features->Features[*(ProtoEnd - 1)];
1878 X2 = F2->Params[PicoFeatX];
1879 Y2 = F2->Params[PicoFeatY];
1880 A2 = F2->Params[PicoFeatDir];
1881
1882 Pid = AddIntProto(IClass);
1883 if (Pid == NO_PROTO)
1884 return (NO_PROTO);
1885
1886 TempProto = NewTempProto();
1887 Proto = &(TempProto->Proto);
1888
1889 /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
1890 ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
1891 instead of the -0.25 to 0.75 used in baseline normalization */
1892 Proto->Length = SegmentLength;
1893 Proto->Angle = A1;
1894 Proto->X = (X1 + X2) / 2.0;
1895 Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
1896 FillABC(Proto);
1897
1898 TempProto->ProtoId = Pid;
1899 SET_BIT(TempProtoMask, Pid);
1900
1901 ConvertProto(Proto, Pid, IClass);
1902 AddProtoToProtoPruner(Proto, Pid, IClass,
1904
1905 Class->TempProtos = push(Class->TempProtos, TempProto);
1906 }
1907 return IClass->NumProtos - 1;
1908} /* MakeNewTempProtos */
1909
1910/*---------------------------------------------------------------------------*/
1921 CLASS_ID ClassId,
1922 int ConfigId,
1923 TBLOB *Blob) {
1924 UNICHAR_ID *Ambigs;
1926 ADAPT_CLASS Class;
1927 PROTO_KEY ProtoKey;
1928
1929 Class = Templates->Class[ClassId];
1930 Config = TempConfigFor(Class, ConfigId);
1931
1932 MakeConfigPermanent(Class, ConfigId);
1933 if (Class->NumPermConfigs == 0)
1934 Templates->NumPermClasses++;
1935 Class->NumPermConfigs++;
1936
1937 // Initialize permanent config.
1938 Ambigs = GetAmbiguities(Blob, ClassId);
1939 auto Perm = static_cast<PERM_CONFIG>(malloc(sizeof(PERM_CONFIG_STRUCT)));
1940 Perm->Ambigs = Ambigs;
1941 Perm->FontinfoId = Config->FontinfoId;
1942
1943 // Free memory associated with temporary config (since ADAPTED_CONFIG
1944 // is a union we need to clean up before we record permanent config).
1945 ProtoKey.Templates = Templates;
1946 ProtoKey.ClassId = ClassId;
1947 ProtoKey.ConfigId = ConfigId;
1948 Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
1950
1951 // Record permanent config.
1952 PermConfigFor(Class, ConfigId) = Perm;
1953
1955 tprintf("Making config %d for %s (ClassId %d) permanent:"
1956 " fontinfo id %d, ambiguities '",
1957 ConfigId, getDict().getUnicharset().debug_str(ClassId).string(),
1958 ClassId, PermConfigFor(Class, ConfigId)->FontinfoId);
1959 for (UNICHAR_ID *AmbigsPointer = Ambigs;
1960 *AmbigsPointer >= 0; ++AmbigsPointer)
1961 tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
1962 tprintf("'.\n");
1963 }
1964} /* MakePermanent */
1965} // namespace tesseract
1966
1967/*---------------------------------------------------------------------------*/
1980int MakeTempProtoPerm(void *item1, void *item2) {
1981 ADAPT_CLASS Class;
1983 TEMP_PROTO TempProto;
1984 PROTO_KEY *ProtoKey;
1985
1986 TempProto = static_cast<TEMP_PROTO>(item1);
1987 ProtoKey = static_cast<PROTO_KEY *>(item2);
1988
1989 Class = ProtoKey->Templates->Class[ProtoKey->ClassId];
1990 Config = TempConfigFor(Class, ProtoKey->ConfigId);
1991
1992 if (TempProto->ProtoId > Config->MaxProtoId ||
1993 !test_bit (Config->Protos, TempProto->ProtoId))
1994 return false;
1995
1996 MakeProtoPermanent(Class, TempProto->ProtoId);
1997 AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId,
1998 ProtoKey->Templates->Templates);
1999 FreeTempProto(TempProto);
2000
2001 return true;
2002} /* MakeTempProtoPerm */
2003
2004/*---------------------------------------------------------------------------*/
2005namespace tesseract {
2014 for (int i = 0; i < results.match.size(); ++i) {
2015 tprintf("%s ", unicharset.debug_str(results.match[i].unichar_id).string());
2016 results.match[i].Print();
2017 }
2018} /* PrintAdaptiveMatchResults */
2019
2020/*---------------------------------------------------------------------------*/
2034 int Next, NextGood;
2035 float BadMatchThreshold;
2036 static const char* romans = "i v x I V X";
2037 BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
2038
2040 UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
2041 unicharset.unichar_to_id("1") : -1;
2042 UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
2043 unicharset.unichar_to_id("0") : -1;
2044 float scored_one = ScoredUnichar(unichar_id_one, *Results);
2045 float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
2046
2047 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2048 const UnicharRating& match = Results->match[Next];
2049 if (match.rating >= BadMatchThreshold) {
2050 if (!unicharset.get_isalpha(match.unichar_id) ||
2051 strstr(romans,
2052 unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2053 } else if (unicharset.eq(match.unichar_id, "l") &&
2054 scored_one < BadMatchThreshold) {
2055 Results->match[Next].unichar_id = unichar_id_one;
2056 } else if (unicharset.eq(match.unichar_id, "O") &&
2057 scored_zero < BadMatchThreshold) {
2058 Results->match[Next].unichar_id = unichar_id_zero;
2059 } else {
2060 Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
2061 }
2062 if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
2063 if (NextGood == Next) {
2064 ++NextGood;
2065 } else {
2066 Results->match[NextGood++] = Results->match[Next];
2067 }
2068 }
2069 }
2070 }
2071 } else {
2072 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2073 if (Results->match[Next].rating >= BadMatchThreshold) {
2074 if (NextGood == Next) {
2075 ++NextGood;
2076 } else {
2077 Results->match[NextGood++] = Results->match[Next];
2078 }
2079 }
2080 }
2081 }
2082 Results->match.truncate(NextGood);
2083} /* RemoveBadMatches */
2084
2085/*----------------------------------------------------------------------------*/
2094 int Next, NextGood;
2095 int punc_count; /*no of garbage characters */
2096 int digit_count;
2097 /*garbage characters */
2098 static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
2099 static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
2100
2101 punc_count = 0;
2102 digit_count = 0;
2103 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2104 const UnicharRating& match = Results->match[Next];
2105 bool keep = true;
2106 if (strstr(punc_chars,
2107 unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2108 if (punc_count >= 2)
2109 keep = false;
2110 punc_count++;
2111 } else {
2112 if (strstr(digit_chars,
2113 unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2114 if (digit_count >= 1)
2115 keep = false;
2116 digit_count++;
2117 }
2118 }
2119 if (keep) {
2120 if (NextGood == Next) {
2121 ++NextGood;
2122 } else {
2123 Results->match[NextGood++] = match;
2124 }
2125 }
2126 }
2127 Results->match.truncate(NextGood);
2128} /* RemoveExtraPuncs */
2129
2130/*---------------------------------------------------------------------------*/
2141void Classify::SetAdaptiveThreshold(float Threshold) {
2142 Threshold = (Threshold == matcher_good_threshold) ? 0.9: (1.0 - Threshold);
2144 ClipToRange<int>(255 * Threshold, 0, 255));
2146 ClipToRange<int>(255 * Threshold, 0, 255));
2147} /* SetAdaptiveThreshold */
2148
2149/*---------------------------------------------------------------------------*/
2160 const INT_FEATURE_STRUCT* features,
2161 int num_features) {
2162#ifndef GRAPHICS_DISABLED
2163 uint32_t config_mask;
2164 if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2165 tprintf("No built-in templates for class/shape %d\n", shape_id);
2166 return;
2167 }
2168 if (num_features <= 0) {
2169 tprintf("Illegal blob (char norm features)!\n");
2170 return;
2171 }
2172 UnicharRating cn_result;
2176 num_features, features, &cn_result,
2179 tprintf("\n");
2180 config_mask = 1 << cn_result.config;
2181
2182 tprintf("Static Shape ID: %d\n", shape_id);
2185 &config_mask, num_features, features, &cn_result,
2189#endif // GRAPHICS_DISABLED
2190} /* ShowBestMatchFor */
2191
2192// Returns a string for the classifier class_id: either the corresponding
2193// unicharset debug_str or the shape_table_ debug str.
2195 int class_id, int config_id) const {
2196 STRING class_string;
2197 if (templates == PreTrainedTemplates && shape_table_ != nullptr) {
2198 int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
2199 class_string = shape_table_->DebugStr(shape_id);
2200 } else {
2201 class_string = unicharset.debug_str(class_id);
2202 }
2203 return class_string;
2204}
2205
2206// Converts a classifier class_id index to a shape_table_ index
2208 int int_result_config) const {
2209 int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
2210 // Older inttemps have no font_ids.
2211 if (font_set_id < 0)
2212 return kBlankFontinfoId;
2213 const FontSet &fs = fontset_table_.get(font_set_id);
2214 ASSERT_HOST(int_result_config >= 0 && int_result_config < fs.size);
2215 return fs.configs[int_result_config];
2216}
2217
2218// Converts a shape_table_ index to a classifier class_id index (not a
2219// unichar-id!). Uses a search, so not fast.
2220int Classify::ShapeIDToClassID(int shape_id) const {
2221 for (int id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
2222 int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
2223 ASSERT_HOST(font_set_id >= 0);
2224 const FontSet &fs = fontset_table_.get(font_set_id);
2225 for (int config = 0; config < fs.size; ++config) {
2226 if (fs.configs[config] == shape_id)
2227 return id;
2228 }
2229 }
2230 tprintf("Shape %d not found\n", shape_id);
2231 return -1;
2232}
2233
2234// Returns true if the given TEMP_CONFIG is good enough to make it
2235// a permanent config.
2237 const TEMP_CONFIG &config) {
2239 tprintf("NumTimesSeen for config of %s is %d\n",
2240 getDict().getUnicharset().debug_str(class_id).string(),
2241 config->NumTimesSeen);
2242 }
2244 return true;
2246 return false;
2247 } else if (use_ambigs_for_adaption) {
2248 // Go through the ambigs vector and see whether we have already seen
2249 // enough times all the characters represented by the ambigs vector.
2250 const UnicharIdVector *ambigs =
2252 int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2253 for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2254 ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
2255 assert(ambig_class != nullptr);
2256 if (ambig_class->NumPermConfigs == 0 &&
2257 ambig_class->MaxNumTimesSeen <
2260 tprintf("Ambig %s has not been seen enough times,"
2261 " not making config for %s permanent\n",
2262 getDict().getUnicharset().debug_str(
2263 (*ambigs)[ambig]).string(),
2264 getDict().getUnicharset().debug_str(class_id).string());
2265 }
2266 return false;
2267 }
2268 }
2269 }
2270 return true;
2271}
2272
2274 const UnicharIdVector *ambigs =
2276 int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2278 tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
2279 getDict().getUnicharset().debug_str(class_id).string(), class_id);
2280 }
2281 for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2282 CLASS_ID ambig_class_id = (*ambigs)[ambig];
2283 const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];
2284 for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
2285 if (ConfigIsPermanent(ambigs_class, cfg)) continue;
2286 const TEMP_CONFIG config =
2287 TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
2288 if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {
2290 tprintf("Making config %d of %s permanent\n", cfg,
2291 getDict().getUnicharset().debug_str(
2292 ambig_class_id).string());
2293 }
2294 MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
2295 }
2296 }
2297 }
2298}
2299
2300} // namespace tesseract
const int kBlnBaselineOffset
Definition: normalis.h:25
const int kBlnXHeight
Definition: normalis.h:24
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
@ BCC_ADAPTED_CLASSIFIER
Definition: ratngs.h:45
@ BCC_STATIC_CLASSIFIER
Definition: ratngs.h:44
#define ASSERT_HOST(x)
Definition: errcode.h:88
int IntCastRounded(double x)
Definition: helpers.h:175
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
@ UNICHAR_SPACE
Definition: unicharset.h:34
TEMP_PROTO NewTempProto()
Definition: adaptive.cpp:228
void FreeTempConfig(TEMP_CONFIG Config)
Definition: adaptive.cpp:74
void FreeTempProto(void *arg)
Definition: adaptive.cpp:81
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:182
TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId)
Definition: adaptive.cpp:203
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:79
#define MakeProtoPermanent(Class, ProtoId)
Definition: adaptive.h:88
#define MakeConfigPermanent(Class, ConfigId)
Definition: adaptive.h:85
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:82
#define IncreaseConfidence(TempConfig)
Definition: adaptive.h:95
#define PermConfigFor(Class, ConfigId)
Definition: adaptive.h:93
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:91
#define MAX_ADAPTABLE_WERD_SIZE
Definition: adaptmatch.cpp:80
#define UNLIKELY_NUM_FEAT
Definition: adaptmatch.cpp:78
void SetAdaptiveThreshold(float Threshold)
#define NO_DEBUG
Definition: adaptmatch.cpp:79
#define ADAPTABLE_WERD_ADJUSTMENT
Definition: adaptmatch.cpp:82
#define ADAPT_TEMPLATE_SUFFIX
Definition: adaptmatch.cpp:75
#define WORST_POSSIBLE_RATING
Definition: adaptmatch.cpp:86
#define Y_DIM_OFFSET
Definition: adaptmatch.cpp:84
void InitMatcherRatings(float *Rating)
bool MarginalMatch(float confidence, float matcher_great_threshold)
Definition: adaptmatch.cpp:131
#define MAX_MATCHES
Definition: adaptmatch.cpp:77
int MakeTempProtoPerm(void *item1, void *item2)
const FEATURE_DESC_STRUCT CharNormDesc
void InitIntegerFX()
Definition: intfx.cpp:49
const double kStandardFeatureLength
Definition: intfx.h:46
void AddProtoToClassPruner(PROTO Proto, CLASS_ID ClassId, INT_TEMPLATES Templates)
Definition: intproto.cpp:328
void UpdateMatchDisplay()
Definition: intproto.cpp:447
int AddIntProto(INT_CLASS Class)
Definition: intproto.cpp:282
int AddIntConfig(INT_CLASS Class)
Definition: intproto.cpp:261
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class)
Definition: intproto.cpp:463
void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class, bool debug)
Definition: intproto.cpp:367
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:698
#define UnusedClassIdIn(T, c)
Definition: intproto.h:177
#define MAX_NUM_PROTOS
Definition: intproto.h:48
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:129
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: intproto.h:152
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
#define ClassForClassId(T, c)
Definition: intproto.h:178
#define LegalClassId(c)
Definition: intproto.h:176
#define PRINT_MATCH_SUMMARY
Definition: intproto.h:188
#define PRINT_PROTO_MATCHES
Definition: intproto.h:192
#define PRINT_FEATURE_MATCHES
Definition: intproto.h:191
@ baseline
Definition: mfoutline.h:63
@ character
Definition: mfoutline.h:63
const float MF_SCALE_FACTOR
Definition: mfoutline.h:71
float ActualOutlineLength(FEATURE Feature)
Definition: normfeat.cpp:32
#define LENGTH_COMPRESSION
Definition: normfeat.h:27
@ CharNormRx
Definition: normfeat.h:30
@ CharNormY
Definition: normfeat.h:30
@ CharNormRy
Definition: normfeat.h:30
@ CharNormLength
Definition: normfeat.h:30
FEATURE NewFeature(const FEATURE_DESC_STRUCT *FeatureDesc)
Definition: ocrfeatures.cpp:78
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:62
void FreeFeature(FEATURE Feature)
Definition: ocrfeatures.cpp:54
@ OutlineFeatDir
Definition: outfeat.h:32
@ OutlineFeatX
Definition: outfeat.h:29
@ OutlineFeatY
Definition: outfeat.h:30
@ OutlineFeatLength
Definition: outfeat.h:31
#define GetPicoFeatureLength()
Definition: picofeat.h:57
@ GeoBottom
Definition: picofeat.h:37
@ GeoTop
Definition: picofeat.h:38
@ PicoFeatY
Definition: picofeat.h:44
@ PicoFeatDir
Definition: picofeat.h:44
@ PicoFeatX
Definition: picofeat.h:44
void FillABC(PROTO Proto)
Definition: protos.cpp:108
#define reset_bit(array, bit)
Definition: bitvec.h:57
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
#define test_bit(array, bit)
Definition: bitvec.h:59
#define SET_BIT(array, bit)
Definition: bitvec.h:55
void cprintf(const char *format,...)
Definition: callcpp.cpp:32
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
LIST delete_d(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:110
LIST push(LIST list, void *element)
Definition: oldlist.cpp:213
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
#define NO_PROTO
Definition: matchdefs.h:41
int16_t PROTO_ID
Definition: matchdefs.h:40
uint8_t FEATURE_ID
Definition: matchdefs.h:46
CLUSTERCONFIG Config
@ TESSDATA_SHAPE_TABLE
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:77
CharSegmentationType
Definition: classify.h:96
@ CST_WHOLE
Definition: classify.h:98
@ CST_FRAGMENT
Definition: classify.h:97
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
int length() const
Definition: genericvector.h:86
void truncate(int size)
Definition: blobs.h:284
TBOX bounding_box() const
Definition: blobs.cpp:468
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:346
const DENORM & denorm() const
Definition: blobs.h:363
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:510
int NumBlobs() const
Definition: blobs.h:448
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
TBOX bounding_box() const
Definition: blobs.cpp:861
void plot(ScrollView *window)
Definition: blobs.cpp:897
Pix * pix() const
Definition: normalis.h:246
void XHeightRange(int unichar_id, const UNICHARSET &unicharset, const TBOX &bbox, float *min_xht, float *max_xht, float *yshift) const
Definition: normalis.cpp:428
TWERD * rebuild_word
Definition: pageres.h:266
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1078
const FontInfo * fontinfo
Definition: pageres.h:309
GenericVector< SEAM * > seam_array
Definition: pageres.h:214
WERD_CHOICE * best_choice
Definition: pageres.h:241
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:561
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:439
GenericVector< int > best_state
Definition: pageres.h:285
TWERD * chopped_word
Definition: pageres.h:212
GenericVector< STRING > correct_text
Definition: pageres.h:289
const STRING debug_string() const
Definition: ratngs.h:495
float adjust_factor() const
Definition: ratngs.h:296
int length() const
Definition: ratngs.h:293
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
int16_t right() const
Definition: rect.h:79
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:188
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:210
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:183
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:192
STRING language_data_path_prefix
Definition: ccutil.h:72
STRING imagefile
Definition: ccutil.h:77
UNICHARSET unicharset
Definition: ccutil.h:73
bool use_ambigs_for_adaption
Definition: ccutil.h:89
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:197
Definition: strngs.h:45
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
bool GetComponent(TessdataType type, TFile *fp)
STRING to_string() const
Definition: unicharset.h:79
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:878
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
int size() const
Definition: unicharset.h:341
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
uint16_t ProtoId
Definition: adaptive.h:28
PROTO_STRUCT Proto
Definition: adaptive.h:29
uint8_t NumTimesSeen
Definition: adaptive.h:36
UNICHAR_ID * Ambigs
Definition: adaptive.h:45
BIT_VECTOR PermProtos
Definition: adaptive.h:59
uint8_t MaxNumTimesSeen
Definition: adaptive.h:57
uint8_t NumPermConfigs
Definition: adaptive.h:56
BIT_VECTOR PermConfigs
Definition: adaptive.h:60
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
INT_TEMPLATES Templates
Definition: adaptive.h:67
uint8_t NumPermClasses
Definition: adaptive.h:69
UNICHAR_ID best_unichar_id
Definition: adaptmatch.cpp:94
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:97
void Initialize()
Definition: adaptmatch.cpp:102
GenericVector< CP_RESULT_STRUCT > CPResults
Definition: adaptmatch.cpp:98
int best_match_index
Definition: adaptmatch.cpp:95
bool HasNonfragment
Definition: adaptmatch.cpp:93
int32_t BlobLength
Definition: adaptmatch.cpp:92
float best_rating
Definition: adaptmatch.cpp:96
void ComputeBest()
Definition: adaptmatch.cpp:108
CLASS_ID ClassId
Definition: adaptmatch.cpp:124
ADAPT_TEMPLATES Templates
Definition: adaptmatch.cpp:123
double classify_adapted_pruning_factor
Definition: classify.h:477
BIT_VECTOR AllProtosOn
Definition: classify.h:522
bool matcher_debug_separate_windows
Definition: classify.h:494
IntegerMatcher im_
Definition: classify.h:540
double tessedit_class_miss_scale
Definition: classify.h:475
bool classify_debug_character_fragments
Definition: classify.h:491
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:821
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:946
void RemoveBadMatches(ADAPT_RESULTS *Results)
double matcher_bad_match_pad
Definition: classify.h:459
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:244
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:453
bool LooksLikeGarbage(TBLOB *blob)
STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:786
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
bool prioritize_division
Definition: classify.h:428
bool classify_enable_adaptive_debugger
Definition: classify.h:450
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
BIT_VECTOR TempProtoMask
Definition: classify.h:525
bool classify_save_adapted_templates
Definition: classify.h:449
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:151
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:994
double classify_adapted_pruning_threshold
Definition: classify.h:479
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:250
int matcher_min_examples_for_prototyping
Definition: classify.h:464
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:489
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:598
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:629
int classify_adapt_proto_threshold
Definition: classify.h:481
double matcher_perfect_threshold
Definition: classify.h:458
bool classify_nonlinear_norm
Definition: classify.h:452
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:452
ShapeTable * shape_table_
Definition: classify.h:546
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
void ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
bool classify_use_pre_adapted_templates
Definition: classify.h:447
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
bool classify_bln_numeric_mode
Definition: classify.h:508
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:129
void ClearCharNormArray(uint8_t *char_norm_array)
Definition: float2int.cpp:44
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:242
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:514
double classify_misfit_junk_penalty
Definition: classify.h:471
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:226
int matcher_permanent_classes_min
Definition: classify.h:462
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
bool classify_enable_learning
Definition: classify.h:429
double matcher_clustering_max_angle_delta
Definition: classify.h:468
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
double matcher_rating_margin
Definition: classify.h:460
int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
void MasterMatcher(INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
void ClassifyAsNoise(ADAPT_RESULTS *Results)
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId)
Definition: adaptive.cpp:173
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:190
double matcher_avg_noise_size
Definition: classify.h:461
BIT_VECTOR AllConfigsOff
Definition: classify.h:524
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
Definition: adaptmatch.cpp:693
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:613
double certainty_scale
Definition: classify.h:473
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:191
double matcher_reliable_adaptive_result
Definition: classify.h:457
bool disable_character_fragments
Definition: classify.h:486
void AmbigClassifier(const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
char * classify_learn_debug_str
Definition: classify.h:495
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
Definition: float2int.cpp:62
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
int classify_integer_matcher_multiplier
Definition: classify.h:505
INT_TEMPLATES ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:718
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:527
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:374
int matcher_sufficient_examples_for_prototyping
Definition: classify.h:466
NORM_PROTOS * NormProtos
Definition: classify.h:527
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:63
BIT_VECTOR AllConfigsOn
Definition: classify.h:523
void SetAdaptiveThreshold(float Threshold)
bool classify_enable_adaptive_matcher
Definition: classify.h:445
ADAPT_TEMPLATES ReadAdaptedTemplates(TFile *File)
Definition: adaptive.cpp:332
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
double matcher_good_threshold
Definition: classify.h:456
void ReadNewCutoffs(TFile *fp, uint16_t *Cutoffs)
Definition: cutoffs.cpp:41
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:90
int ShapeIDToClassID(int shape_id) const
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:459
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:519
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:853
virtual Dict & getDict()
Definition: classify.h:107
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:70
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:41
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:488
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
int classify_adapt_feature_threshold
Definition: classify.h:483
int classify_learning_debug_level
Definition: classify.h:455
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:219
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)
Definition: cluster.h:32
int16_t Ymean
Definition: intfx.h:37
int32_t Length
Definition: intfx.h:36
int FindBadFeatures(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray, int AdaptFeatureThreshold, int Debug)
Definition: intmatcher.cpp:657
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:511
int FindGoodProtos(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray, int AdaptProtoThreshold, int Debug)
Definition: intmatcher.cpp:589
float ApplyCNCorrection(float rating, int blob_length, int normalization_factor, int matcher_multiplier)
uint16_t NumProtos
Definition: intproto.h:106
uint8_t NumConfigs
Definition: intproto.h:108
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
float Params[1]
Definition: ocrfeatures.h:61
FEATURE Features[1]
Definition: ocrfeatures.h:68
uint16_t NumFeatures
Definition: ocrfeatures.h:66
float Angle
Definition: protos.h:42
float Length
Definition: protos.h:43
float Y
Definition: protos.h:41
float X
Definition: protos.h:40
virtual int UnicharClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, GenericVector< UnicharRating > *results)
virtual void DebugDisplay(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id)
GenericVector< ScoredFont > fonts
Definition: shapetable.h:87
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:55
int size() const
Definition: shapetable.h:199
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:246
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281
int MaxNumUnichars() const
Definition: shapetable.cpp:455
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319
double certainty_scale
Definition: dict.h:627
double segment_penalty_dict_case_ok
Definition: dict.h:605
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:368
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108
void EndDangerousAmbigs()
Definition: stopper.cpp:360
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:364
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
static void Update()
Definition: scrollview.cpp:709