tesseract 4.1.1
Loading...
Searching...
No Matches
imagedata.h
Go to the documentation of this file.
1
2// File: imagedata.h
3// Description: Class to hold information about a single image and its
4// corresponding boxes or text file.
5// Author: Ray Smith
6// Created: Mon Jul 22 14:17:06 PDT 2013
7//
8// (C) Copyright 2013, Google Inc.
9// Licensed under the Apache License, Version 2.0 (the "License");
10// you may not use this file except in compliance with the License.
11// You may obtain a copy of the License at
12// http://www.apache.org/licenses/LICENSE-2.0
13// Unless required by applicable law or agreed to in writing, software
14// distributed under the License is distributed on an "AS IS" BASIS,
15// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16// See the License for the specific language governing permissions and
17// limitations under the License.
19
20#ifndef TESSERACT_IMAGE_IMAGEDATA_H_
21#define TESSERACT_IMAGE_IMAGEDATA_H_
22
23#include "genericvector.h" // for GenericVector, PointerVector, FileReader
24#include "points.h" // for FCOORD
25#include "strngs.h" // for STRING
26#include "svutil.h" // for SVAutoLock, SVMutex
27
28class ScrollView;
29class TBOX;
30struct Pix;
31
32namespace tesseract {
33
34class TFile;
35
36// Amount of padding to apply in output pixels in feature mode.
37const int kFeaturePadding = 2;
38// Number of pixels to pad around text boxes.
39const int kImagePadding = 4;
40
41// Enum to determine the caching and data sequencing strategy.
43 // Reads all of one file before moving on to the next. Requires samples to be
44 // shuffled across files. Uses the count of samples in the first file as
45 // the count in all the files to achieve high-speed random access. As a
46 // consequence, if subsequent files are smaller, they get entries used more
47 // than once, and if subsequent files are larger, some entries are not used.
48 // Best for larger data sets that don't fit in memory.
50 // Reads one sample from each file in rotation. Does not require shuffled
51 // samples, but is extremely disk-intensive. Samples in smaller files also
52 // get used more often than samples in larger files.
53 // Best for smaller data sets that mostly fit in memory.
55};
56
58 public:
60 WordFeature(const FCOORD& fcoord, uint8_t dir);
61
62 // Computes the maximum x and y value in the features.
63 static void ComputeSize(const GenericVector<WordFeature>& features,
64 int* max_x, int* max_y);
65 // Draws the features in the given window.
66 static void Draw(const GenericVector<WordFeature>& features,
67 ScrollView* window);
68
69 // Accessors.
70 int x() const { return x_; }  // x_ is stored as int16_t; widened to int on return.
71 int y() const { return y_; }  // y_ is stored as uint8_t; widened to int on return.
72 int dir() const { return dir_; }  // dir_ is stored as uint8_t; widened to int on return.
73
74 // Writes to the given file. Returns false in case of error.
75 bool Serialize(FILE* fp) const;
76 // Reads from the given file. Returns false in case of error.
77 // If swap is true, assumes a big/little-endian swap is needed.
78 bool DeSerialize(bool swap, FILE* fp);
79
80 private:
81 int16_t x_;
82 uint8_t y_;
83 uint8_t dir_;
84};
85
86// A floating-point version of WordFeature, used as an intermediate during
87// scaling.
89 static void FromWordFeatures(const GenericVector<WordFeature>& word_features,
90 GenericVector<FloatWordFeature>* float_features);
91 // Sort function to sort first by x-bucket, then by y.
92 static int SortByXBucket(const void*, const void*);
93
94 float x;
95 float y;
96 float dir;
98};
99
100// Class to hold information on a single image:
101// Filename, cached image as a Pix*, character boxes, text transcription.
102// The text transcription is the ground truth UTF-8 text for the image.
103// Character boxes are optional and indicate the desired segmentation of
104// the text into recognition units.
106 public:
107 ImageData();
108 // Takes ownership of the pix.
109 ImageData(bool vertical, Pix* pix);
110 ~ImageData();
111
112 // Builds and returns an ImageData from the basic data. Note that imagedata,
113 // truth_text, and box_text are all the actual file data, NOT filenames.
114 static ImageData* Build(const char* name, int page_number, const char* lang,
115 const char* imagedata, int imagedatasize,
116 const char* truth_text, const char* box_text);
117
118 // Writes to the given file. Returns false in case of error.
119 bool Serialize(TFile* fp) const;
120 // Reads from the given file. Returns false in case of error.
121 bool DeSerialize(TFile* fp);
122 // As DeSerialize, but only seeks past the data - hence a static method.
123 static bool SkipDeSerialize(TFile* fp);
124
125 // Other accessors.
126 const STRING& imagefilename() const {  // Read-only access to the stored image filename.
127 return imagefilename_;  // Returned by reference; no copy is made here.
128 }
129 void set_imagefilename(const STRING& name) {  // Replaces the stored image filename.
130 imagefilename_ = name;  // Assigns name to the member; nothing else is updated.
131 }
132 int page_number() const {  // Page number if multi-page tif, or -1.
133 return page_number_;  // Stored as int32_t.
134 }
135 void set_page_number(int num) {  // Overwrites the stored page number.
136 page_number_ = num;  // No validation of num is done here.
137 }
139 return image_data_;
140 }
141 const STRING& language() const {  // Read-only access to the language code for the image.
142 return language_;  // Returned by reference; no copy is made here.
143 }
144 void set_language(const STRING& lang) {  // Replaces the stored language code.
145 language_ = lang;  // Assigns lang to the member; nothing else is updated.
146 }
147 const STRING& transcription() const {  // Read-only access to the UTF-8 ground truth text.
148 return transcription_;  // Returned by reference; no copy is made here.
149 }
150 const GenericVector<TBOX>& boxes() const {  // Read-only access to the boxes of the image.
151 return boxes_;  // May be empty; returned by reference.
152 }
154 return box_texts_;
155 }
156 const STRING& box_text(int index) const {  // Text of the box at position index in box_texts_.
157 return box_texts_[index];  // NOTE(review): no range check on index is visible here — confirm what GenericVector::operator[] does for an invalid index.
158 }
159 // Saves the given Pix as a PNG-encoded string and destroys it.
160 // In case of missing PNG support in Leptonica use PNM format,
161 // which requires more memory.
162 void SetPix(Pix* pix);
163 // Returns the Pix image for *this. Must be pixDestroyed after use.
164 Pix* GetPix() const;
165 // Gets anything and everything with a non-nullptr pointer, prescaled to a
166 // given target_height (if 0, then the original image height), and aligned.
167 // Also returns (if not nullptr) the width and height of the scaled image.
168 // The return value is the scaled Pix, which must be pixDestroyed after use,
169 // and scale_factor (if not nullptr) is set to the scale factor that was applied
170 // to the image to achieve the target_height.
171 Pix* PreScale(int target_height, int max_height, float* scale_factor,
172 int* scaled_width, int* scaled_height,
174
175 int MemoryUsed() const;
176
177 // Draws the data in a new window.
178 void Display() const;
179
180 // Adds the supplied boxes and transcriptions that correspond to the correct
181 // page number.
183 const GenericVector<STRING>& texts,
184 const GenericVector<int>& box_pages);
185
186 private:
187 // Saves the given Pix as a PNG-encoded string and destroys it.
188 // In case of missing PNG support in Leptonica use PNM format,
189 // which requires more memory.
190 static void SetPixInternal(Pix* pix, GenericVector<char>* image_data);
191 // Returns the Pix image for the image_data. Must be pixDestroyed after use.
192 static Pix* GetPixInternal(const GenericVector<char>& image_data);
193 // Parses the text string as a box file and adds any discovered boxes that
194 // match the page number. Returns false on error.
195 bool AddBoxes(const char* box_text);
196
197 private:
198 STRING imagefilename_; // File to read image from.
199 int32_t page_number_; // Page number if multi-page tif or -1.
200 GenericVector<char> image_data_; // PNG/PNM file data.
201 STRING language_; // Language code for image.
202 STRING transcription_; // UTF-8 ground truth of image.
203 GenericVector<TBOX> boxes_; // If non-empty boxes of the image.
204 GenericVector<STRING> box_texts_; // String for text in each box.
205 bool vertical_text_; // Image has been rotated from vertical.
206};
207
208// A collection of ImageData that knows roughly how much memory it is using.
210 friend void* ReCachePagesFunc(void* data);
211
212 public:
213 explicit DocumentData(const STRING& name);
215
216 // Reads all the pages in the given lstmf filename to the cache. The reader
217 // is used to read the file.
218 bool LoadDocument(const char* filename, int start_page, int64_t max_memory,
219 FileReader reader);
220 // Sets up the document, without actually loading it.
221 void SetDocument(const char* filename, int64_t max_memory, FileReader reader);
222 // Writes all the pages to the given filename. Returns false on error.
223 bool SaveDocument(const char* filename, FileWriter writer);
224 bool SaveToBuffer(GenericVector<char>* buffer);
225
226 // Adds the given page data to this document, counting up memory.
227 void AddPageToDocument(ImageData* page);
228
229 const STRING& document_name() const {  // Returns the document's name by reference.
230 SVAutoLock lock(&general_mutex_);  // Presumably a scoped lock released when this function returns — confirm in svutil.h.
231 return document_name_;  // NOTE(review): the caller reads the returned reference after the lock is gone — confirm document_name_ is not modified concurrently.
232 }
233 int NumPages() const {  // Total number of pages in the document; may exceed the number currently held in pages_.
234 SVAutoLock lock(&general_mutex_);  // Uses general_mutex_, not pages_mutex_, so callers do not wait for a load operation.
235 return total_pages_;  // Returned by value while the lock is held.
236 }
237 size_t PagesSize() const {  // Number of entries currently in pages_, as opposed to NumPages().
238 return pages_.size();  // NOTE(review): pages_ is read here without taking pages_mutex_, which is documented as protecting pages_ — confirm this is only called when no load is in progress.
239 }
240 int64_t memory_used() const {  // Current value of the document's memory-use counter.
241 SVAutoLock lock(&general_mutex_);  // Uses general_mutex_, not pages_mutex_, so callers do not wait for a load operation.
242 return memory_used_;  // Returned by value while the lock is held.
243 }
244 // If the given index is not currently loaded, loads it using a separate
245 // thread. Note: there are 4 cases:
246 // Document uncached: IsCached() returns false, total_pages_ < 0.
247 // Required page is available: IsPageAvailable returns true. In this case,
248 // total_pages_ > 0 and
249 // pages_offset_ <= index%total_pages_ < pages_offset_+pages_.size()
250 // Pages are loaded, but the required one is not.
251 // The requested page is being loaded by LoadPageInBackground. In this case,
252 // index == pages_offset_. Once the loading starts, the pages lock is held
253 // until it completes, at which point IsPageAvailable will unblock and return
254 // true.
255 void LoadPageInBackground(int index);
256 // Returns a pointer to the page with the given index, modulo the total
257 // number of pages. Blocks until the background load is completed.
258 const ImageData* GetPage(int index);
259 // Returns true if the requested page is available, and provides a pointer,
260 // which may be nullptr if the document is empty. May block, even though it
261 // doesn't guarantee to return true.
262 bool IsPageAvailable(int index, ImageData** page);
263 // Takes ownership of the given page index. The page is made nullptr in *this.
264 ImageData* TakePage(int index) {  // Hands the page at position index to the caller, who becomes its owner.
265 SVAutoLock lock(&pages_mutex_);  // Guards pages_ during the swap (presumably a scoped lock — confirm in svutil.h).
266 ImageData* page = pages_[index];  // NOTE(review): index is used directly as a position in pages_; no range check, modulo or pages_offset_ adjustment is visible here — confirm callers pass a valid position.
267 pages_[index] = nullptr;  // Leave an empty slot so pages_ no longer refers to the page.
268 return page;  // May be nullptr if the slot was already empty.
269 }
270 // Returns true if the document is currently loaded or in the process of
271 // loading.
272 bool IsCached() const { return NumPages() >= 0; }  // A negative total_pages_ marks an uncached document; NumPages() takes general_mutex_.
273 // Removes all pages from memory and frees the memory, but does not forget
274 // the document metadata. Returns the memory saved.
275 int64_t UnCache();
276 // Shuffles all the pages in the document.
277 void Shuffle();
278
279 private:
280 // Sets the value of total_pages_ behind a mutex.
281 void set_total_pages(int total) {  // Writer counterpart of NumPages().
282 SVAutoLock lock(&general_mutex_);  // Same mutex that NumPages() takes for its read.
283 total_pages_ = total;  // No validation of total is done here.
284 }
285 void set_memory_used(int64_t memory_used) {  // Writer counterpart of memory_used(); overwrites the counter.
286 SVAutoLock lock(&general_mutex_);  // Same mutex that memory_used() takes for its read.
287 memory_used_ = memory_used;  // The parameter shares its name with the accessor; the member has the trailing underscore.
288 }
289 // Locks the pages_mutex_ and loads as many pages as can fit in max_memory_
290 // starting at index pages_offset_.
291 bool ReCachePages();
292
293 private:
294 // A name for this document.
295 STRING document_name_;
296 // A group of pages that corresponds in some loose way to a document.
297 PointerVector<ImageData> pages_;
298 // Page number of the first index in pages_.
299 int pages_offset_;
300 // Total number of pages in document (may exceed size of pages_.)
301 int total_pages_;
302 // Total of all pix sizes in the document.
303 int64_t memory_used_;
304 // Max memory to use at any time.
305 int64_t max_memory_;
306 // Saved reader from LoadDocument to allow re-caching.
307 FileReader reader_;
308 // Mutex that protects pages_ and pages_offset_ against multiple parallel
309 // loads, and provides a wait for page.
310 SVMutex pages_mutex_;
311 // Mutex that protects other data members that callers want to access without
312 // waiting for a load operation.
313 mutable SVMutex general_mutex_;
314};
315
316// A collection of DocumentData that knows roughly how much memory it is using.
317// Note that while it supports background read-ahead, it assumes that a single
318// thread is accessing documents, ie it is not safe for multiple threads to
319// access different documents in parallel, as one may de-cache the other's
320// content.
322 public:
323 explicit DocumentCache(int64_t max_memory);
325
326 // Deletes all existing documents from the cache.
327 void Clear() {  // Empties the cache and resets the per-document page count.
328 documents_.clear();  // NOTE(review): presumably PointerVector::clear() also deletes the DocumentData it holds — confirm in genericvector.h.
329 num_pages_per_doc_ = 0;  // This value is the divisor in GetPageSequential — TODO confirm it is set again before that is called.
330 }
331 // Adds all the documents in the list of filenames, counting memory.
332 // The reader is used to read the files.
333 bool LoadDocuments(const GenericVector<STRING>& filenames,
334 CachingStrategy cache_strategy, FileReader reader);
335
336 // Adds document to the cache.
337 bool AddToCache(DocumentData* data);
338
339 // Finds and returns a document by name.
340 DocumentData* FindDocument(const STRING& document_name) const;
341
342 // Returns a page by serial number using the current cache_strategy_ to
343 // determine the mapping from serial number to page.
344 const ImageData* GetPageBySerial(int serial) {  // Dispatches on cache_strategy_ to one of the two private lookups.
345 if (cache_strategy_ == CS_SEQUENTIAL)  // Reads all of one file before moving on to the next.
346 return GetPageSequential(serial);
347 else  // Any other strategy value; CS_ROUND_ROBIN is the only other enumerator listed.
348 return GetPageRoundRobin(serial);
349 }
350
352 return documents_;
353 }
354 // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
355 // strategy, could take a long time.
356 int TotalPages();
357
358 private:
359 // Returns a page by serial number, selecting them in a round-robin fashion
360 // from all the documents. Highly disk-intensive, but doesn't need samples
361 // to be shuffled between files to begin with.
362 const ImageData* GetPageRoundRobin(int serial);
363 // Returns a page by serial number, selecting them in sequence from each file.
364 // Requires the samples to be shuffled between the files to give a random or
365 // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
366 const ImageData* GetPageSequential(int serial);
367
368 // Helper counts the number of adjacent cached neighbour documents_ of index
369 // looking in direction dir, ie index+dir, index+2*dir etc.
370 int CountNeighbourDocs(int index, int dir);
371
372 // The documents held in this cache (documents_).
374 // Strategy to use for caching and serializing data samples.
375 CachingStrategy cache_strategy_;
376 // Number of pages in the first document, used as a divisor in
377 // GetPageSequential to determine the document index.
378 int num_pages_per_doc_;
379 // Max memory allowed in this cache.
380 int64_t max_memory_;
381};
382
383} // namespace tesseract
384
385
386#endif // TESSERACT_IMAGE_IMAGEDATA_H_
bool(*)(const STRING &, GenericVector< char > *) FileReader
Definition: serialis.h:49
const int kImagePadding
Definition: imagedata.h:39
const int kFeaturePadding
Definition: imagedata.h:37
bool(*)(const GenericVector< char > &, const STRING &) FileWriter
Definition: serialis.h:52
CachingStrategy
Definition: imagedata.h:42
@ CS_SEQUENTIAL
Definition: imagedata.h:49
@ CS_ROUND_ROBIN
Definition: imagedata.h:54
bool DeSerialize(bool swap, FILE *fp)
Definition: imagedata.cpp:93
static void ComputeSize(const GenericVector< WordFeature > &features, int *max_x, int *max_y)
Definition: imagedata.cpp:58
bool Serialize(FILE *fp) const
Definition: imagedata.cpp:86
static void Draw(const GenericVector< WordFeature > &features, ScrollView *window)
Definition: imagedata.cpp:69
static void FromWordFeatures(const GenericVector< WordFeature > &word_features, GenericVector< FloatWordFeature > *float_features)
Definition: imagedata.cpp:100
static int SortByXBucket(const void *, const void *)
Definition: imagedata.cpp:115
int page_number() const
Definition: imagedata.h:132
void SetPix(Pix *pix)
Definition: imagedata.cpp:213
static bool SkipDeSerialize(TFile *fp)
Definition: imagedata.cpp:197
void set_page_number(int num)
Definition: imagedata.h:135
void AddBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, const GenericVector< int > &box_pages)
Definition: imagedata.cpp:315
void set_language(const STRING &lang)
Definition: imagedata.h:144
const GenericVector< char > & image_data() const
Definition: imagedata.h:138
const GenericVector< STRING > & box_texts() const
Definition: imagedata.h:153
Pix * PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width, int *scaled_height, GenericVector< TBOX > *boxes) const
Definition: imagedata.cpp:228
const STRING & box_text(int index) const
Definition: imagedata.h:156
void Display() const
Definition: imagedata.cpp:277
const STRING & imagefilename() const
Definition: imagedata.h:126
Pix * GetPix() const
Definition: imagedata.cpp:218
const STRING & transcription() const
Definition: imagedata.h:147
bool Serialize(TFile *fp) const
Definition: imagedata.cpp:166
const GenericVector< TBOX > & boxes() const
Definition: imagedata.h:150
bool DeSerialize(TFile *fp)
Definition: imagedata.cpp:181
void set_imagefilename(const STRING &name)
Definition: imagedata.h:129
const STRING & language() const
Definition: imagedata.h:141
static ImageData * Build(const char *name, int page_number, const char *lang, const char *imagedata, int imagedatasize, const char *truth_text, const char *box_text)
Definition: imagedata.cpp:135
int MemoryUsed() const
Definition: imagedata.cpp:272
ImageData * TakePage(int index)
Definition: imagedata.h:264
bool IsPageAvailable(int index, ImageData **page)
Definition: imagedata.cpp:477
bool SaveToBuffer(GenericVector< char > *buffer)
Definition: imagedata.cpp:427
void SetDocument(const char *filename, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:406
int64_t memory_used() const
Definition: imagedata.h:240
bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:417
void LoadPageInBackground(int index)
Definition: imagedata.cpp:443
int NumPages() const
Definition: imagedata.h:233
bool IsCached() const
Definition: imagedata.h:272
const STRING & document_name() const
Definition: imagedata.h:229
size_t PagesSize() const
Definition: imagedata.h:237
friend void * ReCachePagesFunc(void *data)
Definition: imagedata.cpp:377
bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:398
const ImageData * GetPage(int index)
Definition: imagedata.cpp:455
void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:435
bool AddToCache(DocumentData *data)
Definition: imagedata.cpp:605
const ImageData * GetPageBySerial(int serial)
Definition: imagedata.h:344
bool LoadDocuments(const GenericVector< STRING > &filenames, CachingStrategy cache_strategy, FileReader reader)
Definition: imagedata.cpp:580
DocumentData * FindDocument(const STRING &document_name) const
Definition: imagedata.cpp:611
const PointerVector< DocumentData > & documents() const
Definition: imagedata.h:351
Definition: points.h:189
Definition: rect.h:34
Definition: strngs.h:45
Definition: svutil.h:68