tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir, bool textonly=false)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
bool happy ()
 
int imagenum () const
 

Protected Member Functions

bool BeginDocumentHandler () override
 
bool AddImageHandler (TessBaseAPI *api) override
 
bool EndDocumentHandler () override
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
virtual bool BeginDocumentHandler ()
 
virtual bool AddImageHandler (TessBaseAPI *api)=0
 
virtual bool EndDocumentHandler ()
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders Tesseract output into a searchable PDF.

Definition at line 214 of file renderer.h.

Constructor & Destructor Documentation

◆ TessPDFRenderer()

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir,
bool  textonly = false 
)

Definition at line 179 of file pdfrenderer.cpp.

181 : TessResultRenderer(outputbase, "pdf"),
182 datadir_(datadir) {
183 obj_ = 0;
184 textonly_ = textonly;
185 offsets_.push_back(0);
186}
struct TessResultRenderer TessResultRenderer
Definition: capi.h:87
int push_back(T object)

Member Function Documentation

◆ AddImageHandler()

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api)
override protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 797 of file pdfrenderer.cpp.

797 {
798 Pix *pix = api->GetInputImage();
799 const char* filename = api->GetInputName();
800 int ppi = api->GetSourceYResolution();
801 if (!pix || ppi <= 0)
802 return false;
803 double width = pixGetWidth(pix) * 72.0 / ppi;
804 double height = pixGetHeight(pix) * 72.0 / ppi;
805
806 std::stringstream xobject;
807 // Use "C" locale (needed for int values larger than 999).
808 xobject.imbue(std::locale::classic());
809 if (!textonly_) {
810 xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n";
811 }
812
813 // PAGE
814 std::stringstream stream;
815 // Use "C" locale (needed for double values width and height).
816 stream.imbue(std::locale::classic());
817 stream.precision(2);
818 stream << std::fixed <<
819 obj_ << " 0 obj\n"
820 "<<\n"
821 " /Type /Page\n"
822 " /Parent 2 0 R\n" // Pages object
823 " /MediaBox [0 0 " << width << " " << height << "]\n"
824 " /Contents " << (obj_ + 1) << " 0 R\n" // Contents object
825 " /Resources\n"
826 " <<\n"
827 " " << xobject.str() << // Image object
828 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
829 " /Font << /f-0-0 3 0 R >>\n" // Type0 Font
830 " >>\n"
831 ">>\n"
832 "endobj\n";
833 pages_.push_back(obj_);
834 AppendPDFObject(stream.str().c_str());
835
836 // CONTENTS
837 const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
838 const size_t pdftext_len = strlen(pdftext.get());
839 size_t len;
840 unsigned char *comp_pdftext = zlibCompress(
841 reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
842 long comp_pdftext_len = len;
843 stream.str("");
844 stream <<
845 obj_ << " 0 obj\n"
846 "<<\n"
847 " /Length " << comp_pdftext_len << " /Filter /FlateDecode\n"
848 ">>\n"
849 "stream\n";
850 AppendString(stream.str().c_str());
851 long objsize = stream.str().size();
852 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
853 objsize += comp_pdftext_len;
854 lept_free(comp_pdftext);
855 const char *b2 =
856 "endstream\n"
857 "endobj\n";
858 AppendString(b2);
859 objsize += strlen(b2);
860 AppendPDFObjectDIY(objsize);
861
862 if (!textonly_) {
863 char *pdf_object = nullptr;
864 int jpg_quality;
865 api->GetIntVariable("jpg_quality", &jpg_quality);
866 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize,
867 jpg_quality)) {
868 return false;
869 }
870 AppendData(pdf_object, objsize);
871 AppendPDFObjectDIY(objsize);
872 delete[] pdf_object;
873 }
874 return true;
875}
void AppendString(const char *s)
Definition: renderer.cpp:102
void AppendData(const char *s, int len)
Definition: renderer.cpp:106

◆ BeginDocumentHandler()

bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 494 of file pdfrenderer.cpp.

494 {
495 AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
496
497 // CATALOG
498 AppendPDFObject("1 0 obj\n"
499 "<<\n"
500 " /Type /Catalog\n"
501 " /Pages 2 0 R\n"
502 ">>\nendobj\n");
503
504 // We are reserving object #2 for the /Pages
505 // object, which I am going to create and write
506 // at the end of the PDF file.
507 AppendPDFObject("");
508
509 // TYPE0 FONT
510 AppendPDFObject("3 0 obj\n"
511 "<<\n"
512 " /BaseFont /GlyphLessFont\n"
513 " /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
514 " /Encoding /Identity-H\n"
515 " /Subtype /Type0\n"
516 " /ToUnicode 6 0 R\n" // ToUnicode
517 " /Type /Font\n"
518 ">>\n"
519 "endobj\n");
520
521 // CIDFONTTYPE2
522 std::stringstream stream;
523 // Use "C" locale (needed for int values larger than 999).
524 stream.imbue(std::locale::classic());
525 stream <<
526 "4 0 obj\n"
527 "<<\n"
528 " /BaseFont /GlyphLessFont\n"
529 " /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
530 " /CIDSystemInfo\n"
531 " <<\n"
532 " /Ordering (Identity)\n"
533 " /Registry (Adobe)\n"
534 " /Supplement 0\n"
535 " >>\n"
536 " /FontDescriptor 7 0 R\n" // Font descriptor
537 " /Subtype /CIDFontType2\n"
538 " /Type /Font\n"
539 " /DW " << (1000 / kCharWidth) << "\n"
540 ">>\n"
541 "endobj\n";
542 AppendPDFObject(stream.str().c_str());
543
544 // CIDTOGIDMAP
545 const int kCIDToGIDMapSize = 2 * (1 << 16);
546 const std::unique_ptr<unsigned char[]> cidtogidmap(
547 new unsigned char[kCIDToGIDMapSize]);
548 for (int i = 0; i < kCIDToGIDMapSize; i++) {
549 cidtogidmap[i] = (i % 2) ? 1 : 0;
550 }
551 size_t len;
552 unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
553 stream.str("");
554 stream <<
555 "5 0 obj\n"
556 "<<\n"
557 " /Length " << len << " /Filter /FlateDecode\n"
558 ">>\n"
559 "stream\n";
560 AppendString(stream.str().c_str());
561 long objsize = stream.str().size();
562 AppendData(reinterpret_cast<char *>(comp), len);
563 objsize += len;
564 lept_free(comp);
565 const char *endstream_endobj =
566 "endstream\n"
567 "endobj\n";
568 AppendString(endstream_endobj);
569 objsize += strlen(endstream_endobj);
570 AppendPDFObjectDIY(objsize);
571
572 const char stream2[] =
573 "/CIDInit /ProcSet findresource begin\n"
574 "12 dict begin\n"
575 "begincmap\n"
576 "/CIDSystemInfo\n"
577 "<<\n"
578 " /Registry (Adobe)\n"
579 " /Ordering (UCS)\n"
580 " /Supplement 0\n"
581 ">> def\n"
582 "/CMapName /Adobe-Identify-UCS def\n"
583 "/CMapType 2 def\n"
584 "1 begincodespacerange\n"
585 "<0000> <FFFF>\n"
586 "endcodespacerange\n"
587 "1 beginbfrange\n"
588 "<0000> <FFFF> <0000>\n"
589 "endbfrange\n"
590 "endcmap\n"
591 "CMapName currentdict /CMap defineresource pop\n"
592 "end\n"
593 "end\n";
594
595 // TOUNICODE
596 stream.str("");
597 stream <<
598 "6 0 obj\n"
599 "<< /Length " << (sizeof(stream2) - 1) << " >>\n"
600 "stream\n" << stream2 <<
601 "endstream\n"
602 "endobj\n";
603 AppendPDFObject(stream.str().c_str());
604
605 // FONT DESCRIPTOR
606 stream.str("");
607 stream <<
608 "7 0 obj\n"
609 "<<\n"
610 " /Ascent 1000\n"
611 " /CapHeight 1000\n"
612 " /Descent -1\n" // Spec says must be negative
613 " /Flags 5\n" // FixedPitch + Symbolic
614 " /FontBBox [ 0 0 " << (1000 / kCharWidth) << " 1000 ]\n"
615 " /FontFile2 8 0 R\n"
616 " /FontName /GlyphLessFont\n"
617 " /ItalicAngle 0\n"
618 " /StemV 80\n"
619 " /Type /FontDescriptor\n"
620 ">>\n"
621 "endobj\n";
622 AppendPDFObject(stream.str().c_str());
623
624 stream.str("");
625 stream << datadir_.c_str() << "/pdf.ttf";
626 FILE *fp = fopen(stream.str().c_str(), "rb");
627 if (!fp) {
628 tprintf("Cannot open file \"%s\"!\n", stream.str().c_str());
629 return false;
630 }
631 fseek(fp, 0, SEEK_END);
632 auto size = std::ftell(fp);
633 if (size < 0) {
634 fclose(fp);
635 return false;
636 }
637 fseek(fp, 0, SEEK_SET);
638 const std::unique_ptr<char[]> buffer(new char[size]);
639 if (!tesseract::DeSerialize(fp, buffer.get(), size)) {
640 fclose(fp);
641 return false;
642 }
643 fclose(fp);
644 // FONTFILE2
645 stream.str("");
646 stream <<
647 "8 0 obj\n"
648 "<<\n"
649 " /Length " << size << "\n"
650 " /Length1 " << size << "\n"
651 ">>\n"
652 "stream\n";
653 AppendString(stream.str().c_str());
654 objsize = stream.str().size();
655 AppendData(buffer.get(), size);
656 objsize += size;
657 AppendString(endstream_endobj);
658 objsize += strlen(endstream_endobj);
659 AppendPDFObjectDIY(objsize);
660 return true;
661}
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
bool DeSerialize(FILE *fp, char *data, size_t n)
Definition: serialis.cpp:28

◆ EndDocumentHandler()

bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 878 of file pdfrenderer.cpp.

878 {
879 // We reserved the /Pages object number early, so that the /Page
880 // objects could refer to their parent. We finally have enough
881 // information to go fill it in. Using lower level calls to manipulate
882 // the offset record in two spots, because we are placing objects
883 // out of order in the file.
884
885 // PAGES
886 const long int kPagesObjectNumber = 2;
887 offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
888 std::stringstream stream;
889 // Use "C" locale (needed for int values larger than 999).
890 stream.imbue(std::locale::classic());
891 stream << kPagesObjectNumber << " 0 obj\n<<\n /Type /Pages\n /Kids [ ";
892 AppendString(stream.str().c_str());
893 size_t pages_objsize = stream.str().size();
894 for (size_t i = 0; i < pages_.unsigned_size(); i++) {
895 stream.str("");
896 stream << pages_[i] << " 0 R ";
897 AppendString(stream.str().c_str());
898 pages_objsize += stream.str().size();
899 }
900 stream.str("");
901 stream << "]\n /Count " << pages_.size() << "\n>>\nendobj\n";
902 AppendString(stream.str().c_str());
903 pages_objsize += stream.str().size();
904 offsets_.back() += pages_objsize; // manipulation #2
905
906 // INFO
907 STRING utf16_title = "FEFF"; // byte_order_marker
908 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
909 char utf16[kMaxBytesPerCodepoint];
910 for (char32 code : unicodes) {
911 if (CodepointToUtf16be(code, utf16)) {
912 utf16_title += utf16;
913 }
914 }
915
916 char* datestr = l_getFormattedDate();
917 stream.str("");
918 stream
919 << obj_ << " 0 obj\n"
920 "<<\n"
921 " /Producer (Tesseract " << tesseract::TessBaseAPI::Version() << ")\n"
922 " /CreationDate (D:" << datestr << ")\n"
923 " /Title <" << utf16_title.c_str() << ">\n"
924 ">>\n"
925 "endobj\n";
926 lept_free(datestr);
927 AppendPDFObject(stream.str().c_str());
928 stream.str("");
929 stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n";
930 AppendString(stream.str().c_str());
931 for (int i = 1; i < obj_; i++) {
932 stream.str("");
933 stream.width(10);
934 stream.fill('0');
935 stream << offsets_[i] << " 00000 n \n";
936 AppendString(stream.str().c_str());
937 }
938 stream.str("");
939 stream
940 << "trailer\n<<\n /Size " << obj_ << "\n"
941 " /Root 1 0 R\n" // catalog
942 " /Info " << (obj_ - 1) << " 0 R\n" // info
943 ">>\nstartxref\n" << offsets_.back() << "\n%%EOF\n";
944 AppendString(stream.str().c_str());
945 return true;
946}
signed int char32
size_t unsigned_size() const
Definition: genericvector.h:76
T & back() const
static const char * Version()
Definition: baseapi.cpp:233
const char * title() const
Definition: renderer.h:88
Definition: strngs.h:45
const char * c_str() const
Definition: strngs.cpp:205
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:215

The documentation for this class was generated from the following files: