27#if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || \
28 defined(__CYGWIN__) || defined(__MINGW32__)
31#elif defined(__linux__)
34#elif defined(__APPLE__)
43#include <mach/mach_time.h>
54GPUEnv OpenclDevice::gpuEnv;
56bool OpenclDevice::deviceIsSelected =
false;
57ds_device OpenclDevice::selectedDevice;
59int OpenclDevice::isInited = 0;
61static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
63static const l_uint32 lmask32[] = {
64 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
65 0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
66 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
67 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
68 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
69 0xfffffffe, 0xffffffff};
71static const l_uint32 rmask32[] = {
72 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
73 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
74 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
75 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
76 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
77 0x7fffffff, 0xffffffff};
79static cl_mem pixsCLBuffer, pixdCLBuffer,
81static cl_mem pixThBuffer;
82static cl_int clStatus;
85#define DS_TAG_VERSION "<version>"
86#define DS_TAG_VERSION_END "</version>"
87#define DS_TAG_DEVICE "<device>"
88#define DS_TAG_DEVICE_END "</device>"
89#define DS_TAG_SCORE "<score>"
90#define DS_TAG_SCORE_END "</score>"
91#define DS_TAG_DEVICE_TYPE "<type>"
92#define DS_TAG_DEVICE_TYPE_END "</type>"
93#define DS_TAG_DEVICE_NAME "<name>"
94#define DS_TAG_DEVICE_NAME_END "</name>"
95#define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
96#define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
98#define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
100#define DS_DEVICE_NAME_LENGTH 256
102enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
105 std::vector<ds_device> devices;
106 unsigned int numDevices;
112 DS_INVALID_PROFILE = 1000,
114 DS_INVALID_PERF_EVALUATOR_TYPE,
115 DS_INVALID_PERF_EVALUATOR,
116 DS_PERF_EVALUATOR_ERROR,
118 DS_UNKNOWN_DEVICE_TYPE,
119 DS_PROFILE_FILE_ERROR,
120 DS_SCORE_SERIALIZER_ERROR,
121 DS_SCORE_DESERIALIZER_ERROR
128typedef ds_status (*ds_perf_evaluator)(ds_device* device,
void* data);
131typedef ds_status (*ds_score_release)(TessDeviceScore* score);
133static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
134 ds_status status = DS_SUCCESS;
135 if (profile !=
nullptr) {
138 for (i = 0; i < profile->numDevices; i++) {
139 free(profile->devices[i].oclDeviceName);
140 free(profile->devices[i].oclDriverVersion);
141 status = sr(profile->devices[i].score);
142 if (status != DS_SUCCESS)
break;
150static ds_status initDSProfile(ds_profile** p,
const char* version) {
152 cl_uint numPlatforms;
153 std::vector<cl_platform_id> platforms;
154 std::vector <cl_device_id> devices;
155 ds_status status = DS_SUCCESS;
159 if (p ==
nullptr)
return DS_INVALID_PROFILE;
161 ds_profile* profile =
new ds_profile;
163 memset(profile, 0,
sizeof(ds_profile));
165 clGetPlatformIDs(0,
nullptr, &numPlatforms);
167 if (numPlatforms > 0) {
168 platforms.reserve(numPlatforms);
169 clGetPlatformIDs(numPlatforms, &platforms[0],
nullptr);
173 for (i = 0; i < numPlatforms; i++) {
175 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0,
nullptr, &num);
179 if (numDevices > 0) {
180 devices.reserve(numDevices);
183 profile->numDevices =
185 profile->devices.reserve(profile->numDevices);
186 memset(&profile->devices[0], 0, profile->numDevices *
sizeof(ds_device));
189 for (i = 0; i < numPlatforms; i++) {
192 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
193 for (j = 0; j < num; j++, next++) {
194 char buffer[DS_DEVICE_NAME_LENGTH];
197 profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
198 profile->devices[next].oclDeviceID = devices[j];
200 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME,
201 DS_DEVICE_NAME_LENGTH, &buffer,
nullptr);
202 length = strlen(buffer);
203 profile->devices[next].oclDeviceName = (
char*)malloc(length + 1);
204 memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
206 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION,
207 DS_DEVICE_NAME_LENGTH, &buffer,
nullptr);
208 length = strlen(buffer);
209 profile->devices[next].oclDriverVersion = (
char*)malloc(length + 1);
210 memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
213 profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
214 profile->version = version;
220static ds_status profileDevices(ds_profile* profile,
221 const ds_evaluation_type type,
222 ds_perf_evaluator evaluator,
223 void* evaluatorData,
unsigned int* numUpdates) {
224 ds_status status = DS_SUCCESS;
226 unsigned int updates = 0;
228 if (profile ==
nullptr) {
229 return DS_INVALID_PROFILE;
231 if (evaluator ==
nullptr) {
232 return DS_INVALID_PERF_EVALUATOR;
235 for (i = 0; i < profile->numDevices; i++) {
236 ds_status evaluatorStatus;
239 case DS_EVALUATE_NEW_ONLY:
240 if (profile->devices[i].score !=
nullptr)
break;
242 case DS_EVALUATE_ALL:
243 evaluatorStatus = evaluator(&profile->devices[i], evaluatorData);
244 if (evaluatorStatus != DS_SUCCESS) {
245 status = evaluatorStatus;
251 return DS_INVALID_PERF_EVALUATOR_TYPE;
255 if (numUpdates) *numUpdates = updates;
259static const char* findString(
const char* contentStart,
const char* contentEnd,
260 const char*
string) {
262 const char* currentPosition;
263 const char* found =
nullptr;
264 stringLength = strlen(
string);
265 currentPosition = contentStart;
266 for (currentPosition = contentStart; currentPosition < contentEnd;
268 if (*currentPosition ==
string[0]) {
269 if (currentPosition + stringLength < contentEnd) {
270 if (strncmp(currentPosition,
string, stringLength) == 0) {
271 found = currentPosition;
280static ds_status readProFile(
const char* fileName,
char** content,
281 size_t* contentSize) {
284 ds_status status = DS_SUCCESS;
285 FILE* input = fopen(fileName,
"rb");
286 if (input ==
nullptr) {
287 status = DS_FILE_ERROR;
289 fseek(input, 0L, SEEK_END);
290 auto pos = std::ftell(input);
294 char *binary =
new char[size];
295 if (fread(binary,
sizeof(
char), size, input) != size) {
296 status = DS_FILE_ERROR;
308typedef ds_status (*ds_score_deserializer)(ds_device* device,
309 const uint8_t* serializedScore,
310 unsigned int serializedScoreSize);
312static ds_status readProfileFromFile(ds_profile* profile,
313 ds_score_deserializer deserializer,
315 ds_status status = DS_SUCCESS;
319 if (profile ==
nullptr)
return DS_INVALID_PROFILE;
321 status = readProFile(file, &contentStart, &contentSize);
322 if (status == DS_SUCCESS) {
323 const char* currentPosition;
324 const char* dataStart;
327 const char* contentEnd = contentStart + contentSize;
328 currentPosition = contentStart;
331 dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
332 if (dataStart ==
nullptr) {
333 status = DS_PROFILE_FILE_ERROR;
336 dataStart += strlen(DS_TAG_VERSION);
338 dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
339 if (dataEnd ==
nullptr) {
340 status = DS_PROFILE_FILE_ERROR;
344 size_t versionStringLength = strlen(profile->version);
345 if (versionStringLength + dataStart != dataEnd ||
346 strncmp(profile->version, dataStart, versionStringLength) != 0) {
348 status = DS_PROFILE_FILE_ERROR;
351 currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
357 const char* deviceTypeStart;
358 const char* deviceTypeEnd;
359 ds_device_type deviceType;
361 const char* deviceNameStart;
362 const char* deviceNameEnd;
364 const char* deviceScoreStart;
365 const char* deviceScoreEnd;
367 const char* deviceDriverStart;
368 const char* deviceDriverEnd;
370 dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
371 if (dataStart ==
nullptr) {
375 dataStart += strlen(DS_TAG_DEVICE);
376 dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
377 if (dataEnd ==
nullptr) {
378 status = DS_PROFILE_FILE_ERROR;
383 deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
384 if (deviceTypeStart ==
nullptr) {
385 status = DS_PROFILE_FILE_ERROR;
388 deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
390 findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
391 if (deviceTypeEnd ==
nullptr) {
392 status = DS_PROFILE_FILE_ERROR;
395 memcpy(&deviceType, deviceTypeStart,
sizeof(ds_device_type));
398 if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
399 deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
400 if (deviceNameStart ==
nullptr) {
401 status = DS_PROFILE_FILE_ERROR;
404 deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
406 findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
407 if (deviceNameEnd ==
nullptr) {
408 status = DS_PROFILE_FILE_ERROR;
413 findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
414 if (deviceDriverStart ==
nullptr) {
415 status = DS_PROFILE_FILE_ERROR;
418 deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
419 deviceDriverEnd = findString(deviceDriverStart, contentEnd,
420 DS_TAG_DEVICE_DRIVER_VERSION_END);
421 if (deviceDriverEnd ==
nullptr) {
422 status = DS_PROFILE_FILE_ERROR;
427 for (i = 0; i < profile->numDevices; i++) {
428 if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
429 size_t actualDeviceNameLength;
430 size_t driverVersionLength;
432 actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
433 driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
434 if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
435 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
436 strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
437 actualDeviceNameLength) == 0 &&
438 strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
439 driverVersionLength) == 0) {
441 findString(dataStart, contentEnd, DS_TAG_SCORE);
442 deviceScoreStart += strlen(DS_TAG_SCORE);
444 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
445 status = deserializer(&profile->devices[i],
446 (
const unsigned char*)deviceScoreStart,
447 deviceScoreEnd - deviceScoreStart);
448 if (status != DS_SUCCESS) {
454 }
else if (deviceType == DS_DEVICE_NATIVE_CPU) {
455 for (i = 0; i < profile->numDevices; i++) {
456 if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
457 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
458 if (deviceScoreStart ==
nullptr) {
459 status = DS_PROFILE_FILE_ERROR;
462 deviceScoreStart += strlen(DS_TAG_SCORE);
464 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
465 status = deserializer(&profile->devices[i],
466 (
const unsigned char*)deviceScoreStart,
467 deviceScoreEnd - deviceScoreStart);
468 if (status != DS_SUCCESS) {
476 currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
480 delete[] contentStart;
484typedef ds_status (*ds_score_serializer)(ds_device* device,
485 uint8_t** serializedScore,
486 unsigned int* serializedScoreSize);
487static ds_status writeProfileToFile(ds_profile* profile,
488 ds_score_serializer serializer,
490 ds_status status = DS_SUCCESS;
492 if (profile ==
nullptr)
return DS_INVALID_PROFILE;
494 FILE* profileFile = fopen(file,
"wb");
495 if (profileFile ==
nullptr) {
496 status = DS_FILE_ERROR;
501 fwrite(DS_TAG_VERSION,
sizeof(
char), strlen(DS_TAG_VERSION), profileFile);
502 fwrite(profile->version,
sizeof(
char), strlen(profile->version),
504 fwrite(DS_TAG_VERSION_END,
sizeof(
char), strlen(DS_TAG_VERSION_END),
506 fwrite(
"\n",
sizeof(
char), 1, profileFile);
508 for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
509 uint8_t* serializedScore;
510 unsigned int serializedScoreSize;
512 fwrite(DS_TAG_DEVICE,
sizeof(
char), strlen(DS_TAG_DEVICE), profileFile);
514 fwrite(DS_TAG_DEVICE_TYPE,
sizeof(
char), strlen(DS_TAG_DEVICE_TYPE),
516 fwrite(&profile->devices[i].type,
sizeof(ds_device_type), 1, profileFile);
517 fwrite(DS_TAG_DEVICE_TYPE_END,
sizeof(
char),
518 strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
520 switch (profile->devices[i].type) {
521 case DS_DEVICE_NATIVE_CPU: {
532 case DS_DEVICE_OPENCL_DEVICE: {
533 fwrite(DS_TAG_DEVICE_NAME,
sizeof(
char), strlen(DS_TAG_DEVICE_NAME),
535 fwrite(profile->devices[i].oclDeviceName,
sizeof(
char),
536 strlen(profile->devices[i].oclDeviceName), profileFile);
537 fwrite(DS_TAG_DEVICE_NAME_END,
sizeof(
char),
538 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
540 fwrite(DS_TAG_DEVICE_DRIVER_VERSION,
sizeof(
char),
541 strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
542 fwrite(profile->devices[i].oclDriverVersion,
sizeof(
char),
543 strlen(profile->devices[i].oclDriverVersion), profileFile);
544 fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END,
sizeof(
char),
545 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
548 status = DS_UNKNOWN_DEVICE_TYPE;
552 fwrite(DS_TAG_SCORE,
sizeof(
char), strlen(DS_TAG_SCORE), profileFile);
553 status = serializer(&profile->devices[i], &serializedScore,
554 &serializedScoreSize);
555 if (status == DS_SUCCESS && serializedScore !=
nullptr &&
556 serializedScoreSize > 0) {
557 fwrite(serializedScore,
sizeof(
char), serializedScoreSize, profileFile);
558 delete[] serializedScore;
560 fwrite(DS_TAG_SCORE_END,
sizeof(
char), strlen(DS_TAG_SCORE_END),
562 fwrite(DS_TAG_DEVICE_END,
sizeof(
char), strlen(DS_TAG_DEVICE_END),
564 fwrite(
"\n",
sizeof(
char), 1, profileFile);
572static void legalizeFileName(
char* fileName) {
574 const char* invalidChars =
577 for (
unsigned i = 0; i < strlen(invalidChars); i++) {
579 invalidStr[0] = invalidChars[i];
580 invalidStr[1] =
'\0';
586 for (
char* pos = strstr(fileName, invalidStr); pos !=
nullptr;
587 pos = strstr(pos + 1, invalidStr)) {
595static void populateGPUEnvFromDevice(GPUEnv* gpuInfo, cl_device_id device) {
598 gpuInfo->mnIsUserCreated = 1;
600 gpuInfo->mpDevID = device;
601 gpuInfo->mpArryDevsID =
new cl_device_id[1];
602 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
603 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
604 sizeof(cl_device_type), &gpuInfo->mDevType, &size);
605 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(TYPE)");
608 clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
609 sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
610 CHECK_OPENCL(clStatus,
"populateGPUEnv::getDeviceInfo(PLATFORM)");
612 cl_context_properties props[3];
613 props[0] = CL_CONTEXT_PLATFORM;
614 props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
617 clCreateContext(props, 1, &gpuInfo->mpDevID,
nullptr,
nullptr, &clStatus);
618 CHECK_OPENCL(clStatus,
"populateGPUEnv::createContext");
620 cl_command_queue_properties queueProperties = 0;
621 gpuInfo->mpCmdQueue = clCreateCommandQueue(
622 gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
623 CHECK_OPENCL(clStatus,
"populateGPUEnv::createCommandQueue");
626int OpenclDevice::LoadOpencl() {
628 HINSTANCE HOpenclDll =
nullptr;
629 void* OpenclDll =
nullptr;
631 OpenclDll =
static_cast<HINSTANCE
>(HOpenclDll);
632 OpenclDll = LoadLibrary(
"openCL.dll");
633 if (!
static_cast<HINSTANCE
>(OpenclDll)) {
634 fprintf(stderr,
"[OD] Load opencl.dll failed!\n");
635 FreeLibrary(
static_cast<HINSTANCE
>(OpenclDll));
638 fprintf(stderr,
"[OD] Load opencl.dll successful!\n");
642int OpenclDevice::SetKernelEnv(KernelEnv* envInfo) {
643 envInfo->mpkContext = gpuEnv.mpContext;
644 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
645 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
650static cl_mem allocateZeroCopyBuffer(
const KernelEnv& rEnv,
651 l_uint32* hostbuffer,
size_t nElements,
652 cl_mem_flags flags, cl_int* pStatus) {
654 clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
655 nElements *
sizeof(l_uint32), hostbuffer, pStatus);
660static Pix* mapOutputCLBuffer(
const KernelEnv& rEnv, cl_mem clbuffer, Pix* pixd,
661 Pix* pixs,
int elements, cl_mem_flags flags,
662 bool memcopy =
false,
bool sync =
true) {
665 if ((pixd = pixCreateTemplate(pixs)) ==
nullptr)
668 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
669 pixGetDepth(pixs))) ==
nullptr)
673 l_uint32* pValues = (l_uint32*)clEnqueueMapBuffer(
674 rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
675 elements *
sizeof(l_uint32), 0,
nullptr,
nullptr,
nullptr);
678 memcpy(pixGetData(pixd), pValues, elements *
sizeof(l_uint32));
680 pixSetData(pixd, pValues);
683 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0,
nullptr,
687 clFinish(rEnv.mpkCmdQueue);
693void OpenclDevice::releaseMorphCLBuffers() {
694 if (pixdCLIntermediate !=
nullptr) clReleaseMemObject(pixdCLIntermediate);
695 if (pixsCLBuffer !=
nullptr) clReleaseMemObject(pixsCLBuffer);
696 if (pixdCLBuffer !=
nullptr) clReleaseMemObject(pixdCLBuffer);
697 if (pixThBuffer !=
nullptr) clReleaseMemObject(pixThBuffer);
698 pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer =
nullptr;
701int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs) {
704 if (pixThBuffer !=
nullptr) {
705 pixsCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
706 CL_MEM_ALLOC_HOST_PTR, &clStatus);
710 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
711 sizeof(l_uint32) * wpl * h, 0,
nullptr,
nullptr);
715 reinterpret_cast<l_uint32*
>(malloc(wpl * h *
sizeof(l_uint32)));
716 memcpy(srcdata, pixGetData(pixs), wpl * h *
sizeof(l_uint32));
718 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h,
719 CL_MEM_USE_HOST_PTR, &clStatus);
722 pixdCLBuffer = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
723 CL_MEM_ALLOC_HOST_PTR, &clStatus);
725 pixdCLIntermediate = allocateZeroCopyBuffer(rEnv,
nullptr, wpl * h,
726 CL_MEM_ALLOC_HOST_PTR, &clStatus);
728 return (
int)clStatus;
731int OpenclDevice::InitEnv() {
735 if (1 == LoadOpencl())
break;
740 InitOpenclRunEnv_DeviceSelection(0);
744int OpenclDevice::ReleaseOpenclRunEnv() {
745 ReleaseOpenclEnv(&gpuEnv);
752inline int OpenclDevice::AddKernelConfig(
int kCount,
const char* kName) {
754 ASSERT_HOST(strlen(kName) <
sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
755 strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
756 gpuEnv.mnKernelCount++;
760int OpenclDevice::RegistOpenclKernel() {
761 if (!gpuEnv.mnIsUserCreated) memset(&gpuEnv, 0,
sizeof(gpuEnv));
763 gpuEnv.mnFileCount = 0;
764 gpuEnv.mnKernelCount = 0UL;
766 AddKernelConfig(1,
"oclAverageSub1");
770int OpenclDevice::InitOpenclRunEnv_DeviceSelection(
int argc) {
773 ds_device bestDevice_DS = getDeviceSelection();
774 cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
776 if (selectedDeviceIsOpenCL()) {
779 populateGPUEnvFromDevice(&gpuEnv, bestDevice);
780 gpuEnv.mnFileCount = 0;
781 gpuEnv.mnKernelCount = 0UL;
782 CompileKernelFile(&gpuEnv,
"");
792OpenclDevice::OpenclDevice() {
796OpenclDevice::~OpenclDevice() {
800int OpenclDevice::ReleaseOpenclEnv(GPUEnv* gpuInfo) {
808 for (i = 0; i < gpuEnv.mnFileCount; i++) {
809 if (gpuEnv.mpArryPrograms[i]) {
810 clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
811 CHECK_OPENCL(clStatus,
"clReleaseProgram");
812 gpuEnv.mpArryPrograms[i] =
nullptr;
815 if (gpuEnv.mpCmdQueue) {
816 clReleaseCommandQueue(gpuEnv.mpCmdQueue);
817 gpuEnv.mpCmdQueue =
nullptr;
819 if (gpuEnv.mpContext) {
820 clReleaseContext(gpuEnv.mpContext);
821 gpuEnv.mpContext =
nullptr;
824 gpuInfo->mnIsUserCreated = 0;
825 delete[] gpuInfo->mpArryDevsID;
828int OpenclDevice::BinaryGenerated(
const char* clFileName, FILE** fhandle) {
833 char fileName[256] = {0}, cl_name[128] = {0};
834 char deviceName[1024];
835 clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
836 sizeof(deviceName), deviceName,
nullptr);
837 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
838 const char* str = strstr(clFileName,
".cl");
839 memcpy(cl_name, clFileName, str - clFileName);
840 cl_name[str - clFileName] =
'\0';
841 sprintf(fileName,
"%s-%s.bin", cl_name, deviceName);
842 legalizeFileName(fileName);
843 fd = fopen(fileName,
"rb");
844 status = (fd !=
nullptr) ? 1 : 0;
850int OpenclDevice::CachedOfKernerPrg(
const GPUEnv* gpuEnvCached,
851 const char* clFileName) {
853 for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
854 if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
855 if (gpuEnvCached->mpArryPrograms[i] !=
nullptr) {
863int OpenclDevice::WriteBinaryToFile(
const char* fileName,
const char* birary,
865 FILE* output =
nullptr;
866 output = fopen(fileName,
"wb");
867 if (output ==
nullptr) {
871 fwrite(birary,
sizeof(
char), numBytes, output);
877int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
878 const char* clFileName) {
883 clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
884 sizeof(numDevices), &numDevices,
nullptr);
885 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
887 std::vector<cl_device_id> mpArryDevsID(numDevices);
890 clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
891 sizeof(cl_device_id) * numDevices,
894 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
897 std::vector<size_t> binarySizes(numDevices);
900 clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
901 sizeof(
size_t) * numDevices, &binarySizes[0],
nullptr);
902 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
905 std::vector<char*> binaries(numDevices);
907 for (i = 0; i < numDevices; i++) {
908 if (binarySizes[i] != 0) {
909 binaries[i] =
new char[binarySizes[i]];
911 binaries[i] =
nullptr;
916 clGetProgramInfo(program, CL_PROGRAM_BINARIES,
sizeof(
char*) * numDevices,
917 &binaries[0],
nullptr);
918 CHECK_OPENCL(clStatus,
"clGetProgramInfo");
921 for (i = 0; i < numDevices; i++) {
922 char fileName[256] = {0}, cl_name[128] = {0};
924 if (binarySizes[i] != 0) {
925 char deviceName[1024];
926 clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
927 sizeof(deviceName), deviceName,
nullptr);
928 CHECK_OPENCL(clStatus,
"clGetDeviceInfo");
930 const char* str = strstr(clFileName,
".cl");
931 memcpy(cl_name, clFileName, str - clFileName);
932 cl_name[str - clFileName] =
'\0';
933 sprintf(fileName,
"%s-%s.bin", cl_name, deviceName);
934 legalizeFileName(fileName);
935 if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
936 tprintf(
"[OD] write binary[%s] failed\n", fileName);
939 tprintf(
"[OD] write binary[%s] successfully\n", fileName);
944 for (i = 0; i < numDevices; i++) {
945 delete[] binaries[i];
951int OpenclDevice::CompileKernelFile(GPUEnv* gpuInfo,
const char* buildOption) {
954 size_t source_size[1];
955 int binary_status, binaryExisted, idx;
958 const char* filename =
"kernel.cl";
960 if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
964 idx = gpuInfo->mnFileCount;
968 source_size[0] = strlen(source);
970 binaryExisted = BinaryGenerated(
972 if (binaryExisted == 1) {
973 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
974 sizeof(numDevices), &numDevices,
nullptr);
975 CHECK_OPENCL(clStatus,
"clGetContextInfo");
977 std::vector<cl_device_id> mpArryDevsID(numDevices);
978 bool b_error = fseek(fd, 0, SEEK_END) < 0;
979 auto pos = std::ftell(fd);
980 b_error |= (pos <= 0);
982 b_error |= fseek(fd, 0, SEEK_SET) < 0;
988 std::vector<uint8_t> binary(length + 2);
990 memset(&binary[0], 0, length + 2);
991 b_error |= fread(&binary[0], 1, length, fd) != length;
996 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
997 sizeof(cl_device_id) * numDevices,
998 &mpArryDevsID[0],
nullptr);
999 CHECK_OPENCL(clStatus,
"clGetContextInfo");
1001 const uint8_t* c_binary = &binary[0];
1002 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary(
1003 gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length, &c_binary,
1004 &binary_status, &clStatus);
1005 CHECK_OPENCL(clStatus,
"clCreateProgramWithBinary");
1009 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource(
1010 gpuInfo->mpContext, 1, &source, source_size, &clStatus);
1011 CHECK_OPENCL(clStatus,
"clCreateProgramWithSource");
1014 if (gpuInfo->mpArryPrograms[idx] == (cl_program)
nullptr) {
1021 if (!gpuInfo->mnIsUserCreated) {
1023 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
1024 buildOption,
nullptr,
nullptr);
1027 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
1028 buildOption,
nullptr,
nullptr);
1030 if (clStatus != CL_SUCCESS) {
1031 tprintf(
"BuildProgram error!\n");
1033 if (!gpuInfo->mnIsUserCreated) {
1034 clStatus = clGetProgramBuildInfo(
1035 gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1036 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1039 clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1040 CL_PROGRAM_BUILD_LOG, 0,
nullptr, &length);
1042 if (clStatus != CL_SUCCESS) {
1043 tprintf(
"opencl create build log fail\n");
1046 std::vector<char> buildLog(length);
1047 if (!gpuInfo->mnIsUserCreated) {
1048 clStatus = clGetProgramBuildInfo(
1049 gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1050 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1052 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
1053 gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG,
1054 length, &buildLog[0], &length);
1056 if (clStatus != CL_SUCCESS) {
1057 tprintf(
"opencl program build info fail\n");
1061 fd1 = fopen(
"kernel-build.log",
"w+");
1062 if (fd1 !=
nullptr) {
1063 fwrite(&buildLog[0],
sizeof(
char), length, fd1);
1070 strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1071 if (binaryExisted == 0) {
1072 GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1075 gpuInfo->mnFileCount += 1;
1079l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32* tiffdata, l_int32 w,
1080 l_int32 h, l_int32 wpl,
1084 size_t globalThreads[2];
1085 size_t localThreads[2];
1091 gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1092 globalThreads[0] = gsize;
1093 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1094 globalThreads[1] = gsize;
1095 localThreads[0] = GROUPSIZE_X;
1096 localThreads[1] = GROUPSIZE_Y;
1098 SetKernelEnv(&rEnv);
1100 l_uint32* pResult = (l_uint32*)malloc(w * h *
sizeof(l_uint32));
1102 clCreateKernel(rEnv.mpkProgram,
"composeRGBPixel", &clStatus);
1103 CHECK_OPENCL(clStatus,
"clCreateKernel composeRGBPixel");
1106 valuesCl = allocateZeroCopyBuffer(
1107 rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1108 outputCl = allocateZeroCopyBuffer(
1109 rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1112 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &valuesCl);
1113 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1114 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(w), &w);
1115 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1116 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(h), &h);
1117 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1118 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1119 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1120 clStatus = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &outputCl);
1121 CHECK_OPENCL(clStatus,
"clSetKernelArg");
1125 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1126 globalThreads, localThreads, 0,
nullptr,
nullptr);
1127 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel");
1130 void* ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE,
1131 CL_MAP_READ, 0, w * h *
sizeof(l_uint32), 0,
1132 nullptr,
nullptr, &clStatus);
1133 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer outputCl");
1134 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0,
nullptr,
nullptr);
1137 clFinish(rEnv.mpkCmdQueue);
1143static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1144 size_t globalThreads[2];
1148 size_t localThreads[2];
1151 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1152 globalThreads[0] = gsize;
1153 globalThreads[1] = GROUPSIZE_HMORY;
1154 localThreads[0] = GROUPSIZE_HMORX;
1155 localThreads[1] = GROUPSIZE_HMORY;
1158 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_5x5", &status);
1159 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_5x5");
1161 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1162 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1163 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1164 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1167 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1168 globalThreads, localThreads, 0,
nullptr,
nullptr);
1171 pixtemp = pixsCLBuffer;
1172 pixsCLBuffer = pixdCLBuffer;
1173 pixdCLBuffer = pixtemp;
1176 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1177 globalThreads[0] = gsize;
1178 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1179 globalThreads[1] = gsize;
1180 localThreads[0] = GROUPSIZE_X;
1181 localThreads[1] = GROUPSIZE_Y;
1184 clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer_5x5", &status);
1185 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer_5x5");
1187 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1188 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1189 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1190 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1192 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1193 globalThreads, localThreads, 0,
nullptr,
nullptr);
1200static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1201 size_t globalThreads[2];
1205 l_uint32 fwmask, lwmask;
1206 size_t localThreads[2];
1208 lwmask = lmask32[31 - 2];
1209 fwmask = rmask32[31 - 2];
1212 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1213 globalThreads[0] = gsize;
1214 globalThreads[1] = GROUPSIZE_HMORY;
1215 localThreads[0] = GROUPSIZE_HMORX;
1216 localThreads[1] = GROUPSIZE_HMORY;
1219 clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_5x5", &status);
1220 CHECK_OPENCL(status,
"clCreateKernel morphoErodeHor_5x5");
1222 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1223 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1224 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1225 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1228 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1229 globalThreads, localThreads, 0,
nullptr,
nullptr);
1232 pixtemp = pixsCLBuffer;
1233 pixsCLBuffer = pixdCLBuffer;
1234 pixdCLBuffer = pixtemp;
1237 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1238 globalThreads[0] = gsize;
1239 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1240 globalThreads[1] = gsize;
1241 localThreads[0] = GROUPSIZE_X;
1242 localThreads[1] = GROUPSIZE_Y;
1245 clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer_5x5", &status);
1246 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer_5x5");
1248 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1249 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1250 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1251 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1252 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(fwmask), &fwmask);
1253 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(lwmask), &lwmask);
1255 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1256 globalThreads, localThreads, 0,
nullptr,
nullptr);
1262static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl,
1264 l_int32 xp, yp, xn, yn;
1266 size_t globalThreads[2];
1270 size_t localThreads[2];
1273 OpenclDevice::SetKernelEnv(&rEnv);
1275 if (hsize == 5 && vsize == 5) {
1277 status = pixDilateCL_55(wpl, h);
1281 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1283 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1286 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1287 globalThreads[0] = gsize;
1288 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1289 globalThreads[1] = gsize;
1290 localThreads[0] = GROUPSIZE_X;
1291 localThreads[1] = GROUPSIZE_Y;
1293 if (xp > 31 || xn > 31) {
1296 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor", &status);
1297 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor");
1299 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1300 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1301 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1302 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1303 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1304 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1305 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1306 nullptr, globalThreads, localThreads, 0,
1309 if (yp > 0 || yn > 0) {
1310 pixtemp = pixsCLBuffer;
1311 pixsCLBuffer = pixdCLBuffer;
1312 pixdCLBuffer = pixtemp;
1314 }
else if (xp > 0 || xn > 0) {
1317 clCreateKernel(rEnv.mpkProgram,
"morphoDilateHor_32word", &status);
1318 CHECK_OPENCL(status,
"clCreateKernel morphoDilateHor_32word");
1319 isEven = (xp != xn);
1321 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1322 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1323 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1324 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1325 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1326 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isEven), &isEven);
1327 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1328 nullptr, globalThreads, localThreads, 0,
1331 if (yp > 0 || yn > 0) {
1332 pixtemp = pixsCLBuffer;
1333 pixsCLBuffer = pixdCLBuffer;
1334 pixdCLBuffer = pixtemp;
1338 if (yp > 0 || yn > 0) {
1340 clCreateKernel(rEnv.mpkProgram,
"morphoDilateVer", &status);
1341 CHECK_OPENCL(status,
"clCreateKernel morphoDilateVer");
1343 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1344 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1345 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1346 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1347 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1348 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(yn), &yn);
1349 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1350 nullptr, globalThreads, localThreads, 0,
1358static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl,
1360 l_int32 xp, yp, xn, yn;
1362 size_t globalThreads[2];
1363 size_t localThreads[2];
1367 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1368 l_uint32 rwmask, lwmask;
1371 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1373 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1375 OpenclDevice::SetKernelEnv(&rEnv);
1377 if (hsize == 5 && vsize == 5 && isAsymmetric) {
1379 status = pixErodeCL_55(wpl, h);
1383 lwmask = lmask32[31 - (xn & 31)];
1384 rwmask = rmask32[31 - (xp & 31)];
1387 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1388 globalThreads[0] = gsize;
1389 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1390 globalThreads[1] = gsize;
1391 localThreads[0] = GROUPSIZE_X;
1392 localThreads[1] = GROUPSIZE_Y;
1395 if (xp > 31 || xn > 31) {
1397 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor", &status);
1399 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1400 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1401 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1402 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(xn), &xn);
1403 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(wpl), &wpl);
1404 status = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(h), &h);
1406 clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(isAsymmetric), &isAsymmetric);
1407 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(rwmask), &rwmask);
1408 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(lwmask), &lwmask);
1409 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1410 nullptr, globalThreads, localThreads, 0,
1413 if (yp > 0 || yn > 0) {
1414 pixtemp = pixsCLBuffer;
1415 pixsCLBuffer = pixdCLBuffer;
1416 pixdCLBuffer = pixtemp;
1418 }
else if (xp > 0 || xn > 0) {
1420 clCreateKernel(rEnv.mpkProgram,
"morphoErodeHor_32word", &status);
1421 isEven = (xp != xn);
1423 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1424 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1425 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(xp), &xp);
1426 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1427 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1429 clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1430 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(rwmask), &rwmask);
1431 status = clSetKernelArg(rEnv.mpkKernel, 7,
sizeof(lwmask), &lwmask);
1432 status = clSetKernelArg(rEnv.mpkKernel, 8,
sizeof(isEven), &isEven);
1433 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1434 nullptr, globalThreads, localThreads, 0,
1437 if (yp > 0 || yn > 0) {
1438 pixtemp = pixsCLBuffer;
1439 pixsCLBuffer = pixdCLBuffer;
1440 pixdCLBuffer = pixtemp;
1445 if (yp > 0 || yn > 0) {
1446 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram,
"morphoErodeVer", &status);
1447 CHECK_OPENCL(status,
"clCreateKernel morphoErodeVer");
1449 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &pixsCLBuffer);
1450 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &pixdCLBuffer);
1451 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(yp), &yp);
1452 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(wpl), &wpl);
1453 status = clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(h), &h);
1455 clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(isAsymmetric), &isAsymmetric);
1456 status = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(yn), &yn);
1457 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1458 nullptr, globalThreads, localThreads, 0,
1466static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1471 status = pixErodeCL(hsize, vsize, wpl, h);
1473 pixtemp = pixsCLBuffer;
1474 pixsCLBuffer = pixdCLBuffer;
1475 pixdCLBuffer = pixtemp;
1477 status = pixDilateCL(hsize, vsize, wpl, h);
1483static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1488 status = pixDilateCL(hsize, vsize, wpl, h);
1490 pixtemp = pixsCLBuffer;
1491 pixsCLBuffer = pixdCLBuffer;
1492 pixdCLBuffer = pixtemp;
1494 status = pixErodeCL(hsize, vsize, wpl, h);
1500static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1503 size_t globalThreads[2];
1505 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1507 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1508 globalThreads[0] = gsize;
1509 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1510 globalThreads[1] = gsize;
1513 clCreateKernel(rEnv.mpkProgram,
"pixSubtract_inplace", &status);
1514 CHECK_OPENCL(status,
"clCreateKernel pixSubtract_inplace");
1517 status = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &buffer1);
1518 status = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(cl_mem), &buffer2);
1519 status = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(wpl), &wpl);
1520 status = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(h), &h);
1522 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
nullptr,
1523 globalThreads, localThreads, 0,
nullptr,
nullptr);
1531void OpenclDevice::pixGetLinesCL(Pix* pixd, Pix* pixs, Pix** pix_vline,
1532 Pix** pix_hline, Pix** pixClosed,
1533 bool getpixClosed, l_int32 close_hsize,
1534 l_int32 close_vsize, l_int32 open_hsize,
1535 l_int32 open_vsize, l_int32 line_hsize,
1536 l_int32 line_vsize) {
1540 wpl = pixGetWpl(pixs);
1541 h = pixGetHeight(pixs);
1544 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1548 *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
1549 wpl * h, CL_MAP_READ,
true,
false);
1555 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1556 0,
sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1559 pixtemp = pixsCLBuffer;
1560 pixsCLBuffer = pixdCLBuffer;
1561 pixdCLBuffer = pixtemp;
1563 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1566 pixtemp = pixsCLBuffer;
1567 pixsCLBuffer = pixdCLBuffer;
1568 pixdCLBuffer = pixdCLIntermediate;
1569 pixdCLIntermediate = pixtemp;
1571 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1576 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1577 0,
sizeof(
int) * wpl * h, 0,
nullptr,
nullptr);
1579 pixtemp = pixsCLBuffer;
1580 pixsCLBuffer = pixdCLBuffer;
1581 pixdCLBuffer = pixtemp;
1585 clStatus = pixOpenCL(1, line_vsize, wpl, h);
1588 *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
1589 CL_MAP_READ,
true,
false);
1591 pixtemp = pixsCLBuffer;
1592 pixsCLBuffer = pixdCLIntermediate;
1593 pixdCLIntermediate = pixtemp;
1597 clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1600 *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
1601 CL_MAP_READ,
true,
true);
1612int OpenclDevice::HistogramRectOCL(
void* imageData,
1613 int bytes_per_pixel,
int bytes_per_line,
1616 int width,
int height,
int kHistogramSize,
1617 int* histogramAllChannels) {
1621 SetKernelEnv(&histKern);
1622 KernelEnv histRedKern;
1623 SetKernelEnv(&histRedKern);
1629 cl_mem imageBuffer = clCreateBuffer(
1630 histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1631 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1632 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1635 int block_size = 256;
1637 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1638 sizeof(numCUs), &numCUs,
nullptr);
1639 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1641 int requestedOccupancy = 10;
1642 int numWorkGroups = numCUs * requestedOccupancy;
1643 int numThreads = block_size * numWorkGroups;
1644 size_t local_work_size[] = {
static_cast<size_t>(block_size)};
1645 size_t global_work_size[] = {
static_cast<size_t>(numThreads)};
1646 size_t red_global_work_size[] = {
1647 static_cast<size_t>(block_size *
kHistogramSize * bytes_per_pixel)};
1651 cl_mem histogramBuffer = clCreateBuffer(
1652 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1653 kHistogramSize * bytes_per_pixel *
sizeof(
int), histogramAllChannels,
1655 CHECK_OPENCL(clStatus,
"clCreateBuffer histogramBuffer");
1659 int tmpHistogramBins =
kHistogramSize * bytes_per_pixel * histRed;
1661 cl_mem tmpHistogramBuffer =
1662 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
1663 tmpHistogramBins *
sizeof(cl_uint),
nullptr, &clStatus);
1664 CHECK_OPENCL(clStatus,
"clCreateBuffer tmpHistogramBuffer");
1667 int* zeroBuffer =
new int[1];
1669 cl_mem atomicSyncBuffer = clCreateBuffer(
1670 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1671 sizeof(cl_int), zeroBuffer, &clStatus);
1672 CHECK_OPENCL(clStatus,
"clCreateBuffer atomicSyncBuffer");
1673 delete[] zeroBuffer;
1675 if (bytes_per_pixel == 1) {
1676 histKern.mpkKernel = clCreateKernel(
1677 histKern.mpkProgram,
"kernel_HistogramRectOneChannel", &clStatus);
1678 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectOneChannel");
1680 histRedKern.mpkKernel =
1681 clCreateKernel(histRedKern.mpkProgram,
1682 "kernel_HistogramRectOneChannelReduction", &clStatus);
1683 CHECK_OPENCL(clStatus,
1684 "clCreateKernel kernel_HistogramRectOneChannelReduction");
1686 histKern.mpkKernel = clCreateKernel(
1687 histKern.mpkProgram,
"kernel_HistogramRectAllChannels", &clStatus);
1688 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_HistogramRectAllChannels");
1690 histRedKern.mpkKernel =
1691 clCreateKernel(histRedKern.mpkProgram,
1692 "kernel_HistogramRectAllChannelsReduction", &clStatus);
1693 CHECK_OPENCL(clStatus,
1694 "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1700 ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE,
1701 CL_MAP_WRITE, 0, tmpHistogramBins *
sizeof(cl_uint),
1702 0,
nullptr,
nullptr, &clStatus);
1703 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer tmpHistogramBuffer");
1705 memset(ptr, 0, tmpHistogramBins *
sizeof(cl_uint));
1706 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
1711 clSetKernelArg(histKern.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1712 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1713 cl_uint numPixels = width * height;
1714 clStatus = clSetKernelArg(histKern.mpkKernel, 1,
sizeof(cl_uint), &numPixels);
1715 CHECK_OPENCL(clStatus,
"clSetKernelArg numPixels");
1716 clStatus = clSetKernelArg(histKern.mpkKernel, 2,
sizeof(cl_mem),
1717 &tmpHistogramBuffer);
1718 CHECK_OPENCL(clStatus,
"clSetKernelArg tmpHistogramBuffer");
1721 int n = numThreads / bytes_per_pixel;
1722 clStatus = clSetKernelArg(histRedKern.mpkKernel, 0,
sizeof(cl_int), &n);
1723 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1724 clStatus = clSetKernelArg(histRedKern.mpkKernel, 1,
sizeof(cl_mem),
1725 &tmpHistogramBuffer);
1726 CHECK_OPENCL(clStatus,
"clSetKernelArg tmpHistogramBuffer");
1727 clStatus = clSetKernelArg(histRedKern.mpkKernel, 2,
sizeof(cl_mem),
1729 CHECK_OPENCL(clStatus,
"clSetKernelArg histogramBuffer");
1732 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
1733 nullptr, global_work_size, local_work_size,
1734 0,
nullptr,
nullptr);
1735 CHECK_OPENCL(clStatus,
1736 "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1737 clFinish(histKern.mpkCmdQueue);
1738 if (clStatus != 0) {
1742 clStatus = clEnqueueNDRangeKernel(
1743 histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1,
nullptr,
1744 red_global_work_size, local_work_size, 0,
nullptr,
nullptr);
1747 "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1748 clFinish(histRedKern.mpkCmdQueue);
1749 if (clStatus != 0) {
1754 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
1756 kHistogramSize * bytes_per_pixel *
sizeof(
int), 0,
1757 nullptr,
nullptr, &clStatus);
1758 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
1759 if (clStatus != 0) {
1762 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
1765 clReleaseMemObject(histogramBuffer);
1766 clReleaseMemObject(imageBuffer);
1775int OpenclDevice::ThresholdRectToPixOCL(
unsigned char* imageData,
1776 int bytes_per_pixel,
int bytes_per_line,
1777 int* thresholds,
int* hi_values,
1778 Pix** pix,
int height,
int width,
1779 int top,
int left) {
1782 *pix = pixCreate(width, height, 1);
1783 uint32_t* pixData = pixGetData(*pix);
1784 int wpl = pixGetWpl(*pix);
1785 int pixSize = wpl * height *
sizeof(uint32_t);
1789 SetKernelEnv(&rEnv);
1792 int block_size = 256;
1794 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1795 sizeof(numCUs), &numCUs,
nullptr);
1796 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1798 int requestedOccupancy = 10;
1799 int numWorkGroups = numCUs * requestedOccupancy;
1800 int numThreads = block_size * numWorkGroups;
1801 size_t local_work_size[] = {(size_t)block_size};
1802 size_t global_work_size[] = {(size_t)numThreads};
1809 cl_mem imageBuffer = clCreateBuffer(
1810 rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1811 width * height * bytes_per_pixel *
sizeof(
char), imageData, &clStatus);
1812 CHECK_OPENCL(clStatus,
"clCreateBuffer imageBuffer");
1816 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1817 pixSize, pixData, &clStatus);
1818 CHECK_OPENCL(clStatus,
"clCreateBuffer pix");
1821 cl_mem thresholdsBuffer =
1822 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1823 bytes_per_pixel *
sizeof(
int), thresholds, &clStatus);
1824 CHECK_OPENCL(clStatus,
"clCreateBuffer thresholdBuffer");
1825 cl_mem hiValuesBuffer =
1826 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1827 bytes_per_pixel *
sizeof(
int), hi_values, &clStatus);
1828 CHECK_OPENCL(clStatus,
"clCreateBuffer hiValuesBuffer");
1831 if (bytes_per_pixel == 4) {
1833 clCreateKernel(rEnv.mpkProgram,
"kernel_ThresholdRectToPix", &clStatus);
1834 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix");
1836 rEnv.mpkKernel = clCreateKernel(
1837 rEnv.mpkProgram,
"kernel_ThresholdRectToPix_OneChan", &clStatus);
1838 CHECK_OPENCL(clStatus,
"clCreateKernel kernel_ThresholdRectToPix_OneChan");
1842 clStatus = clSetKernelArg(rEnv.mpkKernel, 0,
sizeof(cl_mem), &imageBuffer);
1843 CHECK_OPENCL(clStatus,
"clSetKernelArg imageBuffer");
1844 clStatus = clSetKernelArg(rEnv.mpkKernel, 1,
sizeof(
int), &height);
1845 CHECK_OPENCL(clStatus,
"clSetKernelArg height");
1846 clStatus = clSetKernelArg(rEnv.mpkKernel, 2,
sizeof(
int), &width);
1847 CHECK_OPENCL(clStatus,
"clSetKernelArg width");
1848 clStatus = clSetKernelArg(rEnv.mpkKernel, 3,
sizeof(
int), &wpl);
1849 CHECK_OPENCL(clStatus,
"clSetKernelArg wpl");
1851 clSetKernelArg(rEnv.mpkKernel, 4,
sizeof(cl_mem), &thresholdsBuffer);
1852 CHECK_OPENCL(clStatus,
"clSetKernelArg thresholdsBuffer");
1853 clStatus = clSetKernelArg(rEnv.mpkKernel, 5,
sizeof(cl_mem), &hiValuesBuffer);
1854 CHECK_OPENCL(clStatus,
"clSetKernelArg hiValuesBuffer");
1855 clStatus = clSetKernelArg(rEnv.mpkKernel, 6,
sizeof(cl_mem), &pixThBuffer);
1856 CHECK_OPENCL(clStatus,
"clSetKernelArg pixThBuffer");
1859 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
1860 nullptr, global_work_size, local_work_size,
1861 0,
nullptr,
nullptr);
1862 CHECK_OPENCL(clStatus,
"clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1863 clFinish(rEnv.mpkCmdQueue);
1864 if (clStatus != 0) {
1865 tprintf(
"Setting return value to -1\n");
1870 clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
1871 pixSize, 0,
nullptr,
nullptr, &clStatus);
1872 CHECK_OPENCL(clStatus,
"clEnqueueMapBuffer histogramBuffer");
1873 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0,
nullptr,
1876 clReleaseMemObject(imageBuffer);
1877 clReleaseMemObject(thresholdsBuffer);
1878 clReleaseMemObject(hiValuesBuffer);
1887struct TessScoreEvaluationInputData {
1891 unsigned char* imageData;
1895static void populateTessScoreEvaluationInputData(
1896 TessScoreEvaluationInputData* input) {
1901 int numChannels = 4;
1902 input->height = height;
1903 input->width = width;
1904 input->numChannels = numChannels;
1905 unsigned char(*imageData4)[4] = (
unsigned char(*)[4])malloc(
1906 height * width * numChannels *
1907 sizeof(
unsigned char));
1908 input->imageData = (
unsigned char*)&imageData4[0];
1911 unsigned char pixelWhite[4] = {0, 0, 0, 255};
1912 unsigned char pixelBlack[4] = {255, 255, 255, 255};
1913 for (
int p = 0; p < height * width; p++) {
1915 imageData4[p][0] = pixelWhite[0];
1916 imageData4[p][1] = pixelWhite[1];
1917 imageData4[p][2] = pixelWhite[2];
1918 imageData4[p][3] = pixelWhite[3];
1921 int maxLineWidth = 64;
1924 for (
int i = 0; i < numLines; i++) {
1925 int lineWidth = rand() % maxLineWidth;
1926 int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1928 for (
int row = vertLinePos - lineWidth / 2;
1929 row < vertLinePos + lineWidth / 2; row++) {
1930 for (
int col = 0; col < height; col++) {
1932 imageData4[row * width + col][0] = pixelBlack[0];
1933 imageData4[row * width + col][1] = pixelBlack[1];
1934 imageData4[row * width + col][2] = pixelBlack[2];
1935 imageData4[row * width + col][3] = pixelBlack[3];
1940 for (
int i = 0; i < numLines; i++) {
1941 int lineWidth = rand() % maxLineWidth;
1942 int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1944 for (
int row = 0; row < width; row++) {
1945 for (
int col = horLinePos - lineWidth / 2;
1946 col < horLinePos + lineWidth / 2;
1951 imageData4[row * width + col][0] = pixelBlack[0];
1952 imageData4[row * width + col][1] = pixelBlack[1];
1953 imageData4[row * width + col][2] = pixelBlack[2];
1954 imageData4[row * width + col][3] = pixelBlack[3];
1959 float fractionBlack = 0.1;
1961 (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1962 for (
int i = 0; i < numSpots; i++) {
1963 int lineWidth = rand() % maxLineWidth;
1964 int col = lineWidth + rand() % (width - 2 * lineWidth);
1965 int row = lineWidth + rand() % (height - 2 * lineWidth);
1967 for (
int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1968 for (
int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
1971 imageData4[r * width + c][0] = pixelBlack[0];
1972 imageData4[r * width + c][1] = pixelBlack[1];
1973 imageData4[r * width + c][2] = pixelBlack[2];
1974 imageData4[r * width + c][3] = pixelBlack[3];
1979 input->pix = pixCreate(input->width, input->height, 8 * input->numChannels);
1982struct TessDeviceScore {
1992static double composeRGBPixelMicroBench(GPUEnv* env,
1993 TessScoreEvaluationInputData input,
1994 ds_device_type type) {
1997 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1998 QueryPerformanceFrequency(&freq);
2000 mach_timebase_info_data_t info = {0, 0};
2001 mach_timebase_info(&info);
2002 long long start, stop;
2004 timespec time_funct_start, time_funct_end;
2007 l_uint32* tiffdata =
2008 (l_uint32*)input.imageData;
2012 if (type == DS_DEVICE_OPENCL_DEVICE) {
2014 QueryPerformanceCounter(&time_funct_start);
2016 start = mach_absolute_time();
2018 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2021 OpenclDevice::gpuEnv = *env;
2022 int wpl = pixGetWpl(input.pix);
2023 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2026 QueryPerformanceCounter(&time_funct_end);
2027 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2028 (
double)(freq.QuadPart);
2030 stop = mach_absolute_time();
2031 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2033 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2034 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2035 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2040 QueryPerformanceCounter(&time_funct_start);
2042 start = mach_absolute_time();
2044 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2046 Pix* pix = pixCreate(input.width, input.height, 32);
2047 l_uint32* pixData = pixGetData(pix);
2050 for (i = 0; i < input.height; i++) {
2051 for (j = 0; j < input.width; j++) {
2052 l_uint32 tiffword = tiffdata[i * input.width + j];
2053 l_int32 rval = ((tiffword)&0xff);
2054 l_int32 gval = (((tiffword) >> 8) & 0xff);
2055 l_int32 bval = (((tiffword) >> 16) & 0xff);
2056 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2057 pixData[idx] = value;
2062 QueryPerformanceCounter(&time_funct_end);
2063 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2064 (
double)(freq.QuadPart);
2066 stop = mach_absolute_time();
2067 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2069 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2070 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2071 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2079static double histogramRectMicroBench(GPUEnv* env,
2080 TessScoreEvaluationInputData input,
2081 ds_device_type type) {
2084 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2085 QueryPerformanceFrequency(&freq);
2087 mach_timebase_info_data_t info = {0, 0};
2088 mach_timebase_info(&info);
2089 long long start, stop;
2091 timespec time_funct_start, time_funct_end;
2097 int bytes_per_line = input.width * input.numChannels;
2098 int* histogramAllChannels =
new int[
kHistogramSize * input.numChannels];
2100 if (type == DS_DEVICE_OPENCL_DEVICE) {
2102 QueryPerformanceCounter(&time_funct_start);
2104 start = mach_absolute_time();
2106 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2109 OpenclDevice::gpuEnv = *env;
2110 int retVal = OpenclDevice::HistogramRectOCL(
2111 input.imageData, input.numChannels, bytes_per_line, left, top,
2112 input.width, input.height, kHistogramSize, histogramAllChannels);
2115 QueryPerformanceCounter(&time_funct_end);
2116 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2117 (
double)(freq.QuadPart);
2119 stop = mach_absolute_time();
2121 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2126 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2127 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2128 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2133 QueryPerformanceCounter(&time_funct_start);
2135 start = mach_absolute_time();
2137 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2139 for (
int ch = 0; ch < input.numChannels; ++ch) {
2141 input.width, input.height, histogram);
2144 QueryPerformanceCounter(&time_funct_end);
2145 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2146 (
double)(freq.QuadPart);
2148 stop = mach_absolute_time();
2149 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2151 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2152 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2153 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2159 delete[] histogramAllChannels;
2164static void ThresholdRectToPix_Native(
const unsigned char* imagedata,
2165 int bytes_per_pixel,
int bytes_per_line,
2166 const int* thresholds,
2167 const int* hi_values, Pix** pix) {
2170 int width = pixGetWidth(*pix);
2171 int height = pixGetHeight(*pix);
2173 *pix = pixCreate(width, height, 1);
2174 uint32_t* pixdata = pixGetData(*pix);
2175 int wpl = pixGetWpl(*pix);
2176 const unsigned char* srcdata =
2177 imagedata + top * bytes_per_line + left * bytes_per_pixel;
2178 for (
int y = 0; y < height; ++y) {
2179 const uint8_t* linedata = srcdata;
2180 uint32_t* pixline = pixdata + y * wpl;
2181 for (
int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2182 bool white_result =
true;
2183 for (
int ch = 0; ch < bytes_per_pixel; ++ch) {
2184 if (hi_values[ch] >= 0 &&
2185 (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2186 white_result =
false;
2191 CLEAR_DATA_BIT(pixline, x);
2193 SET_DATA_BIT(pixline, x);
2195 srcdata += bytes_per_line;
2199static double thresholdRectToPixMicroBench(GPUEnv* env,
2200 TessScoreEvaluationInputData input,
2201 ds_device_type type) {
2204 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2205 QueryPerformanceFrequency(&freq);
2207 mach_timebase_info_data_t info = {0, 0};
2208 mach_timebase_info(&info);
2209 long long start, stop;
2211 timespec time_funct_start, time_funct_end;
2215 unsigned char pixelHi = (
unsigned char)255;
2216 int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2221 int bytes_per_line = input.width * input.numChannels;
2224 if (type == DS_DEVICE_OPENCL_DEVICE) {
2226 QueryPerformanceCounter(&time_funct_start);
2228 start = mach_absolute_time();
2230 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2233 OpenclDevice::gpuEnv = *env;
2235 int retVal = OpenclDevice::ThresholdRectToPixOCL(
2236 input.imageData, input.numChannels, bytes_per_line, thresholds,
2237 hi_values, &input.pix, input.height, input.width, top, left);
2240 QueryPerformanceCounter(&time_funct_end);
2241 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2242 (
double)(freq.QuadPart);
2244 stop = mach_absolute_time();
2246 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2252 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2253 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2254 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2260 QueryPerformanceCounter(&time_funct_start);
2262 start = mach_absolute_time();
2264 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2266 int hi_values[4] = {};
2267 ThresholdRectToPix_Native(input.imageData, input.numChannels,
2268 bytes_per_line, thresholds, hi_values,
2272 QueryPerformanceCounter(&time_funct_end);
2273 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2274 (
double)(freq.QuadPart);
2276 stop = mach_absolute_time();
2277 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2279 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2280 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2281 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2288static double getLineMasksMorphMicroBench(GPUEnv* env,
2289 TessScoreEvaluationInputData input,
2290 ds_device_type type) {
2293 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2294 QueryPerformanceFrequency(&freq);
2296 mach_timebase_info_data_t info = {0, 0};
2297 mach_timebase_info(&info);
2298 long long start, stop;
2300 timespec time_funct_start, time_funct_end;
2304 int resolution = 300;
2305 int wpl = pixGetWpl(input.pix);
2310 int closing_brick = max_line_width / 3;
2313 if (type == DS_DEVICE_OPENCL_DEVICE) {
2315 QueryPerformanceCounter(&time_funct_start);
2317 start = mach_absolute_time();
2319 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2321 OpenclDevice::gpuEnv = *env;
2322 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2323 Pix *pix_vline =
nullptr, *pix_hline =
nullptr, *pix_closed =
nullptr;
2324 OpenclDevice::pixGetLinesCL(
nullptr, input.pix, &pix_vline, &pix_hline,
2325 &pix_closed,
true, closing_brick, closing_brick,
2326 max_line_width, max_line_width, min_line_length,
2329 OpenclDevice::releaseMorphCLBuffers();
2332 QueryPerformanceCounter(&time_funct_end);
2333 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2334 (
double)(freq.QuadPart);
2336 stop = mach_absolute_time();
2337 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2339 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2340 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2341 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2345 QueryPerformanceCounter(&time_funct_start);
2347 start = mach_absolute_time();
2349 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2353 Pix* src_pix = input.pix;
2355 pixCloseBrick(
nullptr, src_pix, closing_brick, closing_brick);
2357 pixOpenBrick(
nullptr, pix_closed, max_line_width, max_line_width);
2358 Pix* pix_hollow = pixSubtract(
nullptr, pix_closed, pix_solid);
2359 pixDestroy(&pix_solid);
2360 Pix* pix_vline = pixOpenBrick(
nullptr, pix_hollow, 1, min_line_length);
2361 Pix* pix_hline = pixOpenBrick(
nullptr, pix_hollow, min_line_length, 1);
2362 pixDestroy(&pix_hline);
2363 pixDestroy(&pix_vline);
2364 pixDestroy(&pix_hollow);
2367 QueryPerformanceCounter(&time_funct_end);
2368 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2369 (
double)(freq.QuadPart);
2371 stop = mach_absolute_time();
2372 time = ((stop - start) * (
double)info.numer / info.denom) / 1.0E9;
2374 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2375 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2376 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2390static ds_status serializeScore(ds_device* device, uint8_t** serializedScore,
2391 unsigned int* serializedScoreSize) {
2392 *serializedScoreSize =
sizeof(TessDeviceScore);
2393 *serializedScore =
new uint8_t[*serializedScoreSize];
2394 memcpy(*serializedScore, device->score, *serializedScoreSize);
2399static ds_status deserializeScore(ds_device* device,
2400 const uint8_t* serializedScore,
2401 unsigned int serializedScoreSize) {
2403 device->score =
new TessDeviceScore;
2404 memcpy(device->score, serializedScore, serializedScoreSize);
2408static ds_status releaseScore(TessDeviceScore* score) {
2414static ds_status evaluateScoreForDevice(ds_device* device,
void* inputData) {
2417 tprintf(
"\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2418 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2419 GPUEnv* env =
nullptr;
2420 if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2421 env = &OpenclDevice::gpuEnv;
2422 memset(env, 0,
sizeof(*env));
2424 populateGPUEnvFromDevice(env, device->oclDeviceID);
2425 env->mnFileCount = 0;
2426 env->mnKernelCount = 0UL;
2428 OpenclDevice::CompileKernelFile(env,
"");
2431 TessScoreEvaluationInputData* input =
2432 static_cast<TessScoreEvaluationInputData*
>(inputData);
2435 double composeRGBPixelTime =
2436 composeRGBPixelMicroBench(env, *input, device->type);
2439 double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2442 double thresholdRectToPixTime =
2443 thresholdRectToPixMicroBench(env, *input, device->type);
2446 double getLineMasksMorphTime =
2447 getLineMasksMorphMicroBench(env, *input, device->type);
2451 float composeRGBPixelWeight = 1.2f;
2452 float histogramRectWeight = 2.4f;
2453 float thresholdRectToPixWeight = 4.5f;
2454 float getLineMasksMorphWeight = 5.0f;
2456 float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2457 histogramRectWeight * histogramRectTime +
2458 thresholdRectToPixWeight * thresholdRectToPixTime +
2459 getLineMasksMorphWeight * getLineMasksMorphTime;
2460 device->score =
new TessDeviceScore;
2461 device->score->time = weightedTime;
2463 tprintf(
"[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2464 device->type == DS_DEVICE_OPENCL_DEVICE ?
"OpenCL" :
"Native");
2465 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"composeRGBPixel", composeRGBPixelTime,
2466 composeRGBPixelWeight);
2467 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"HistogramRect", histogramRectTime,
2468 histogramRectWeight);
2469 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"ThresholdRectToPix",
2470 thresholdRectToPixTime, thresholdRectToPixWeight);
2471 tprintf(
"[DS]%25s: %f (w=%.1f)\n",
"getLineMasksMorph", getLineMasksMorphTime,
2472 getLineMasksMorphWeight);
2473 tprintf(
"[DS]%25s: %f\n",
"Score", device->score->time);
2478ds_device OpenclDevice::getDeviceSelection() {
2479 if (!deviceIsSelected) {
2481 if (1 == LoadOpencl()) {
2485 ds_profile* profile;
2486 status = initDSProfile(&profile,
"v0.1");
2488 const char* fileName =
"tesseract_opencl_profile_devices.dat";
2489 status = readProfileFromFile(profile, deserializeScore, fileName);
2490 if (status != DS_SUCCESS) {
2492 tprintf(
"[DS] Profile file not available (%s); performing profiling.\n",
2496 TessScoreEvaluationInputData input;
2497 populateTessScoreEvaluationInputData(&input);
2499 unsigned int numUpdates;
2500 status = profileDevices(profile, DS_EVALUATE_ALL,
2501 evaluateScoreForDevice, &input, &numUpdates);
2503 if (status == DS_SUCCESS) {
2504 status = writeProfileToFile(profile, serializeScore, fileName);
2505 if (status == DS_SUCCESS) {
2506 tprintf(
"[DS] Scores written to file (%s).\n", fileName);
2509 "[DS] Error saving scores to file (%s); scores not written to "
2515 "[DS] Unable to evaluate performance; scores not written to "
2519 tprintf(
"[DS] Profile read from file (%s).\n", fileName);
2524 float bestTime = FLT_MAX;
2525 int bestDeviceIdx = -1;
2526 for (
unsigned d = 0; d < profile->numDevices; d++) {
2527 ds_device device = profile->devices[d];
2528 if (device.score ==
nullptr)
continue;
2529 TessDeviceScore score = *device.score;
2531 float time = score.time;
2532 tprintf(
"[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type,
2533 device.oclDeviceName, time);
2534 if (time < bestTime) {
2539 if (bestDeviceIdx >= 0) {
2540 tprintf(
"[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2541 profile->devices[bestDeviceIdx].oclDeviceName,
2542 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2549 bool overridden =
false;
2550 char* overrideDeviceStr = getenv(
"TESSERACT_OPENCL_DEVICE");
2551 if (overrideDeviceStr !=
nullptr) {
2552 int overrideDeviceIdx = atoi(overrideDeviceStr);
2553 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2555 "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2557 overrideDeviceStr, overrideDeviceIdx);
2558 bestDeviceIdx = overrideDeviceIdx - 1;
2562 "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2563 "valid devices).\n",
2564 overrideDeviceStr, profile->numDevices);
2569 tprintf(
"[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2570 profile->devices[bestDeviceIdx].oclDeviceName,
2571 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2575 selectedDevice = profile->devices[bestDeviceIdx];
2577 releaseDSProfile(profile, releaseScore);
2580 tprintf(
"[DS] OpenCL runtime not available.\n");
2581 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2582 selectedDevice.oclDeviceName =
"(null)";
2583 selectedDevice.score =
nullptr;
2584 selectedDevice.oclDeviceID =
nullptr;
2585 selectedDevice.oclDriverVersion =
nullptr;
2587 deviceIsSelected =
true;
2589 return selectedDevice;
2592bool OpenclDevice::selectedDeviceIsOpenCL() {
2593 ds_device device = getDeviceSelection();
2594 return (device.type == DS_DEVICE_OPENCL_DEVICE);
DLLSYM void tprintf(const char *format,...)
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)