tesseract 4.1.1
Loading...
Searching...
No Matches
openclwrapper.cpp
Go to the documentation of this file.
1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4// http://www.apache.org/licenses/LICENSE-2.0
5// Unless required by applicable law or agreed to in writing, software
6// distributed under the License is distributed on an "AS IS" BASIS,
7// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8// See the License for the specific language governing permissions and
9// limitations under the License.
10#ifdef _WIN32
11#include <io.h>
12#else
13#include <sys/types.h>
14#include <unistd.h>
15#endif
16#include <cfloat>
17#include <ctime> // for clock_gettime
18
19#include "oclkernels.h"
20#include "openclwrapper.h"
21
22// for micro-benchmark
23#include "otsuthr.h"
24#include "thresholder.h"
25
26// platform preprocessor commands
27#if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || \
28 defined(__CYGWIN__) || defined(__MINGW32__)
29#define ON_WINDOWS 1
30#define ON_APPLE 0
31#elif defined(__linux__)
32#define ON_WINDOWS 0
33#define ON_APPLE 0
34#elif defined(__APPLE__)
35#define ON_WINDOWS 0
36#define ON_APPLE 1
37#else
38#define ON_WINDOWS 0
39#define ON_APPLE 0
40#endif
41
42#if ON_APPLE
43#include <mach/mach_time.h>
44#endif
45
46#ifdef USE_OPENCL
47
48#include <cstdio>
49#include <cstring> // for memset, strcpy, ...
50#include <vector>
51
52#include "errcode.h" // for ASSERT_HOST
53
54GPUEnv OpenclDevice::gpuEnv;
55
56bool OpenclDevice::deviceIsSelected = false;
57ds_device OpenclDevice::selectedDevice;
58
59int OpenclDevice::isInited = 0;
60
61static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
62
63static const l_uint32 lmask32[] = {
64 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
65 0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
66 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
67 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
68 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
69 0xfffffffe, 0xffffffff};
70
71static const l_uint32 rmask32[] = {
72 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
73 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
74 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
75 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
76 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
77 0x7fffffff, 0xffffffff};
78
79static cl_mem pixsCLBuffer, pixdCLBuffer,
80 pixdCLIntermediate; // Morph operations buffers
81static cl_mem pixThBuffer; // output from thresholdtopix calculation
82static cl_int clStatus;
83static KernelEnv rEnv;
84
85#define DS_TAG_VERSION "<version>"
86#define DS_TAG_VERSION_END "</version>"
87#define DS_TAG_DEVICE "<device>"
88#define DS_TAG_DEVICE_END "</device>"
89#define DS_TAG_SCORE "<score>"
90#define DS_TAG_SCORE_END "</score>"
91#define DS_TAG_DEVICE_TYPE "<type>"
92#define DS_TAG_DEVICE_TYPE_END "</type>"
93#define DS_TAG_DEVICE_NAME "<name>"
94#define DS_TAG_DEVICE_NAME_END "</name>"
95#define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
96#define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
97
98#define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
99
100#define DS_DEVICE_NAME_LENGTH 256
101
102enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
103
104struct ds_profile {
105 std::vector<ds_device> devices;
106 unsigned int numDevices;
107 const char* version;
108};
109
110enum ds_status {
111 DS_SUCCESS = 0,
112 DS_INVALID_PROFILE = 1000,
113 DS_MEMORY_ERROR,
114 DS_INVALID_PERF_EVALUATOR_TYPE,
115 DS_INVALID_PERF_EVALUATOR,
116 DS_PERF_EVALUATOR_ERROR,
117 DS_FILE_ERROR,
118 DS_UNKNOWN_DEVICE_TYPE,
119 DS_PROFILE_FILE_ERROR,
120 DS_SCORE_SERIALIZER_ERROR,
121 DS_SCORE_DESERIALIZER_ERROR
122};
123
124// Pointer to a function that calculates the score of a device (ex:
125// device->score) update the data size of score. The encoding and the format
126// of the score data is implementation defined. The function should return
127// DS_SUCCESS if there's no error to be reported.
128typedef ds_status (*ds_perf_evaluator)(ds_device* device, void* data);
129
130// deallocate memory used by score
131typedef ds_status (*ds_score_release)(TessDeviceScore* score);
132
133static ds_status releaseDSProfile(ds_profile* profile, ds_score_release sr) {
134 ds_status status = DS_SUCCESS;
135 if (profile != nullptr) {
136 if (sr != nullptr) {
137 unsigned int i;
138 for (i = 0; i < profile->numDevices; i++) {
139 free(profile->devices[i].oclDeviceName);
140 free(profile->devices[i].oclDriverVersion);
141 status = sr(profile->devices[i].score);
142 if (status != DS_SUCCESS) break;
143 }
144 }
145 delete profile;
146 }
147 return status;
148}
149
150static ds_status initDSProfile(ds_profile** p, const char* version) {
151 int numDevices;
152 cl_uint numPlatforms;
153 std::vector<cl_platform_id> platforms;
154 std::vector <cl_device_id> devices;
155 ds_status status = DS_SUCCESS;
156 unsigned int next;
157 unsigned int i;
158
159 if (p == nullptr) return DS_INVALID_PROFILE;
160
161 ds_profile* profile = new ds_profile;
162
163 memset(profile, 0, sizeof(ds_profile));
164
165 clGetPlatformIDs(0, nullptr, &numPlatforms);
166
167 if (numPlatforms > 0) {
168 platforms.reserve(numPlatforms);
169 clGetPlatformIDs(numPlatforms, &platforms[0], nullptr);
170 }
171
172 numDevices = 0;
173 for (i = 0; i < numPlatforms; i++) {
174 cl_uint num;
175 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, nullptr, &num);
176 numDevices += num;
177 }
178
179 if (numDevices > 0) {
180 devices.reserve(numDevices);
181 }
182
183 profile->numDevices =
184 numDevices + 1; // +1 to numDevices to include the native CPU
185 profile->devices.reserve(profile->numDevices);
186 memset(&profile->devices[0], 0, profile->numDevices * sizeof(ds_device));
187
188 next = 0;
189 for (i = 0; i < numPlatforms; i++) {
190 cl_uint num;
191 unsigned j;
192 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
193 for (j = 0; j < num; j++, next++) {
194 char buffer[DS_DEVICE_NAME_LENGTH];
195 size_t length;
196
197 profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
198 profile->devices[next].oclDeviceID = devices[j];
199
200 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME,
201 DS_DEVICE_NAME_LENGTH, &buffer, nullptr);
202 length = strlen(buffer);
203 profile->devices[next].oclDeviceName = (char*)malloc(length + 1);
204 memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
205
206 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION,
207 DS_DEVICE_NAME_LENGTH, &buffer, nullptr);
208 length = strlen(buffer);
209 profile->devices[next].oclDriverVersion = (char*)malloc(length + 1);
210 memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
211 }
212 }
213 profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
214 profile->version = version;
215
216 *p = profile;
217 return status;
218}
219
220static ds_status profileDevices(ds_profile* profile,
221 const ds_evaluation_type type,
222 ds_perf_evaluator evaluator,
223 void* evaluatorData, unsigned int* numUpdates) {
224 ds_status status = DS_SUCCESS;
225 unsigned int i;
226 unsigned int updates = 0;
227
228 if (profile == nullptr) {
229 return DS_INVALID_PROFILE;
230 }
231 if (evaluator == nullptr) {
232 return DS_INVALID_PERF_EVALUATOR;
233 }
234
235 for (i = 0; i < profile->numDevices; i++) {
236 ds_status evaluatorStatus;
237
238 switch (type) {
239 case DS_EVALUATE_NEW_ONLY:
240 if (profile->devices[i].score != nullptr) break;
241 // else fall through
242 case DS_EVALUATE_ALL:
243 evaluatorStatus = evaluator(&profile->devices[i], evaluatorData);
244 if (evaluatorStatus != DS_SUCCESS) {
245 status = evaluatorStatus;
246 return status;
247 }
248 updates++;
249 break;
250 default:
251 return DS_INVALID_PERF_EVALUATOR_TYPE;
252 break;
253 };
254 }
255 if (numUpdates) *numUpdates = updates;
256 return status;
257}
258
259static const char* findString(const char* contentStart, const char* contentEnd,
260 const char* string) {
261 size_t stringLength;
262 const char* currentPosition;
263 const char* found = nullptr;
264 stringLength = strlen(string);
265 currentPosition = contentStart;
266 for (currentPosition = contentStart; currentPosition < contentEnd;
267 currentPosition++) {
268 if (*currentPosition == string[0]) {
269 if (currentPosition + stringLength < contentEnd) {
270 if (strncmp(currentPosition, string, stringLength) == 0) {
271 found = currentPosition;
272 break;
273 }
274 }
275 }
276 }
277 return found;
278}
279
280static ds_status readProFile(const char* fileName, char** content,
281 size_t* contentSize) {
282 *contentSize = 0;
283 *content = nullptr;
284 ds_status status = DS_SUCCESS;
285 FILE* input = fopen(fileName, "rb");
286 if (input == nullptr) {
287 status = DS_FILE_ERROR;
288 } else {
289 fseek(input, 0L, SEEK_END);
290 auto pos = std::ftell(input);
291 rewind(input);
292 if (pos > 0) {
293 size_t size = pos;
294 char *binary = new char[size];
295 if (fread(binary, sizeof(char), size, input) != size) {
296 status = DS_FILE_ERROR;
297 delete[] binary;
298 } else {
299 *contentSize = size;
300 *content = binary;
301 }
302 }
303 fclose(input);
304 }
305 return status;
306}
307
308typedef ds_status (*ds_score_deserializer)(ds_device* device,
309 const uint8_t* serializedScore,
310 unsigned int serializedScoreSize);
311
312static ds_status readProfileFromFile(ds_profile* profile,
313 ds_score_deserializer deserializer,
314 const char* file) {
315 ds_status status = DS_SUCCESS;
316 char* contentStart;
317 size_t contentSize;
318
319 if (profile == nullptr) return DS_INVALID_PROFILE;
320
321 status = readProFile(file, &contentStart, &contentSize);
322 if (status == DS_SUCCESS) {
323 const char* currentPosition;
324 const char* dataStart;
325 const char* dataEnd;
326
327 const char* contentEnd = contentStart + contentSize;
328 currentPosition = contentStart;
329
330 // parse the version string
331 dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
332 if (dataStart == nullptr) {
333 status = DS_PROFILE_FILE_ERROR;
334 goto cleanup;
335 }
336 dataStart += strlen(DS_TAG_VERSION);
337
338 dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
339 if (dataEnd == nullptr) {
340 status = DS_PROFILE_FILE_ERROR;
341 goto cleanup;
342 }
343
344 size_t versionStringLength = strlen(profile->version);
345 if (versionStringLength + dataStart != dataEnd ||
346 strncmp(profile->version, dataStart, versionStringLength) != 0) {
347 // version mismatch
348 status = DS_PROFILE_FILE_ERROR;
349 goto cleanup;
350 }
351 currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
352
353 // parse the device information
354 while (1) {
355 unsigned int i;
356
357 const char* deviceTypeStart;
358 const char* deviceTypeEnd;
359 ds_device_type deviceType;
360
361 const char* deviceNameStart;
362 const char* deviceNameEnd;
363
364 const char* deviceScoreStart;
365 const char* deviceScoreEnd;
366
367 const char* deviceDriverStart;
368 const char* deviceDriverEnd;
369
370 dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
371 if (dataStart == nullptr) {
372 // nothing useful remain, quit...
373 break;
374 }
375 dataStart += strlen(DS_TAG_DEVICE);
376 dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
377 if (dataEnd == nullptr) {
378 status = DS_PROFILE_FILE_ERROR;
379 goto cleanup;
380 }
381
382 // parse the device type
383 deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
384 if (deviceTypeStart == nullptr) {
385 status = DS_PROFILE_FILE_ERROR;
386 goto cleanup;
387 }
388 deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
389 deviceTypeEnd =
390 findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
391 if (deviceTypeEnd == nullptr) {
392 status = DS_PROFILE_FILE_ERROR;
393 goto cleanup;
394 }
395 memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
396
397 // parse the device name
398 if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
399 deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
400 if (deviceNameStart == nullptr) {
401 status = DS_PROFILE_FILE_ERROR;
402 goto cleanup;
403 }
404 deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
405 deviceNameEnd =
406 findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
407 if (deviceNameEnd == nullptr) {
408 status = DS_PROFILE_FILE_ERROR;
409 goto cleanup;
410 }
411
412 deviceDriverStart =
413 findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
414 if (deviceDriverStart == nullptr) {
415 status = DS_PROFILE_FILE_ERROR;
416 goto cleanup;
417 }
418 deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
419 deviceDriverEnd = findString(deviceDriverStart, contentEnd,
420 DS_TAG_DEVICE_DRIVER_VERSION_END);
421 if (deviceDriverEnd == nullptr) {
422 status = DS_PROFILE_FILE_ERROR;
423 goto cleanup;
424 }
425
426 // check if this device is on the system
427 for (i = 0; i < profile->numDevices; i++) {
428 if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
429 size_t actualDeviceNameLength;
430 size_t driverVersionLength;
431
432 actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
433 driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
434 if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
435 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
436 strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
437 actualDeviceNameLength) == 0 &&
438 strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
439 driverVersionLength) == 0) {
440 deviceScoreStart =
441 findString(dataStart, contentEnd, DS_TAG_SCORE);
442 deviceScoreStart += strlen(DS_TAG_SCORE);
443 deviceScoreEnd =
444 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
445 status = deserializer(&profile->devices[i],
446 (const unsigned char*)deviceScoreStart,
447 deviceScoreEnd - deviceScoreStart);
448 if (status != DS_SUCCESS) {
449 goto cleanup;
450 }
451 }
452 }
453 }
454 } else if (deviceType == DS_DEVICE_NATIVE_CPU) {
455 for (i = 0; i < profile->numDevices; i++) {
456 if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
457 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
458 if (deviceScoreStart == nullptr) {
459 status = DS_PROFILE_FILE_ERROR;
460 goto cleanup;
461 }
462 deviceScoreStart += strlen(DS_TAG_SCORE);
463 deviceScoreEnd =
464 findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
465 status = deserializer(&profile->devices[i],
466 (const unsigned char*)deviceScoreStart,
467 deviceScoreEnd - deviceScoreStart);
468 if (status != DS_SUCCESS) {
469 goto cleanup;
470 }
471 }
472 }
473 }
474
475 // skip over the current one to find the next device
476 currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
477 }
478 }
479cleanup:
480 delete[] contentStart;
481 return status;
482}
483
484typedef ds_status (*ds_score_serializer)(ds_device* device,
485 uint8_t** serializedScore,
486 unsigned int* serializedScoreSize);
487static ds_status writeProfileToFile(ds_profile* profile,
488 ds_score_serializer serializer,
489 const char* file) {
490 ds_status status = DS_SUCCESS;
491
492 if (profile == nullptr) return DS_INVALID_PROFILE;
493
494 FILE* profileFile = fopen(file, "wb");
495 if (profileFile == nullptr) {
496 status = DS_FILE_ERROR;
497 } else {
498 unsigned int i;
499
500 // write version string
501 fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
502 fwrite(profile->version, sizeof(char), strlen(profile->version),
503 profileFile);
504 fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END),
505 profileFile);
506 fwrite("\n", sizeof(char), 1, profileFile);
507
508 for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
509 uint8_t* serializedScore;
510 unsigned int serializedScoreSize;
511
512 fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
513
514 fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE),
515 profileFile);
516 fwrite(&profile->devices[i].type, sizeof(ds_device_type), 1, profileFile);
517 fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char),
518 strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
519
520 switch (profile->devices[i].type) {
521 case DS_DEVICE_NATIVE_CPU: {
522 // There's no need to emit a device name for the native CPU device.
523 /*
524 fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
525 profileFile);
526 fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
527 strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
528 fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
529 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
530 */
531 } break;
532 case DS_DEVICE_OPENCL_DEVICE: {
533 fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
534 profileFile);
535 fwrite(profile->devices[i].oclDeviceName, sizeof(char),
536 strlen(profile->devices[i].oclDeviceName), profileFile);
537 fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
538 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
539
540 fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char),
541 strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
542 fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
543 strlen(profile->devices[i].oclDriverVersion), profileFile);
544 fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
545 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
546 } break;
547 default:
548 status = DS_UNKNOWN_DEVICE_TYPE;
549 continue;
550 };
551
552 fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
553 status = serializer(&profile->devices[i], &serializedScore,
554 &serializedScoreSize);
555 if (status == DS_SUCCESS && serializedScore != nullptr &&
556 serializedScoreSize > 0) {
557 fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
558 delete[] serializedScore;
559 }
560 fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END),
561 profileFile);
562 fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END),
563 profileFile);
564 fwrite("\n", sizeof(char), 1, profileFile);
565 }
566 fclose(profileFile);
567 }
568 return status;
569}
570
571// substitute invalid characters in device name with _
572static void legalizeFileName(char* fileName) {
573 // tprintf("fileName: %s\n", fileName);
574 const char* invalidChars =
575 "/\?:*\"><| "; // space is valid but can cause headaches
576 // for each invalid char
577 for (unsigned i = 0; i < strlen(invalidChars); i++) {
578 char invalidStr[4];
579 invalidStr[0] = invalidChars[i];
580 invalidStr[1] = '\0';
581 // tprintf("eliminating %s\n", invalidStr);
582 // char *pos = strstr(fileName, invalidStr);
583 // initial ./ is valid for present directory
584 // if (*pos == '.') pos++;
585 // if (*pos == '/') pos++;
586 for (char* pos = strstr(fileName, invalidStr); pos != nullptr;
587 pos = strstr(pos + 1, invalidStr)) {
588 // tprintf("\tfound: %s, ", pos);
589 pos[0] = '_';
590 // tprintf("fileName: %s\n", fileName);
591 }
592 }
593}
594
595static void populateGPUEnvFromDevice(GPUEnv* gpuInfo, cl_device_id device) {
596 // tprintf("[DS] populateGPUEnvFromDevice\n");
597 size_t size;
598 gpuInfo->mnIsUserCreated = 1;
599 // device
600 gpuInfo->mpDevID = device;
601 gpuInfo->mpArryDevsID = new cl_device_id[1];
602 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
603 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
604 sizeof(cl_device_type), &gpuInfo->mDevType, &size);
605 CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
606 // platform
607 clStatus =
608 clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
609 sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
610 CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
611 // context
612 cl_context_properties props[3];
613 props[0] = CL_CONTEXT_PLATFORM;
614 props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
615 props[2] = 0;
616 gpuInfo->mpContext =
617 clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr, nullptr, &clStatus);
618 CHECK_OPENCL(clStatus, "populateGPUEnv::createContext");
619 // queue
620 cl_command_queue_properties queueProperties = 0;
621 gpuInfo->mpCmdQueue = clCreateCommandQueue(
622 gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
623 CHECK_OPENCL(clStatus, "populateGPUEnv::createCommandQueue");
624}
625
626int OpenclDevice::LoadOpencl() {
627#ifdef WIN32
628 HINSTANCE HOpenclDll = nullptr;
629 void* OpenclDll = nullptr;
630 // fprintf(stderr, " LoadOpenclDllxx... \n");
631 OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
632 OpenclDll = LoadLibrary("openCL.dll");
633 if (!static_cast<HINSTANCE>(OpenclDll)) {
634 fprintf(stderr, "[OD] Load opencl.dll failed!\n");
635 FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
636 return 0;
637 }
638 fprintf(stderr, "[OD] Load opencl.dll successful!\n");
639#endif
640 return 1;
641}
642int OpenclDevice::SetKernelEnv(KernelEnv* envInfo) {
643 envInfo->mpkContext = gpuEnv.mpContext;
644 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
645 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
646
647 return 1;
648}
649
650static cl_mem allocateZeroCopyBuffer(const KernelEnv& rEnv,
651 l_uint32* hostbuffer, size_t nElements,
652 cl_mem_flags flags, cl_int* pStatus) {
653 cl_mem membuffer =
654 clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
655 nElements * sizeof(l_uint32), hostbuffer, pStatus);
656
657 return membuffer;
658}
659
660static Pix* mapOutputCLBuffer(const KernelEnv& rEnv, cl_mem clbuffer, Pix* pixd,
661 Pix* pixs, int elements, cl_mem_flags flags,
662 bool memcopy = false, bool sync = true) {
663 if (!pixd) {
664 if (memcopy) {
665 if ((pixd = pixCreateTemplate(pixs)) == nullptr)
666 tprintf("pixd not made\n");
667 } else {
668 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
669 pixGetDepth(pixs))) == nullptr)
670 tprintf("pixd not made\n");
671 }
672 }
673 l_uint32* pValues = (l_uint32*)clEnqueueMapBuffer(
674 rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
675 elements * sizeof(l_uint32), 0, nullptr, nullptr, nullptr);
676
677 if (memcopy) {
678 memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
679 } else {
680 pixSetData(pixd, pValues);
681 }
682
683 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, nullptr,
684 nullptr);
685
686 if (sync) {
687 clFinish(rEnv.mpkCmdQueue);
688 }
689
690 return pixd;
691}
692
693void OpenclDevice::releaseMorphCLBuffers() {
694 if (pixdCLIntermediate != nullptr) clReleaseMemObject(pixdCLIntermediate);
695 if (pixsCLBuffer != nullptr) clReleaseMemObject(pixsCLBuffer);
696 if (pixdCLBuffer != nullptr) clReleaseMemObject(pixdCLBuffer);
697 if (pixThBuffer != nullptr) clReleaseMemObject(pixThBuffer);
698 pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr;
699}
700
701int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs) {
702 SetKernelEnv(&rEnv);
703
704 if (pixThBuffer != nullptr) {
705 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
706 CL_MEM_ALLOC_HOST_PTR, &clStatus);
707
708 // Get the output from ThresholdToPix operation
709 clStatus =
710 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
711 sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr);
712 } else {
713 // Get data from the source image
714 l_uint32* srcdata =
715 reinterpret_cast<l_uint32*>(malloc(wpl * h * sizeof(l_uint32)));
716 memcpy(srcdata, pixGetData(pixs), wpl * h * sizeof(l_uint32));
717
718 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h,
719 CL_MEM_USE_HOST_PTR, &clStatus);
720 }
721
722 pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
723 CL_MEM_ALLOC_HOST_PTR, &clStatus);
724
725 pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
726 CL_MEM_ALLOC_HOST_PTR, &clStatus);
727
728 return (int)clStatus;
729}
730
731int OpenclDevice::InitEnv() {
732// tprintf("[OD] OpenclDevice::InitEnv()\n");
733#ifdef SAL_WIN32
734 while (1) {
735 if (1 == LoadOpencl()) break;
736 }
737#endif
738 // sets up environment, compiles programs
739
740 InitOpenclRunEnv_DeviceSelection(0);
741 return 1;
742}
743
744int OpenclDevice::ReleaseOpenclRunEnv() {
745 ReleaseOpenclEnv(&gpuEnv);
746#ifdef SAL_WIN32
747 FreeOpenclDll();
748#endif
749 return 1;
750}
751
752inline int OpenclDevice::AddKernelConfig(int kCount, const char* kName) {
753 ASSERT_HOST(kCount > 0);
754 ASSERT_HOST(strlen(kName) < sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
755 strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
756 gpuEnv.mnKernelCount++;
757 return 0;
758}
759
760int OpenclDevice::RegistOpenclKernel() {
761 if (!gpuEnv.mnIsUserCreated) memset(&gpuEnv, 0, sizeof(gpuEnv));
762
763 gpuEnv.mnFileCount = 0; // argc;
764 gpuEnv.mnKernelCount = 0UL;
765
766 AddKernelConfig(1, "oclAverageSub1");
767 return 0;
768}
769
770int OpenclDevice::InitOpenclRunEnv_DeviceSelection(int argc) {
771 if (!isInited) {
772 // after programs compiled, selects best device
773 ds_device bestDevice_DS = getDeviceSelection();
774 cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
775 // overwrite global static GPUEnv with new device
776 if (selectedDeviceIsOpenCL()) {
777 // tprintf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice()
778 // for selected device\n");
779 populateGPUEnvFromDevice(&gpuEnv, bestDevice);
780 gpuEnv.mnFileCount = 0; // argc;
781 gpuEnv.mnKernelCount = 0UL;
782 CompileKernelFile(&gpuEnv, "");
783 } else {
784 // tprintf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice()
785 // b/c native cpu selected\n");
786 }
787 isInited = 1;
788 }
789 return 0;
790}
791
792OpenclDevice::OpenclDevice() {
793 // InitEnv();
794}
795
796OpenclDevice::~OpenclDevice() {
797 // ReleaseOpenclRunEnv();
798}
799
800int OpenclDevice::ReleaseOpenclEnv(GPUEnv* gpuInfo) {
801 int i = 0;
802 int clStatus = 0;
803
804 if (!isInited) {
805 return 1;
806 }
807
808 for (i = 0; i < gpuEnv.mnFileCount; i++) {
809 if (gpuEnv.mpArryPrograms[i]) {
810 clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
811 CHECK_OPENCL(clStatus, "clReleaseProgram");
812 gpuEnv.mpArryPrograms[i] = nullptr;
813 }
814 }
815 if (gpuEnv.mpCmdQueue) {
816 clReleaseCommandQueue(gpuEnv.mpCmdQueue);
817 gpuEnv.mpCmdQueue = nullptr;
818 }
819 if (gpuEnv.mpContext) {
820 clReleaseContext(gpuEnv.mpContext);
821 gpuEnv.mpContext = nullptr;
822 }
823 isInited = 0;
824 gpuInfo->mnIsUserCreated = 0;
825 delete[] gpuInfo->mpArryDevsID;
826 return 1;
827}
828int OpenclDevice::BinaryGenerated(const char* clFileName, FILE** fhandle) {
829 unsigned int i = 0;
830 cl_int clStatus;
831 int status = 0;
832 FILE* fd = nullptr;
833 char fileName[256] = {0}, cl_name[128] = {0};
834 char deviceName[1024];
835 clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
836 sizeof(deviceName), deviceName, nullptr);
837 CHECK_OPENCL(clStatus, "clGetDeviceInfo");
838 const char* str = strstr(clFileName, ".cl");
839 memcpy(cl_name, clFileName, str - clFileName);
840 cl_name[str - clFileName] = '\0';
841 sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
842 legalizeFileName(fileName);
843 fd = fopen(fileName, "rb");
844 status = (fd != nullptr) ? 1 : 0;
845 if (fd != nullptr) {
846 *fhandle = fd;
847 }
848 return status;
849}
850int OpenclDevice::CachedOfKernerPrg(const GPUEnv* gpuEnvCached,
851 const char* clFileName) {
852 int i;
853 for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
854 if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
855 if (gpuEnvCached->mpArryPrograms[i] != nullptr) {
856 return 1;
857 }
858 }
859 }
860
861 return 0;
862}
863int OpenclDevice::WriteBinaryToFile(const char* fileName, const char* birary,
864 size_t numBytes) {
865 FILE* output = nullptr;
866 output = fopen(fileName, "wb");
867 if (output == nullptr) {
868 return 0;
869 }
870
871 fwrite(birary, sizeof(char), numBytes, output);
872 fclose(output);
873
874 return 1;
875}
876
877int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
878 const char* clFileName) {
879 unsigned int i = 0;
880 cl_int clStatus;
881 cl_uint numDevices;
882
883 clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
884 sizeof(numDevices), &numDevices, nullptr);
885 CHECK_OPENCL(clStatus, "clGetProgramInfo");
886
887 std::vector<cl_device_id> mpArryDevsID(numDevices);
888
889 /* grab the handles to all of the devices in the program. */
890 clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
891 sizeof(cl_device_id) * numDevices,
892 &mpArryDevsID[0],
893 nullptr);
894 CHECK_OPENCL(clStatus, "clGetProgramInfo");
895
896 /* figure out the sizes of each of the binaries. */
897 std::vector<size_t> binarySizes(numDevices);
898
899 clStatus =
900 clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
901 sizeof(size_t) * numDevices, &binarySizes[0], nullptr);
902 CHECK_OPENCL(clStatus, "clGetProgramInfo");
903
904 /* copy over all of the generated binaries. */
905 std::vector<char*> binaries(numDevices);
906
907 for (i = 0; i < numDevices; i++) {
908 if (binarySizes[i] != 0) {
909 binaries[i] = new char[binarySizes[i]];
910 } else {
911 binaries[i] = nullptr;
912 }
913 }
914
915 clStatus =
916 clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char*) * numDevices,
917 &binaries[0], nullptr);
918 CHECK_OPENCL(clStatus, "clGetProgramInfo");
919
920 /* dump out each binary into its own separate file. */
921 for (i = 0; i < numDevices; i++) {
922 char fileName[256] = {0}, cl_name[128] = {0};
923
924 if (binarySizes[i] != 0) {
925 char deviceName[1024];
926 clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
927 sizeof(deviceName), deviceName, nullptr);
928 CHECK_OPENCL(clStatus, "clGetDeviceInfo");
929
930 const char* str = strstr(clFileName, ".cl");
931 memcpy(cl_name, clFileName, str - clFileName);
932 cl_name[str - clFileName] = '\0';
933 sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
934 legalizeFileName(fileName);
935 if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
936 tprintf("[OD] write binary[%s] failed\n", fileName);
937 return 0;
938 } // else
939 tprintf("[OD] write binary[%s] successfully\n", fileName);
940 }
941 }
942
943 // Release all resources and memory
944 for (i = 0; i < numDevices; i++) {
945 delete[] binaries[i];
946 }
947
948 return 1;
949}
950
951int OpenclDevice::CompileKernelFile(GPUEnv* gpuInfo, const char* buildOption) {
952 cl_int clStatus = 0;
953 const char* source;
954 size_t source_size[1];
955 int binary_status, binaryExisted, idx;
956 cl_uint numDevices;
957 FILE *fd, *fd1;
958 const char* filename = "kernel.cl";
959 // fprintf(stderr, "[OD] CompileKernelFile ... \n");
960 if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
961 return 1;
962 }
963
964 idx = gpuInfo->mnFileCount;
965
966 source = kernel_src;
967
968 source_size[0] = strlen(source);
969 binaryExisted = 0;
970 binaryExisted = BinaryGenerated(
971 filename, &fd); // don't check for binary during microbenchmark
972 if (binaryExisted == 1) {
973 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
974 sizeof(numDevices), &numDevices, nullptr);
975 CHECK_OPENCL(clStatus, "clGetContextInfo");
976
977 std::vector<cl_device_id> mpArryDevsID(numDevices);
978 bool b_error = fseek(fd, 0, SEEK_END) < 0;
979 auto pos = std::ftell(fd);
980 b_error |= (pos <= 0);
981 size_t length = pos;
982 b_error |= fseek(fd, 0, SEEK_SET) < 0;
983 if (b_error) {
984 fclose(fd);
985 return 0;
986 }
987
988 std::vector<uint8_t> binary(length + 2);
989
990 memset(&binary[0], 0, length + 2);
991 b_error |= fread(&binary[0], 1, length, fd) != length;
992
993 fclose(fd);
994 fd = nullptr;
995 // grab the handles to all of the devices in the context.
996 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
997 sizeof(cl_device_id) * numDevices,
998 &mpArryDevsID[0], nullptr);
999 CHECK_OPENCL(clStatus, "clGetContextInfo");
1000 // fprintf(stderr, "[OD] Create kernel from binary\n");
1001 const uint8_t* c_binary = &binary[0];
1002 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary(
1003 gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length, &c_binary,
1004 &binary_status, &clStatus);
1005 CHECK_OPENCL(clStatus, "clCreateProgramWithBinary");
1006 } else {
1007 // create a CL program using the kernel source
1008 // fprintf(stderr, "[OD] Create kernel from source\n");
1009 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource(
1010 gpuInfo->mpContext, 1, &source, source_size, &clStatus);
1011 CHECK_OPENCL(clStatus, "clCreateProgramWithSource");
1012 }
1013
1014 if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) {
1015 return 0;
1016 }
1017
1018 // char options[512];
1019 // create a cl program executable for all the devices specified
1020 // tprintf("[OD] BuildProgram.\n");
1021 if (!gpuInfo->mnIsUserCreated) {
1022 clStatus =
1023 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
1024 buildOption, nullptr, nullptr);
1025 } else {
1026 clStatus =
1027 clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
1028 buildOption, nullptr, nullptr);
1029 }
1030 if (clStatus != CL_SUCCESS) {
1031 tprintf("BuildProgram error!\n");
1032 size_t length;
1033 if (!gpuInfo->mnIsUserCreated) {
1034 clStatus = clGetProgramBuildInfo(
1035 gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1036 CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1037 } else {
1038 clStatus =
1039 clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1040 CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1041 }
1042 if (clStatus != CL_SUCCESS) {
1043 tprintf("opencl create build log fail\n");
1044 return 0;
1045 }
1046 std::vector<char> buildLog(length);
1047 if (!gpuInfo->mnIsUserCreated) {
1048 clStatus = clGetProgramBuildInfo(
1049 gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1050 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1051 } else {
1052 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
1053 gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG,
1054 length, &buildLog[0], &length);
1055 }
1056 if (clStatus != CL_SUCCESS) {
1057 tprintf("opencl program build info fail\n");
1058 return 0;
1059 }
1060
1061 fd1 = fopen("kernel-build.log", "w+");
1062 if (fd1 != nullptr) {
1063 fwrite(&buildLog[0], sizeof(char), length, fd1);
1064 fclose(fd1);
1065 }
1066
1067 return 0;
1068 }
1069
1070 strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1071 if (binaryExisted == 0) {
1072 GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1073 }
1074
1075 gpuInfo->mnFileCount += 1;
1076 return 1;
1077}
1078
1079l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32* tiffdata, l_int32 w,
1080 l_int32 h, l_int32 wpl,
1081 l_uint32* line) {
1082 cl_int clStatus;
1083 KernelEnv rEnv;
1084 size_t globalThreads[2];
1085 size_t localThreads[2];
1086 int gsize;
1087 cl_mem valuesCl;
1088 cl_mem outputCl;
1089
1090 // global and local work dimensions for Horizontal pass
1091 gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1092 globalThreads[0] = gsize;
1093 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1094 globalThreads[1] = gsize;
1095 localThreads[0] = GROUPSIZE_X;
1096 localThreads[1] = GROUPSIZE_Y;
1097
1098 SetKernelEnv(&rEnv);
1099
1100 l_uint32* pResult = (l_uint32*)malloc(w * h * sizeof(l_uint32));
1101 rEnv.mpkKernel =
1102 clCreateKernel(rEnv.mpkProgram, "composeRGBPixel", &clStatus);
1103 CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1104
1105 // Allocate input and output OCL buffers
1106 valuesCl = allocateZeroCopyBuffer(
1107 rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1108 outputCl = allocateZeroCopyBuffer(
1109 rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1110
1111 // Kernel arguments
1112 clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1113 CHECK_OPENCL(clStatus, "clSetKernelArg");
1114 clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1115 CHECK_OPENCL(clStatus, "clSetKernelArg");
1116 clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1117 CHECK_OPENCL(clStatus, "clSetKernelArg");
1118 clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1119 CHECK_OPENCL(clStatus, "clSetKernelArg");
1120 clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1121 CHECK_OPENCL(clStatus, "clSetKernelArg");
1122
1123 // Kernel enqueue
1124 clStatus =
1125 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1126 globalThreads, localThreads, 0, nullptr, nullptr);
1127 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1128
1129 /* map results back from gpu */
1130 void* ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE,
1131 CL_MAP_READ, 0, w * h * sizeof(l_uint32), 0,
1132 nullptr, nullptr, &clStatus);
1133 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1134 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr);
1135
1136 // Sync
1137 clFinish(rEnv.mpkCmdQueue);
1138 return pResult;
1139}
1140
1141// Morphology Dilate operation for 5x5 structuring element. Invokes the relevant
1142// OpenCL kernels
1143static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1144 size_t globalThreads[2];
1145 cl_mem pixtemp;
1146 cl_int status;
1147 int gsize;
1148 size_t localThreads[2];
1149
1150 // Horizontal pass
1151 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1152 globalThreads[0] = gsize;
1153 globalThreads[1] = GROUPSIZE_HMORY;
1154 localThreads[0] = GROUPSIZE_HMORX;
1155 localThreads[1] = GROUPSIZE_HMORY;
1156
1157 rEnv.mpkKernel =
1158 clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_5x5", &status);
1159 CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5");
1160
1161 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1162 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1163 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1164 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1165
1166 status =
1167 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1168 globalThreads, localThreads, 0, nullptr, nullptr);
1169
1170 // Swap source and dest buffers
1171 pixtemp = pixsCLBuffer;
1172 pixsCLBuffer = pixdCLBuffer;
1173 pixdCLBuffer = pixtemp;
1174
1175 // Vertical
1176 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1177 globalThreads[0] = gsize;
1178 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1179 globalThreads[1] = gsize;
1180 localThreads[0] = GROUPSIZE_X;
1181 localThreads[1] = GROUPSIZE_Y;
1182
1183 rEnv.mpkKernel =
1184 clCreateKernel(rEnv.mpkProgram, "morphoDilateVer_5x5", &status);
1185 CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5");
1186
1187 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1188 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1189 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1190 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1191 status =
1192 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1193 globalThreads, localThreads, 0, nullptr, nullptr);
1194
1195 return status;
1196}
1197
1198// Morphology Erode operation for 5x5 structuring element. Invokes the relevant
1199// OpenCL kernels
1200static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1201 size_t globalThreads[2];
1202 cl_mem pixtemp;
1203 cl_int status;
1204 int gsize;
1205 l_uint32 fwmask, lwmask;
1206 size_t localThreads[2];
1207
1208 lwmask = lmask32[31 - 2];
1209 fwmask = rmask32[31 - 2];
1210
1211 // Horizontal pass
1212 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1213 globalThreads[0] = gsize;
1214 globalThreads[1] = GROUPSIZE_HMORY;
1215 localThreads[0] = GROUPSIZE_HMORX;
1216 localThreads[1] = GROUPSIZE_HMORY;
1217
1218 rEnv.mpkKernel =
1219 clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_5x5", &status);
1220 CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5");
1221
1222 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1223 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1224 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1225 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1226
1227 status =
1228 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1229 globalThreads, localThreads, 0, nullptr, nullptr);
1230
1231 // Swap source and dest buffers
1232 pixtemp = pixsCLBuffer;
1233 pixsCLBuffer = pixdCLBuffer;
1234 pixdCLBuffer = pixtemp;
1235
1236 // Vertical
1237 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1238 globalThreads[0] = gsize;
1239 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1240 globalThreads[1] = gsize;
1241 localThreads[0] = GROUPSIZE_X;
1242 localThreads[1] = GROUPSIZE_Y;
1243
1244 rEnv.mpkKernel =
1245 clCreateKernel(rEnv.mpkProgram, "morphoErodeVer_5x5", &status);
1246 CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5");
1247
1248 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1249 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1250 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1251 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1252 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask);
1253 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask);
1254 status =
1255 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1256 globalThreads, localThreads, 0, nullptr, nullptr);
1257
1258 return status;
1259}
1260
1261// Morphology Dilate operation. Invokes the relevant OpenCL kernels
1262static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl,
1263 l_int32 h) {
1264 l_int32 xp, yp, xn, yn;
1265 SEL* sel;
1266 size_t globalThreads[2];
1267 cl_mem pixtemp;
1268 cl_int status = 0;
1269 int gsize;
1270 size_t localThreads[2];
1271 char isEven;
1272
1273 OpenclDevice::SetKernelEnv(&rEnv);
1274
1275 if (hsize == 5 && vsize == 5) {
1276 // Specific case for 5x5
1277 status = pixDilateCL_55(wpl, h);
1278 return status;
1279 }
1280
1281 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1282
1283 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1284 selDestroy(&sel);
1285 // global and local work dimensions for Horizontal pass
1286 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1287 globalThreads[0] = gsize;
1288 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1289 globalThreads[1] = gsize;
1290 localThreads[0] = GROUPSIZE_X;
1291 localThreads[1] = GROUPSIZE_Y;
1292
1293 if (xp > 31 || xn > 31) {
1294 // Generic case.
1295 rEnv.mpkKernel =
1296 clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status);
1297 CHECK_OPENCL(status, "clCreateKernel morphoDilateHor");
1298
1299 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1300 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1301 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1302 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1303 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1304 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1305 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1306 nullptr, globalThreads, localThreads, 0,
1307 nullptr, nullptr);
1308
1309 if (yp > 0 || yn > 0) {
1310 pixtemp = pixsCLBuffer;
1311 pixsCLBuffer = pixdCLBuffer;
1312 pixdCLBuffer = pixtemp;
1313 }
1314 } else if (xp > 0 || xn > 0) {
1315 // Specific Horizontal pass kernel for half width < 32
1316 rEnv.mpkKernel =
1317 clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status);
1318 CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word");
1319 isEven = (xp != xn);
1320
1321 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1322 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1323 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1324 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1325 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1326 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven);
1327 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1328 nullptr, globalThreads, localThreads, 0,
1329 nullptr, nullptr);
1330
1331 if (yp > 0 || yn > 0) {
1332 pixtemp = pixsCLBuffer;
1333 pixsCLBuffer = pixdCLBuffer;
1334 pixdCLBuffer = pixtemp;
1335 }
1336 }
1337
1338 if (yp > 0 || yn > 0) {
1339 rEnv.mpkKernel =
1340 clCreateKernel(rEnv.mpkProgram, "morphoDilateVer", &status);
1341 CHECK_OPENCL(status, "clCreateKernel morphoDilateVer");
1342
1343 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1344 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1345 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1346 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1347 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1348 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn);
1349 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1350 nullptr, globalThreads, localThreads, 0,
1351 nullptr, nullptr);
1352 }
1353
1354 return status;
1355}
1356
1357// Morphology Erode operation. Invokes the relevant OpenCL kernels
1358static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl,
1359 l_uint32 h) {
1360 l_int32 xp, yp, xn, yn;
1361 SEL* sel;
1362 size_t globalThreads[2];
1363 size_t localThreads[2];
1364 cl_mem pixtemp;
1365 cl_int status = 0;
1366 int gsize;
1367 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1368 l_uint32 rwmask, lwmask;
1369 char isEven;
1370
1371 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1372
1373 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1374 selDestroy(&sel);
1375 OpenclDevice::SetKernelEnv(&rEnv);
1376
1377 if (hsize == 5 && vsize == 5 && isAsymmetric) {
1378 // Specific kernel for 5x5
1379 status = pixErodeCL_55(wpl, h);
1380 return status;
1381 }
1382
1383 lwmask = lmask32[31 - (xn & 31)];
1384 rwmask = rmask32[31 - (xp & 31)];
1385
1386 // global and local work dimensions for Horizontal pass
1387 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1388 globalThreads[0] = gsize;
1389 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1390 globalThreads[1] = gsize;
1391 localThreads[0] = GROUPSIZE_X;
1392 localThreads[1] = GROUPSIZE_Y;
1393
1394 // Horizontal Pass
1395 if (xp > 31 || xn > 31) {
1396 // Generic case.
1397 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status);
1398
1399 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1400 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1401 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1402 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1403 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1404 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1405 status =
1406 clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric);
1407 status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask);
1408 status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask);
1409 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1410 nullptr, globalThreads, localThreads, 0,
1411 nullptr, nullptr);
1412
1413 if (yp > 0 || yn > 0) {
1414 pixtemp = pixsCLBuffer;
1415 pixsCLBuffer = pixdCLBuffer;
1416 pixdCLBuffer = pixtemp;
1417 }
1418 } else if (xp > 0 || xn > 0) {
1419 rEnv.mpkKernel =
1420 clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status);
1421 isEven = (xp != xn);
1422
1423 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1424 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1425 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1426 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1427 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1428 status =
1429 clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1430 status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask);
1431 status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask);
1432 status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven);
1433 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1434 nullptr, globalThreads, localThreads, 0,
1435 nullptr, nullptr);
1436
1437 if (yp > 0 || yn > 0) {
1438 pixtemp = pixsCLBuffer;
1439 pixsCLBuffer = pixdCLBuffer;
1440 pixdCLBuffer = pixtemp;
1441 }
1442 }
1443
1444 // Vertical Pass
1445 if (yp > 0 || yn > 0) {
1446 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status);
1447 CHECK_OPENCL(status, "clCreateKernel morphoErodeVer");
1448
1449 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1450 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1451 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1452 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1453 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1454 status =
1455 clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1456 status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn);
1457 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1458 nullptr, globalThreads, localThreads, 0,
1459 nullptr, nullptr);
1460 }
1461
1462 return status;
1463}
1464
1465// Morphology Open operation. Invokes the relevant OpenCL kernels
1466static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1467 cl_int status;
1468 cl_mem pixtemp;
1469
1470 // Erode followed by Dilate
1471 status = pixErodeCL(hsize, vsize, wpl, h);
1472
1473 pixtemp = pixsCLBuffer;
1474 pixsCLBuffer = pixdCLBuffer;
1475 pixdCLBuffer = pixtemp;
1476
1477 status = pixDilateCL(hsize, vsize, wpl, h);
1478
1479 return status;
1480}
1481
1482// Morphology Close operation. Invokes the relevant OpenCL kernels
1483static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1484 cl_int status;
1485 cl_mem pixtemp;
1486
1487 // Dilate followed by Erode
1488 status = pixDilateCL(hsize, vsize, wpl, h);
1489
1490 pixtemp = pixsCLBuffer;
1491 pixsCLBuffer = pixdCLBuffer;
1492 pixdCLBuffer = pixtemp;
1493
1494 status = pixErodeCL(hsize, vsize, wpl, h);
1495
1496 return status;
1497}
1498
1499// output = buffer1 & ~(buffer2)
1500static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1501 cl_mem buffer2) {
1502 cl_int status;
1503 size_t globalThreads[2];
1504 int gsize;
1505 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1506
1507 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1508 globalThreads[0] = gsize;
1509 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1510 globalThreads[1] = gsize;
1511
1512 rEnv.mpkKernel =
1513 clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status);
1514 CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace");
1515
1516 // Enqueue a kernel run call.
1517 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1);
1518 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2);
1519 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1520 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1521 status =
1522 clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1523 globalThreads, localThreads, 0, nullptr, nullptr);
1524
1525 return status;
1526}
1527
1528// OpenCL implementation of Get Lines from pix function
1529// Note: Assumes the source and dest opencl buffer are initialized. No check
1530// done
1531void OpenclDevice::pixGetLinesCL(Pix* pixd, Pix* pixs, Pix** pix_vline,
1532 Pix** pix_hline, Pix** pixClosed,
1533 bool getpixClosed, l_int32 close_hsize,
1534 l_int32 close_vsize, l_int32 open_hsize,
1535 l_int32 open_vsize, l_int32 line_hsize,
1536 l_int32 line_vsize) {
1537 l_uint32 wpl, h;
1538 cl_mem pixtemp;
1539
1540 wpl = pixGetWpl(pixs);
1541 h = pixGetHeight(pixs);
1542
1543 // First step : Close Morph operation: Dilate followed by Erode
1544 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1545
1546 // Copy the Close output to CPU buffer
1547 if (getpixClosed) {
1548 *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
1549 wpl * h, CL_MAP_READ, true, false);
1550 }
1551
1552 // Store the output of close operation in an intermediate buffer
1553 // this will be later used for pixsubtract
1554 clStatus =
1555 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1556 0, sizeof(int) * wpl * h, 0, nullptr, nullptr);
1557
1558 // Second step: Open Operation - Erode followed by Dilate
1559 pixtemp = pixsCLBuffer;
1560 pixsCLBuffer = pixdCLBuffer;
1561 pixdCLBuffer = pixtemp;
1562
1563 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1564
1565 // Third step: Subtract : (Close - Open)
1566 pixtemp = pixsCLBuffer;
1567 pixsCLBuffer = pixdCLBuffer;
1568 pixdCLBuffer = pixdCLIntermediate;
1569 pixdCLIntermediate = pixtemp;
1570
1571 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1572
1573 // Store the output of Hollow operation in an intermediate buffer
1574 // this will be later used
1575 clStatus =
1576 clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1577 0, sizeof(int) * wpl * h, 0, nullptr, nullptr);
1578
1579 pixtemp = pixsCLBuffer;
1580 pixsCLBuffer = pixdCLBuffer;
1581 pixdCLBuffer = pixtemp;
1582
1583 // Fourth step: Get vertical line
1584 // pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
1585 clStatus = pixOpenCL(1, line_vsize, wpl, h);
1586
1587 // Copy the vertical line output to CPU buffer
1588 *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
1589 CL_MAP_READ, true, false);
1590
1591 pixtemp = pixsCLBuffer;
1592 pixsCLBuffer = pixdCLIntermediate;
1593 pixdCLIntermediate = pixtemp;
1594
1595 // Fifth step: Get horizontal line
1596 // pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
1597 clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1598
1599 // Copy the horizontal line output to CPU buffer
1600 *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
1601 CL_MAP_READ, true, true);
1602
1603 return;
1604}
1605
1606/*************************************************************************
1607 * HistogramRect
1608 * Otsu Thresholding Operations
1609 * histogramAllChannels is laid out as all channel 0, then all channel 1...
1610 * only supports 1 or 4 channels (bytes_per_pixel)
1611 ************************************************************************/
1612int OpenclDevice::HistogramRectOCL(void* imageData,
1613 int bytes_per_pixel, int bytes_per_line,
1614 int left, // always 0
1615 int top, // always 0
1616 int width, int height, int kHistogramSize,
1617 int* histogramAllChannels) {
1618 cl_int clStatus;
1619 int retVal = 0;
1620 KernelEnv histKern;
1621 SetKernelEnv(&histKern);
1622 KernelEnv histRedKern;
1623 SetKernelEnv(&histRedKern);
1624 /* map imagedata to device as read only */
1625 // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1626 // coherent which we don't need.
1627 // faster option would be to allocate initial image buffer
1628 // using a garlic bus memory type
1629 cl_mem imageBuffer = clCreateBuffer(
1630 histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1631 width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1632 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1633
1634 /* setup work group size parameters */
1635 int block_size = 256;
1636 cl_uint numCUs;
1637 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1638 sizeof(numCUs), &numCUs, nullptr);
1639 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1640
1641 int requestedOccupancy = 10;
1642 int numWorkGroups = numCUs * requestedOccupancy;
1643 int numThreads = block_size * numWorkGroups;
1644 size_t local_work_size[] = {static_cast<size_t>(block_size)};
1645 size_t global_work_size[] = {static_cast<size_t>(numThreads)};
1646 size_t red_global_work_size[] = {
1647 static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
1648
1649 /* map histogramAllChannels as write only */
1650
1651 cl_mem histogramBuffer = clCreateBuffer(
1652 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1653 kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels,
1654 &clStatus);
1655 CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer");
1656
1657 /* intermediate histogram buffer */
1658 int histRed = 256;
1659 int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed;
1660
1661 cl_mem tmpHistogramBuffer =
1662 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
1663 tmpHistogramBins * sizeof(cl_uint), nullptr, &clStatus);
1664 CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer");
1665
1666 /* atomic sync buffer */
1667 int* zeroBuffer = new int[1];
1668 zeroBuffer[0] = 0;
1669 cl_mem atomicSyncBuffer = clCreateBuffer(
1670 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1671 sizeof(cl_int), zeroBuffer, &clStatus);
1672 CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer");
1673 delete[] zeroBuffer;
1674 // Create kernel objects based on bytes_per_pixel
1675 if (bytes_per_pixel == 1) {
1676 histKern.mpkKernel = clCreateKernel(
1677 histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus);
1678 CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
1679
1680 histRedKern.mpkKernel =
1681 clCreateKernel(histRedKern.mpkProgram,
1682 "kernel_HistogramRectOneChannelReduction", &clStatus);
1683 CHECK_OPENCL(clStatus,
1684 "clCreateKernel kernel_HistogramRectOneChannelReduction");
1685 } else {
1686 histKern.mpkKernel = clCreateKernel(
1687 histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus);
1688 CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
1689
1690 histRedKern.mpkKernel =
1691 clCreateKernel(histRedKern.mpkProgram,
1692 "kernel_HistogramRectAllChannelsReduction", &clStatus);
1693 CHECK_OPENCL(clStatus,
1694 "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1695 }
1696
1697 void* ptr;
1698
1699 // Initialize tmpHistogramBuffer buffer
1700 ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE,
1701 CL_MAP_WRITE, 0, tmpHistogramBins * sizeof(cl_uint),
1702 0, nullptr, nullptr, &clStatus);
1703 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
1704
1705 memset(ptr, 0, tmpHistogramBins * sizeof(cl_uint));
1706 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
1707 nullptr, nullptr);
1708
1709 /* set kernel 1 arguments */
1710 clStatus =
1711 clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1712 CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1713 cl_uint numPixels = width * height;
1714 clStatus = clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
1715 CHECK_OPENCL(clStatus, "clSetKernelArg numPixels");
1716 clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem),
1717 &tmpHistogramBuffer);
1718 CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1719
1720 /* set kernel 2 arguments */
1721 int n = numThreads / bytes_per_pixel;
1722 clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n);
1723 CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1724 clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem),
1725 &tmpHistogramBuffer);
1726 CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1727 clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem),
1728 &histogramBuffer);
1729 CHECK_OPENCL(clStatus, "clSetKernelArg histogramBuffer");
1730
1731 /* launch histogram */
1732 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
1733 nullptr, global_work_size, local_work_size,
1734 0, nullptr, nullptr);
1735 CHECK_OPENCL(clStatus,
1736 "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1737 clFinish(histKern.mpkCmdQueue);
1738 if (clStatus != 0) {
1739 retVal = -1;
1740 }
1741 /* launch histogram */
1742 clStatus = clEnqueueNDRangeKernel(
1743 histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr,
1744 red_global_work_size, local_work_size, 0, nullptr, nullptr);
1745 CHECK_OPENCL(
1746 clStatus,
1747 "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1748 clFinish(histRedKern.mpkCmdQueue);
1749 if (clStatus != 0) {
1750 retVal = -1;
1751 }
1752
1753 /* map results back from gpu */
1754 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
1755 CL_MAP_READ, 0,
1756 kHistogramSize * bytes_per_pixel * sizeof(int), 0,
1757 nullptr, nullptr, &clStatus);
1758 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1759 if (clStatus != 0) {
1760 retVal = -1;
1761 }
1762 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
1763 nullptr, nullptr);
1764
1765 clReleaseMemObject(histogramBuffer);
1766 clReleaseMemObject(imageBuffer);
1767 return retVal;
1768}
1769
1770/*************************************************************************
1771 * Threshold the rectangle, taking everything except the image buffer pointer
1772 * from the class, using thresholds/hi_values to the output IMAGE.
1773 * only supports 1 or 4 channels
1774 ************************************************************************/
1775int OpenclDevice::ThresholdRectToPixOCL(unsigned char* imageData,
1776 int bytes_per_pixel, int bytes_per_line,
1777 int* thresholds, int* hi_values,
1778 Pix** pix, int height, int width,
1779 int top, int left) {
1780 int retVal = 0;
1781 /* create pix result buffer */
1782 *pix = pixCreate(width, height, 1);
1783 uint32_t* pixData = pixGetData(*pix);
1784 int wpl = pixGetWpl(*pix);
1785 int pixSize = wpl * height * sizeof(uint32_t); // number of pixels
1786
1787 cl_int clStatus;
1788 KernelEnv rEnv;
1789 SetKernelEnv(&rEnv);
1790
1791 /* setup work group size parameters */
1792 int block_size = 256;
1793 cl_uint numCUs = 6;
1794 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1795 sizeof(numCUs), &numCUs, nullptr);
1796 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1797
1798 int requestedOccupancy = 10;
1799 int numWorkGroups = numCUs * requestedOccupancy;
1800 int numThreads = block_size * numWorkGroups;
1801 size_t local_work_size[] = {(size_t)block_size};
1802 size_t global_work_size[] = {(size_t)numThreads};
1803
1804 /* map imagedata to device as read only */
1805 // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1806 // coherent which we don't need.
1807 // faster option would be to allocate initial image buffer
1808 // using a garlic bus memory type
1809 cl_mem imageBuffer = clCreateBuffer(
1810 rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1811 width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1812 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1813
1814 /* map pix as write only */
1815 pixThBuffer =
1816 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1817 pixSize, pixData, &clStatus);
1818 CHECK_OPENCL(clStatus, "clCreateBuffer pix");
1819
1820 /* map thresholds and hi_values */
1821 cl_mem thresholdsBuffer =
1822 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1823 bytes_per_pixel * sizeof(int), thresholds, &clStatus);
1824 CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer");
1825 cl_mem hiValuesBuffer =
1826 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1827 bytes_per_pixel * sizeof(int), hi_values, &clStatus);
1828 CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer");
1829
1830 /* compile kernel */
1831 if (bytes_per_pixel == 4) {
1832 rEnv.mpkKernel =
1833 clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus);
1834 CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix");
1835 } else {
1836 rEnv.mpkKernel = clCreateKernel(
1837 rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus);
1838 CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
1839 }
1840
1841 /* set kernel arguments */
1842 clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1843 CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1844 clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height);
1845 CHECK_OPENCL(clStatus, "clSetKernelArg height");
1846 clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width);
1847 CHECK_OPENCL(clStatus, "clSetKernelArg width");
1848 clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl);
1849 CHECK_OPENCL(clStatus, "clSetKernelArg wpl");
1850 clStatus =
1851 clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer);
1852 CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer");
1853 clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer);
1854 CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer");
1855 clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer);
1856 CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer");
1857
1858 /* launch kernel & wait */
1859 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
1860 nullptr, global_work_size, local_work_size,
1861 0, nullptr, nullptr);
1862 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1863 clFinish(rEnv.mpkCmdQueue);
1864 if (clStatus != 0) {
1865 tprintf("Setting return value to -1\n");
1866 retVal = -1;
1867 }
1868 /* map results back from gpu */
1869 void* ptr =
1870 clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
1871 pixSize, 0, nullptr, nullptr, &clStatus);
1872 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1873 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, nullptr,
1874 nullptr);
1875
1876 clReleaseMemObject(imageBuffer);
1877 clReleaseMemObject(thresholdsBuffer);
1878 clReleaseMemObject(hiValuesBuffer);
1879
1880 return retVal;
1881}
1882
1883/******************************************************************************
1884 * Data Types for Device Selection
1885 *****************************************************************************/
1886
1887struct TessScoreEvaluationInputData {
1888 int height;
1889 int width;
1890 int numChannels;
1891 unsigned char* imageData;
1892 Pix* pix;
1893};
1894
1895static void populateTessScoreEvaluationInputData(
1896 TessScoreEvaluationInputData* input) {
1897 srand(1);
1898 // 8.5x11 inches @ 300dpi rounded to clean multiples
1899 int height = 3328; // %256
1900 int width = 2560; // %512
1901 int numChannels = 4;
1902 input->height = height;
1903 input->width = width;
1904 input->numChannels = numChannels;
1905 unsigned char(*imageData4)[4] = (unsigned char(*)[4])malloc(
1906 height * width * numChannels *
1907 sizeof(unsigned char)); // new unsigned char[4][height*width];
1908 input->imageData = (unsigned char*)&imageData4[0];
1909
1910 // zero out image
1911 unsigned char pixelWhite[4] = {0, 0, 0, 255};
1912 unsigned char pixelBlack[4] = {255, 255, 255, 255};
1913 for (int p = 0; p < height * width; p++) {
1914 // unsigned char tmp[4] = imageData4[0];
1915 imageData4[p][0] = pixelWhite[0];
1916 imageData4[p][1] = pixelWhite[1];
1917 imageData4[p][2] = pixelWhite[2];
1918 imageData4[p][3] = pixelWhite[3];
1919 }
1920 // random lines to be eliminated
1921 int maxLineWidth = 64; // pixels wide
1922 int numLines = 10;
1923 // vertical lines
1924 for (int i = 0; i < numLines; i++) {
1925 int lineWidth = rand() % maxLineWidth;
1926 int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1927 // tprintf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
1928 for (int row = vertLinePos - lineWidth / 2;
1929 row < vertLinePos + lineWidth / 2; row++) {
1930 for (int col = 0; col < height; col++) {
1931 // imageData4[row*width+col] = pixelBlack;
1932 imageData4[row * width + col][0] = pixelBlack[0];
1933 imageData4[row * width + col][1] = pixelBlack[1];
1934 imageData4[row * width + col][2] = pixelBlack[2];
1935 imageData4[row * width + col][3] = pixelBlack[3];
1936 }
1937 }
1938 }
1939 // horizontal lines
1940 for (int i = 0; i < numLines; i++) {
1941 int lineWidth = rand() % maxLineWidth;
1942 int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1943 // tprintf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
1944 for (int row = 0; row < width; row++) {
1945 for (int col = horLinePos - lineWidth / 2;
1946 col < horLinePos + lineWidth / 2;
1947 col++) { // for (int row = vertLinePos-lineWidth/2; row <
1948 // vertLinePos+lineWidth/2; row++) {
1949 // tprintf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
1950 // imageData4[row*width+col] = pixelBlack;
1951 imageData4[row * width + col][0] = pixelBlack[0];
1952 imageData4[row * width + col][1] = pixelBlack[1];
1953 imageData4[row * width + col][2] = pixelBlack[2];
1954 imageData4[row * width + col][3] = pixelBlack[3];
1955 }
1956 }
1957 }
1958 // spots (noise, squares)
1959 float fractionBlack = 0.1; // how much of the image should be blackened
1960 int numSpots =
1961 (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1962 for (int i = 0; i < numSpots; i++) {
1963 int lineWidth = rand() % maxLineWidth;
1964 int col = lineWidth + rand() % (width - 2 * lineWidth);
1965 int row = lineWidth + rand() % (height - 2 * lineWidth);
1966 // tprintf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
1967 for (int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1968 for (int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
1969 // tprintf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
1970 // imageData4[row*width+col] = pixelBlack;
1971 imageData4[r * width + c][0] = pixelBlack[0];
1972 imageData4[r * width + c][1] = pixelBlack[1];
1973 imageData4[r * width + c][2] = pixelBlack[2];
1974 imageData4[r * width + c][3] = pixelBlack[3];
1975 }
1976 }
1977 }
1978
1979 input->pix = pixCreate(input->width, input->height, 8 * input->numChannels);
1980}
1981
1982struct TessDeviceScore {
1983 float time; // small time means faster device
1984 bool clError; // were there any opencl errors
1985 bool valid; // was the correct response generated
1986};
1987
1988/******************************************************************************
1989 * Micro Benchmarks for Device Selection
1990 *****************************************************************************/
1991
1992static double composeRGBPixelMicroBench(GPUEnv* env,
1993 TessScoreEvaluationInputData input,
1994 ds_device_type type) {
1995 double time = 0;
1996#if ON_WINDOWS
1997 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1998 QueryPerformanceFrequency(&freq);
1999#elif ON_APPLE
2000 mach_timebase_info_data_t info = {0, 0};
2001 mach_timebase_info(&info);
2002 long long start, stop;
2003#else
2004 timespec time_funct_start, time_funct_end;
2005#endif
2006 // input data
2007 l_uint32* tiffdata =
2008 (l_uint32*)input.imageData; // same size and random data; data doesn't
2009 // change workload
2010
2011 // function call
2012 if (type == DS_DEVICE_OPENCL_DEVICE) {
2013#if ON_WINDOWS
2014 QueryPerformanceCounter(&time_funct_start);
2015#elif ON_APPLE
2016 start = mach_absolute_time();
2017#else
2018 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2019#endif
2020
2021 OpenclDevice::gpuEnv = *env;
2022 int wpl = pixGetWpl(input.pix);
2023 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2024 wpl, nullptr);
2025#if ON_WINDOWS
2026 QueryPerformanceCounter(&time_funct_end);
2027 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2028 (double)(freq.QuadPart);
2029#elif ON_APPLE
2030 stop = mach_absolute_time();
2031 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2032#else
2033 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2034 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2035 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2036#endif
2037
2038 } else {
2039#if ON_WINDOWS
2040 QueryPerformanceCounter(&time_funct_start);
2041#elif ON_APPLE
2042 start = mach_absolute_time();
2043#else
2044 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2045#endif
2046 Pix* pix = pixCreate(input.width, input.height, 32);
2047 l_uint32* pixData = pixGetData(pix);
2048 int i, j;
2049 int idx = 0;
2050 for (i = 0; i < input.height; i++) {
2051 for (j = 0; j < input.width; j++) {
2052 l_uint32 tiffword = tiffdata[i * input.width + j];
2053 l_int32 rval = ((tiffword)&0xff);
2054 l_int32 gval = (((tiffword) >> 8) & 0xff);
2055 l_int32 bval = (((tiffword) >> 16) & 0xff);
2056 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2057 pixData[idx] = value;
2058 idx++;
2059 }
2060 }
2061#if ON_WINDOWS
2062 QueryPerformanceCounter(&time_funct_end);
2063 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2064 (double)(freq.QuadPart);
2065#elif ON_APPLE
2066 stop = mach_absolute_time();
2067 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2068#else
2069 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2070 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2071 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2072#endif
2073 pixDestroy(&pix);
2074 }
2075
2076 return time;
2077}
2078
2079static double histogramRectMicroBench(GPUEnv* env,
2080 TessScoreEvaluationInputData input,
2081 ds_device_type type) {
2082 double time;
2083#if ON_WINDOWS
2084 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2085 QueryPerformanceFrequency(&freq);
2086#elif ON_APPLE
2087 mach_timebase_info_data_t info = {0, 0};
2088 mach_timebase_info(&info);
2089 long long start, stop;
2090#else
2091 timespec time_funct_start, time_funct_end;
2092#endif
2093
2094 const int left = 0;
2095 const int top = 0;
2096 int kHistogramSize = 256;
2097 int bytes_per_line = input.width * input.numChannels;
2098 int* histogramAllChannels = new int[kHistogramSize * input.numChannels];
2099 // function call
2100 if (type == DS_DEVICE_OPENCL_DEVICE) {
2101#if ON_WINDOWS
2102 QueryPerformanceCounter(&time_funct_start);
2103#elif ON_APPLE
2104 start = mach_absolute_time();
2105#else
2106 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2107#endif
2108
2109 OpenclDevice::gpuEnv = *env;
2110 int retVal = OpenclDevice::HistogramRectOCL(
2111 input.imageData, input.numChannels, bytes_per_line, left, top,
2112 input.width, input.height, kHistogramSize, histogramAllChannels);
2113
2114#if ON_WINDOWS
2115 QueryPerformanceCounter(&time_funct_end);
2116 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2117 (double)(freq.QuadPart);
2118#elif ON_APPLE
2119 stop = mach_absolute_time();
2120 if (retVal == 0) {
2121 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2122 } else {
2123 time = FLT_MAX;
2124 }
2125#else
2126 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2127 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2128 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2129#endif
2130 } else {
2131 int* histogram = new int[kHistogramSize];
2132#if ON_WINDOWS
2133 QueryPerformanceCounter(&time_funct_start);
2134#elif ON_APPLE
2135 start = mach_absolute_time();
2136#else
2137 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2138#endif
2139 for (int ch = 0; ch < input.numChannels; ++ch) {
2140 tesseract::HistogramRect(input.pix, input.numChannels, left, top,
2141 input.width, input.height, histogram);
2142 }
2143#if ON_WINDOWS
2144 QueryPerformanceCounter(&time_funct_end);
2145 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2146 (double)(freq.QuadPart);
2147#elif ON_APPLE
2148 stop = mach_absolute_time();
2149 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2150#else
2151 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2152 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2153 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2154#endif
2155 delete[] histogram;
2156 }
2157
2158 // cleanup
2159 delete[] histogramAllChannels;
2160 return time;
2161}
2162
2163// Reproducing the ThresholdRectToPix native version
2164static void ThresholdRectToPix_Native(const unsigned char* imagedata,
2165 int bytes_per_pixel, int bytes_per_line,
2166 const int* thresholds,
2167 const int* hi_values, Pix** pix) {
2168 int top = 0;
2169 int left = 0;
2170 int width = pixGetWidth(*pix);
2171 int height = pixGetHeight(*pix);
2172
2173 *pix = pixCreate(width, height, 1);
2174 uint32_t* pixdata = pixGetData(*pix);
2175 int wpl = pixGetWpl(*pix);
2176 const unsigned char* srcdata =
2177 imagedata + top * bytes_per_line + left * bytes_per_pixel;
2178 for (int y = 0; y < height; ++y) {
2179 const uint8_t* linedata = srcdata;
2180 uint32_t* pixline = pixdata + y * wpl;
2181 for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2182 bool white_result = true;
2183 for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2184 if (hi_values[ch] >= 0 &&
2185 (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2186 white_result = false;
2187 break;
2188 }
2189 }
2190 if (white_result)
2191 CLEAR_DATA_BIT(pixline, x);
2192 else
2193 SET_DATA_BIT(pixline, x);
2194 }
2195 srcdata += bytes_per_line;
2196 }
2197}
2198
2199static double thresholdRectToPixMicroBench(GPUEnv* env,
2200 TessScoreEvaluationInputData input,
2201 ds_device_type type) {
2202 double time;
2203#if ON_WINDOWS
2204 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2205 QueryPerformanceFrequency(&freq);
2206#elif ON_APPLE
2207 mach_timebase_info_data_t info = {0, 0};
2208 mach_timebase_info(&info);
2209 long long start, stop;
2210#else
2211 timespec time_funct_start, time_funct_end;
2212#endif
2213
2214 // input data
2215 unsigned char pixelHi = (unsigned char)255;
2216 int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2217
2218 // Pix* pix = pixCreate(width, height, 1);
2219 int top = 0;
2220 int left = 0;
2221 int bytes_per_line = input.width * input.numChannels;
2222
2223 // function call
2224 if (type == DS_DEVICE_OPENCL_DEVICE) {
2225#if ON_WINDOWS
2226 QueryPerformanceCounter(&time_funct_start);
2227#elif ON_APPLE
2228 start = mach_absolute_time();
2229#else
2230 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2231#endif
2232
2233 OpenclDevice::gpuEnv = *env;
2234 int hi_values[4];
2235 int retVal = OpenclDevice::ThresholdRectToPixOCL(
2236 input.imageData, input.numChannels, bytes_per_line, thresholds,
2237 hi_values, &input.pix, input.height, input.width, top, left);
2238
2239#if ON_WINDOWS
2240 QueryPerformanceCounter(&time_funct_end);
2241 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2242 (double)(freq.QuadPart);
2243#elif ON_APPLE
2244 stop = mach_absolute_time();
2245 if (retVal == 0) {
2246 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2247 } else {
2248 time = FLT_MAX;
2249 }
2250
2251#else
2252 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2253 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2254 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2255#endif
2256 } else {
2257 tesseract::ImageThresholder thresholder;
2258 thresholder.SetImage(input.pix);
2259#if ON_WINDOWS
2260 QueryPerformanceCounter(&time_funct_start);
2261#elif ON_APPLE
2262 start = mach_absolute_time();
2263#else
2264 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2265#endif
2266 int hi_values[4] = {};
2267 ThresholdRectToPix_Native(input.imageData, input.numChannels,
2268 bytes_per_line, thresholds, hi_values,
2269 &input.pix);
2270
2271#if ON_WINDOWS
2272 QueryPerformanceCounter(&time_funct_end);
2273 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2274 (double)(freq.QuadPart);
2275#elif ON_APPLE
2276 stop = mach_absolute_time();
2277 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2278#else
2279 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2280 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2281 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2282#endif
2283 }
2284
2285 return time;
2286}
2287
2288static double getLineMasksMorphMicroBench(GPUEnv* env,
2289 TessScoreEvaluationInputData input,
2290 ds_device_type type) {
2291 double time = 0;
2292#if ON_WINDOWS
2293 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2294 QueryPerformanceFrequency(&freq);
2295#elif ON_APPLE
2296 mach_timebase_info_data_t info = {0, 0};
2297 mach_timebase_info(&info);
2298 long long start, stop;
2299#else
2300 timespec time_funct_start, time_funct_end;
2301#endif
2302
2303 // input data
2304 int resolution = 300;
2305 int wpl = pixGetWpl(input.pix);
2306 int kThinLineFraction = 20; // tess constant
2307 int kMinLineLengthFraction = 4; // tess constant
2308 int max_line_width = resolution / kThinLineFraction;
2309 int min_line_length = resolution / kMinLineLengthFraction;
2310 int closing_brick = max_line_width / 3;
2311
2312 // function call
2313 if (type == DS_DEVICE_OPENCL_DEVICE) {
2314#if ON_WINDOWS
2315 QueryPerformanceCounter(&time_funct_start);
2316#elif ON_APPLE
2317 start = mach_absolute_time();
2318#else
2319 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2320#endif
2321 OpenclDevice::gpuEnv = *env;
2322 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2323 Pix *pix_vline = nullptr, *pix_hline = nullptr, *pix_closed = nullptr;
2324 OpenclDevice::pixGetLinesCL(nullptr, input.pix, &pix_vline, &pix_hline,
2325 &pix_closed, true, closing_brick, closing_brick,
2326 max_line_width, max_line_width, min_line_length,
2327 min_line_length);
2328
2329 OpenclDevice::releaseMorphCLBuffers();
2330
2331#if ON_WINDOWS
2332 QueryPerformanceCounter(&time_funct_end);
2333 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2334 (double)(freq.QuadPart);
2335#elif ON_APPLE
2336 stop = mach_absolute_time();
2337 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2338#else
2339 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2340 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2341 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2342#endif
2343 } else {
2344#if ON_WINDOWS
2345 QueryPerformanceCounter(&time_funct_start);
2346#elif ON_APPLE
2347 start = mach_absolute_time();
2348#else
2349 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2350#endif
2351
2352 // native serial code
2353 Pix* src_pix = input.pix;
2354 Pix* pix_closed =
2355 pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick);
2356 Pix* pix_solid =
2357 pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width);
2358 Pix* pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid);
2359 pixDestroy(&pix_solid);
2360 Pix* pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
2361 Pix* pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
2362 pixDestroy(&pix_hline);
2363 pixDestroy(&pix_vline);
2364 pixDestroy(&pix_hollow);
2365
2366#if ON_WINDOWS
2367 QueryPerformanceCounter(&time_funct_end);
2368 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) /
2369 (double)(freq.QuadPart);
2370#elif ON_APPLE
2371 stop = mach_absolute_time();
2372 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2373#else
2374 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2375 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2376 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2377#endif
2378 }
2379
2380 return time;
2381}
2382
2383/******************************************************************************
2384 * Device Selection
2385 *****************************************************************************/
2386
2387#include <cstdlib>
2388
2389// encode score object as byte string
2390static ds_status serializeScore(ds_device* device, uint8_t** serializedScore,
2391 unsigned int* serializedScoreSize) {
2392 *serializedScoreSize = sizeof(TessDeviceScore);
2393 *serializedScore = new uint8_t[*serializedScoreSize];
2394 memcpy(*serializedScore, device->score, *serializedScoreSize);
2395 return DS_SUCCESS;
2396}
2397
2398// parses byte string and stores in score object
2399static ds_status deserializeScore(ds_device* device,
2400 const uint8_t* serializedScore,
2401 unsigned int serializedScoreSize) {
2402 // check that serializedScoreSize == sizeof(TessDeviceScore);
2403 device->score = new TessDeviceScore;
2404 memcpy(device->score, serializedScore, serializedScoreSize);
2405 return DS_SUCCESS;
2406}
2407
2408static ds_status releaseScore(TessDeviceScore* score) {
2409 delete score;
2410 return DS_SUCCESS;
2411}
2412
2413// evaluate devices
2414static ds_status evaluateScoreForDevice(ds_device* device, void* inputData) {
2415 // overwrite statuc gpuEnv w/ current device
2416 // so native opencl calls can be used; they use static gpuEnv
2417 tprintf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2418 device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2419 GPUEnv* env = nullptr;
2420 if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2421 env = &OpenclDevice::gpuEnv;
2422 memset(env, 0, sizeof(*env));
2423 // tprintf("[DS] populating tmp GPUEnv from device\n");
2424 populateGPUEnvFromDevice(env, device->oclDeviceID);
2425 env->mnFileCount = 0; // argc;
2426 env->mnKernelCount = 0UL;
2427 // tprintf("[DS] compiling kernels for tmp GPUEnv\n");
2428 OpenclDevice::CompileKernelFile(env, "");
2429 }
2430
2431 TessScoreEvaluationInputData* input =
2432 static_cast<TessScoreEvaluationInputData*>(inputData);
2433
2434 // pixReadTiff
2435 double composeRGBPixelTime =
2436 composeRGBPixelMicroBench(env, *input, device->type);
2437
2438 // HistogramRect
2439 double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2440
2441 // ThresholdRectToPix
2442 double thresholdRectToPixTime =
2443 thresholdRectToPixMicroBench(env, *input, device->type);
2444
2445 // getLineMasks
2446 double getLineMasksMorphTime =
2447 getLineMasksMorphMicroBench(env, *input, device->type);
2448
2449 // weigh times (% of cpu time)
2450 // these weights should be the % execution time that the native cpu code took
2451 float composeRGBPixelWeight = 1.2f;
2452 float histogramRectWeight = 2.4f;
2453 float thresholdRectToPixWeight = 4.5f;
2454 float getLineMasksMorphWeight = 5.0f;
2455
2456 float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2457 histogramRectWeight * histogramRectTime +
2458 thresholdRectToPixWeight * thresholdRectToPixTime +
2459 getLineMasksMorphWeight * getLineMasksMorphTime;
2460 device->score = new TessDeviceScore;
2461 device->score->time = weightedTime;
2462
2463 tprintf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2464 device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2465 tprintf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime,
2466 composeRGBPixelWeight);
2467 tprintf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime,
2468 histogramRectWeight);
2469 tprintf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix",
2470 thresholdRectToPixTime, thresholdRectToPixWeight);
2471 tprintf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime,
2472 getLineMasksMorphWeight);
2473 tprintf("[DS]%25s: %f\n", "Score", device->score->time);
2474 return DS_SUCCESS;
2475}
2476
2477// initial call to select device
2478ds_device OpenclDevice::getDeviceSelection() {
2479 if (!deviceIsSelected) {
2480 // check if opencl is available at runtime
2481 if (1 == LoadOpencl()) {
2482 // opencl is available
2483 // setup devices
2484 ds_status status;
2485 ds_profile* profile;
2486 status = initDSProfile(&profile, "v0.1");
2487 // try reading scores from file
2488 const char* fileName = "tesseract_opencl_profile_devices.dat";
2489 status = readProfileFromFile(profile, deserializeScore, fileName);
2490 if (status != DS_SUCCESS) {
2491 // need to run evaluation
2492 tprintf("[DS] Profile file not available (%s); performing profiling.\n",
2493 fileName);
2494
2495 // create input data
2496 TessScoreEvaluationInputData input;
2497 populateTessScoreEvaluationInputData(&input);
2498 // perform evaluations
2499 unsigned int numUpdates;
2500 status = profileDevices(profile, DS_EVALUATE_ALL,
2501 evaluateScoreForDevice, &input, &numUpdates);
2502 // write scores to file
2503 if (status == DS_SUCCESS) {
2504 status = writeProfileToFile(profile, serializeScore, fileName);
2505 if (status == DS_SUCCESS) {
2506 tprintf("[DS] Scores written to file (%s).\n", fileName);
2507 } else {
2508 tprintf(
2509 "[DS] Error saving scores to file (%s); scores not written to "
2510 "file.\n",
2511 fileName);
2512 }
2513 } else {
2514 tprintf(
2515 "[DS] Unable to evaluate performance; scores not written to "
2516 "file.\n");
2517 }
2518 } else {
2519 tprintf("[DS] Profile read from file (%s).\n", fileName);
2520 }
2521
2522 // we now have device scores either from file or evaluation
2523 // select fastest using custom Tesseract selection algorithm
2524 float bestTime = FLT_MAX; // begin search with worst possible time
2525 int bestDeviceIdx = -1;
2526 for (unsigned d = 0; d < profile->numDevices; d++) {
2527 ds_device device = profile->devices[d];
2528 if (device.score == nullptr) continue;
2529 TessDeviceScore score = *device.score;
2530
2531 float time = score.time;
2532 tprintf("[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type,
2533 device.oclDeviceName, time);
2534 if (time < bestTime) {
2535 bestTime = time;
2536 bestDeviceIdx = d;
2537 }
2538 }
2539 if (bestDeviceIdx >= 0) {
2540 tprintf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2541 profile->devices[bestDeviceIdx].oclDeviceName,
2542 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2543 ? "OpenCL"
2544 : "Native");
2545 }
2546 // cleanup
2547 // TODO: call destructor for profile object?
2548
2549 bool overridden = false;
2550 char* overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
2551 if (overrideDeviceStr != nullptr) {
2552 int overrideDeviceIdx = atoi(overrideDeviceStr);
2553 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2554 tprintf(
2555 "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2556 "%i)\n",
2557 overrideDeviceStr, overrideDeviceIdx);
2558 bestDeviceIdx = overrideDeviceIdx - 1;
2559 overridden = true;
2560 } else {
2561 tprintf(
2562 "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2563 "valid devices).\n",
2564 overrideDeviceStr, profile->numDevices);
2565 }
2566 }
2567
2568 if (overridden) {
2569 tprintf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2570 profile->devices[bestDeviceIdx].oclDeviceName,
2571 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2572 ? "OpenCL"
2573 : "Native");
2574 }
2575 selectedDevice = profile->devices[bestDeviceIdx];
2576 // cleanup
2577 releaseDSProfile(profile, releaseScore);
2578 } else {
2579 // opencl isn't available at runtime, select native cpu device
2580 tprintf("[DS] OpenCL runtime not available.\n");
2581 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2582 selectedDevice.oclDeviceName = "(null)";
2583 selectedDevice.score = nullptr;
2584 selectedDevice.oclDeviceID = nullptr;
2585 selectedDevice.oclDriverVersion = nullptr;
2586 }
2587 deviceIsSelected = true;
2588 }
2589 return selectedDevice;
2590}
2591
2592bool OpenclDevice::selectedDeviceIsOpenCL() {
2593 ds_device device = getDeviceSelection();
2594 return (device.type == DS_DEVICE_OPENCL_DEVICE);
2595}
2596
2597#endif
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
const int kHistogramSize
Definition: otsuthr.h:27
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:150
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
Definition: linefind.cpp:43
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
Definition: linefind.cpp:41
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:65