00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00022
00023 #if defined(CL_SIFTGPU_ENABLED)
00024
00025 #include <CL/opencl.h>
00026 #include <GL/glew.h>
00027
00028 #include <iostream>
00029 #include <iomanip>
00030 #include <vector>
00031 #include <strstream>
00032 #include <algorithm>
00033 #include <stdlib.h>
00034 #include <math.h>
00035 #include <string.h>
00036 using namespace std;
00037
00038 #include "GlobalUtil.h"
00039 #include "CLTexImage.h"
00040 #include "ProgramCL.h"
00041 #include "SiftGPU.h"
00042
00043
00044 #if defined(_WIN32)
00045 #pragma comment (lib, "OpenCL.lib")
00046 #endif
00047
00048 #ifndef _INC_WINDOWS
00049 #ifndef WIN32_LEAN_AND_MEAN
00050 #define WIN32_LEAN_AND_MEAN
00051 #endif
00052 #include <windows.h>
00053 #endif
00054
00056
00058
00059 ProgramCL::ProgramCL()
00060 {
00061 _program = NULL;
00062 _kernel = NULL;
00063 _valid = 0;
00064 }
00065
00066 ProgramCL::~ProgramCL()
00067 {
00068 if(_kernel) clReleaseKernel(_kernel);
00069 if(_program) clReleaseProgram(_program);
00070 }
00071
00072 ProgramCL::ProgramCL(const char* name, const char * code, cl_context context, cl_device_id device) : _valid(1)
00073 {
00074 const char * src[1] = {code}; cl_int status;
00075
00076 _program = clCreateProgramWithSource(context, 1, src, NULL, &status);
00077 if(status != CL_SUCCESS) _valid = 0;
00078
00079 status = clBuildProgram(_program, 0, NULL,
00080 GlobalUtil::_debug ?
00081 "-cl-fast-relaxed-math -cl-single-precision-constant -cl-nv-verbose" :
00082 "-cl-fast-relaxed-math -cl-single-precision-constant", NULL, NULL);
00083
00084 if(status != CL_SUCCESS) {PrintBuildLog(device, 1); _valid = 0;}
00085 else if(GlobalUtil::_debug) PrintBuildLog(device, 0);
00086
00087 _kernel = clCreateKernel(_program, name, &status);
00088 if(status != CL_SUCCESS) _valid = 0;
00089 }
00090
00091 void ProgramCL::PrintBuildLog(cl_device_id device, int all)
00092 {
00093 char buffer[10240] = "\0";
00094 cl_int status = clGetProgramBuildInfo(
00095 _program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
00096 if(all )
00097 {
00098 std::cerr << buffer << endl;
00099 }else
00100 {
00101 const char * pos = strstr(buffer, "ptxas");
00102 if(pos) std::cerr << pos << endl;
00103 }
00104 }
00105
00108
00109 ProgramBagCL::ProgramBagCL()
00110 {
00112 _context = NULL; _queue = NULL;
00113 s_gray = s_sampling = NULL;
00114 s_packup = s_zero_pass = NULL;
00115 s_gray_pack = s_unpack = NULL;
00116 s_sampling_u = NULL;
00117 s_dog_pass = NULL;
00118 s_grad_pass = NULL;
00119 s_grad_pass2 = NULL;
00120 s_unpack_dog = NULL;
00121 s_unpack_grd = NULL;
00122 s_unpack_key = NULL;
00123 s_keypoint = NULL;
00124 f_gaussian_skip0 = NULL;
00125 f_gaussian_skip1 = NULL;
00126 f_gaussian_step = 0;
00127
00129 GlobalUtil::StartTimer("Initialize OpenCL");
00130 if(!InitializeContext()) return;
00131 GlobalUtil::StopTimer();
00132
00133 }
00134
00135
00136
00137 ProgramBagCL::~ProgramBagCL()
00138 {
00139 if(s_gray) delete s_gray;
00140 if(s_sampling) delete s_sampling;
00141 if(s_zero_pass) delete s_zero_pass;
00142 if(s_packup) delete s_packup;
00143 if(s_unpack) delete s_unpack;
00144 if(s_gray_pack) delete s_gray_pack;
00145 if(s_sampling_u) delete s_sampling_u;
00146 if(s_dog_pass) delete s_dog_pass;
00147 if(s_grad_pass) delete s_grad_pass;
00148 if(s_grad_pass2) delete s_grad_pass2;
00149 if(s_unpack_dog) delete s_unpack_dog;
00150 if(s_unpack_grd) delete s_unpack_grd;
00151 if(s_unpack_key) delete s_unpack_key;
00152 if(s_keypoint) delete s_keypoint;
00153
00154 if(f_gaussian_skip1) delete f_gaussian_skip1;
00155
00156 for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
00157 {
00158 if(f_gaussian_skip0_v[i]) delete f_gaussian_skip0_v[i];
00159 }
00160 if(f_gaussian_step && _gaussian_step_num > 0)
00161 {
00162 for(int i = 0; i< _gaussian_step_num; i++)
00163 {
00164 delete f_gaussian_step[i];
00165 }
00166 delete[] f_gaussian_step;
00167 }
00168
00170 if(_context) clReleaseContext(_context);
00171 if(_queue) clReleaseCommandQueue(_queue);
00172 }
00173
00174 bool ProgramBagCL::InitializeContext()
00175 {
00176 cl_uint num_platform, num_device;
00177 cl_int status;
00178
00179 status = clGetPlatformIDs (0, NULL, &num_platform);
00180 if (status != CL_SUCCESS || num_platform == 0) return false;
00181
00182 cl_platform_id platforms[16];
00183 if(num_platform > 16 ) num_platform = 16;
00184 status = clGetPlatformIDs (num_platform, platforms, NULL);
00185 _platform = platforms[0];
00186
00188 status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_device);
00189 if(status != CL_SUCCESS || num_device == 0) return false;
00190
00191
00192 cl_device_id* devices = new cl_device_id [num_device];
00193 status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, num_device, devices, NULL);
00194 _device = (status == CL_SUCCESS? devices[0] : 0); delete[] devices;
00195 if(status != CL_SUCCESS) return false;
00196
00197
00198 if(GlobalUtil::_verbose)
00199 {
00200 cl_device_mem_cache_type is_gcache;
00201 clGetDeviceInfo(_device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(is_gcache), &is_gcache, NULL);
00202 if(is_gcache == CL_NONE) std::cout << "No cache for global memory\n";
00203
00204
00205 }
00206
00207
00208 if(GlobalUtil::_UseSiftGPUEX)
00209 {
00210 cl_context_properties prop[] = {
00211 CL_CONTEXT_PLATFORM, (cl_context_properties)_platform,
00212 CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
00213 CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 0 };
00214 _context = clCreateContext(prop, 1, &_device, NULL, NULL, &status);
00215 if(status != CL_SUCCESS) return false;
00216 }else
00217 {
00218 _context = clCreateContext(0, 1, &_device, NULL, NULL, &status);
00219 if(status != CL_SUCCESS) return false;
00220 }
00221
00222
00223 _queue = clCreateCommandQueue(_context, _device, 0, &status);
00224 return status == CL_SUCCESS;
00225 }
00226
00227 void ProgramBagCL::InitProgramBag(SiftParam¶m)
00228 {
00229 GlobalUtil::StartTimer("Load Programs");
00230 LoadFixedShaders();
00231 LoadDynamicShaders(param);
00232 if(GlobalUtil::_UseSiftGPUEX) LoadDisplayShaders();
00233 GlobalUtil::StopTimer();
00234 }
00235
00236
00237 void ProgramBagCL::UnloadProgram()
00238 {
00239
00240 }
00241
00242 void ProgramBagCL::FinishCL()
00243 {
00244 clFinish(_queue);
00245 }
00246
00247 void ProgramBagCL::LoadFixedShaders()
00248 {
00249
00250
00251 s_gray = new ProgramCL( "gray",
00252 "__kernel void gray(__read_only image2d_t input, __write_only image2d_t output) {\n"
00253 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
00254 "int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
00255 "float4 weight = (float4)(0.299, 0.587, 0.114, 0.0);\n"
00256 "float intensity = dot(weight, read_imagef(input,sampler, coord ));\n"
00257 "float4 result= (float4)(intensity, intensity, intensity, 1.0);\n"
00258 "write_imagef(output, coord, result); }", _context, _device );
00259
00260
00261 s_sampling = new ProgramCL("sampling",
00262 "__kernel void sampling(__read_only image2d_t input, __write_only image2d_t output,\n"
00263 " int width, int height) {\n"
00264 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
00265 "int x = get_global_id(0), y = get_global_id(1); \n"
00266 "if( x >= width || y >= height) return;\n"
00267 "int xa = x + x, ya = y + y; \n"
00268 "int xb = xa + 1, yb = ya + 1; \n"
00269 "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
00270 "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
00271 "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
00272 "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
00273 "float4 result = (float4) (v1, v2, v3, v4);"
00274 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
00275
00276 s_sampling_k = new ProgramCL("sampling_k",
00277 "__kernel void sampling_k(__read_only image2d_t input, __write_only image2d_t output, "
00278 " int width, int height,\n"
00279 " int step, int halfstep) {\n"
00280 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
00281 "int x = get_global_id(0), y = get_global_id(1); \n"
00282 "if( x >= width || y >= height) return;\n"
00283 "int xa = x * step, ya = y *step; \n"
00284 "int xb = xa + halfstep, yb = ya + halfstep; \n"
00285 "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
00286 "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
00287 "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
00288 "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
00289 "float4 result = (float4) (v1, v2, v3, v4);"
00290 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
00291
00292
00293 s_sampling_u = new ProgramCL("sampling_u",
00294 "__kernel void sampling_u(__read_only image2d_t input, \n"
00295 " __write_only image2d_t output,\n"
00296 " int width, int height,\n"
00297 " float step, float halfstep) {\n"
00298 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
00299 "int x = get_global_id(0), y = get_global_id(1); \n"
00300 "if( x >= width || y >= height) return;\n"
00301 "float xa = x * step, ya = y *step; \n"
00302 "float xb = xa + halfstep, yb = ya + halfstep; \n"
00303 "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
00304 "float v2 = read_imagef(input, sampler, (float2) (xb, ya)).x; \n"
00305 "float v3 = read_imagef(input, sampler, (float2) (xa, yb)).x; \n"
00306 "float v4 = read_imagef(input, sampler, (float2) (xb, yb)).x; \n"
00307 "float4 result = (float4) (v1, v2, v3, v4);"
00308 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
00309
00310
00311 s_zero_pass = new ProgramCL("zero_pass",
00312 "__kernel void zero_pass(__write_only image2d_t output){\n"
00313 "int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
00314 "write_imagef(output, coord, (float4)(0.0));}", _context, _device);
00315
00316 s_packup = new ProgramCL("packup",
00317 "__kernel void packup(__global float* input, __write_only image2d_t output,\n"
00318 " int twidth, int theight, int width){\n"
00319 "int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
00320 "if(coord.x >= twidth || coord.y >= theight) return;\n"
00321 "int index0 = (coord.y + coord.y) * width; \n"
00322 "int index1 = index0 + coord.x;\n"
00323 "int x2 = min(width -1, coord.x); \n"
00324 "float v1 = input[index1 + coord.x], v2 = input[index1 + x2]; \n"
00325 "int index2 = index1 + width; \n"
00326 "float v3 = input[index2 + coord.x], v4 = input[index2 + x2]; \n "
00327 "write_imagef(output, coord, (float4) (v1, v2, v3, v4));}", _context, _device);
00328
00329 s_dog_pass = new ProgramCL("dog_pass",
00330 "__kernel void dog_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
00331 " __write_only image2d_t dog, int width, int height) {\n"
00332 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
00333 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
00334 "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
00335 "if( coord.x >= width || coord.y >= height) return;\n"
00336 "float4 cc = read_imagef(tex , sampler, coord); \n"
00337 "float4 cp = read_imagef(texp, sampler, coord);\n"
00338 "write_imagef(dog, coord, cc - cp); }\n", _context, _device);
00339
00340 s_grad_pass = new ProgramCL("grad_pass",
00341 "__kernel void grad_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
00342 " __write_only image2d_t dog, int width, int height,\n"
00343 " __write_only image2d_t grad, __write_only image2d_t rot) {\n"
00344 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
00345 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
00346 "int x = get_global_id(0), y = get_global_id(1); \n"
00347 "if( x >= width || y >= height) return;\n"
00348 "int2 coord = (int2) (x, y);\n"
00349 "float4 cc = read_imagef(tex , sampler, coord); \n"
00350 "float4 cp = read_imagef(texp, sampler, coord);\n"
00351 "float2 cl = read_imagef(tex, sampler, (int2)(x - 1, y)).yw;\n"
00352 "float2 cr = read_imagef(tex, sampler, (int2)(x + 1, y)).xz;\n"
00353 "float2 cd = read_imagef(tex, sampler, (int2)(x, y - 1)).zw;\n"
00354 "float2 cu = read_imagef(tex, sampler, (int2)(x, y + 1)).xy;\n"
00355 "write_imagef(dog, coord, cc - cp); \n"
00356 "float4 dx = (float4)(cc.y - cl.x, cr.x - cc.x, cc.w - cl.y, cr.y - cc.z);\n"
00357 "float4 dy = (float4)(cc.zw - cd.xy, cu.xy - cc.xy);\n"
00358 "write_imagef(grad, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
00359 "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device);
00360
00361 s_grad_pass2 = new ProgramCL("grad_pass2",
00362 "#define BLOCK_DIMX 32\n"
00363 "#define BLOCK_DIMY 14\n"
00364 "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
00365 "__kernel void grad_pass2(__read_only image2d_t tex, __read_only image2d_t texp,\n"
00366 " __write_only image2d_t dog, int width, int height,\n"
00367 " __write_only image2d_t grd, __write_only image2d_t rot){\n"
00368 "__local float block[BLOCK_SIZE * 4]; \n"
00369 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
00370 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
00371 "int2 coord = (int2) ( get_global_id(0) - get_group_id(0) * 2 - 1, \n"
00372 " get_global_id(1) - get_group_id(1) * 2- 1); \n"
00373 "int idx = mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
00374 "float4 cc = read_imagef(tex, sampler, coord);\n"
00375 "block[idx ] = cc.x;\n"
00376 "block[idx + BLOCK_SIZE ] = cc.y;\n"
00377 "block[idx + BLOCK_SIZE * 2] = cc.z;\n"
00378 "block[idx + BLOCK_SIZE * 3] = cc.w;\n"
00379 "barrier(CLK_LOCAL_MEM_FENCE);\n"
00380 "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
00381 "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
00382 "if( coord.x >= width) return; \n"
00383 "if( coord.y >= height) return;\n"
00384 "float4 cp = read_imagef(texp, sampler, coord);\n"
00385 "float4 dx = (float4)( cc.y - block[idx - 1 + BLOCK_SIZE], \n"
00386 " block[idx + 1] - cc.x, \n"
00387 " cc.w - block[idx - 1 + 3 * BLOCK_SIZE], \n"
00388 " block[idx + 1 + 2 * BLOCK_SIZE] - cc.z);\n"
00389 "float4 dy = (float4)( cc.z - block[idx - BLOCK_DIMX + 2 * BLOCK_SIZE], \n"
00390 " cc.w - block[idx - BLOCK_DIMX + 3 * BLOCK_SIZE],"
00391
00392 " block[idx + BLOCK_DIMX] - cc.x,\n "
00393 " block[idx + BLOCK_DIMX + BLOCK_SIZE] - cc.y);\n"
00394
00395 "write_imagef(dog, coord, cc - cp); \n"
00396 "write_imagef(grd, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
00397 "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device);
00398 }
00399
00400 void ProgramBagCL::LoadDynamicShaders(SiftParam& param)
00401 {
00402 LoadKeypointShader();
00403 LoadGenListShader(param._dog_level_num, 0);
00404 CreateGaussianFilters(param);
00405 }
00406
00407
00408 void ProgramBagCL::SelectInitialSmoothingFilter(int octave_min, SiftParam¶m)
00409 {
00410 float sigma = param.GetInitialSmoothSigma(octave_min);
00411 if(sigma == 0)
00412 {
00413 f_gaussian_skip0 = NULL;
00414 }else
00415 {
00416 for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
00417 {
00418 if(f_gaussian_skip0_v[i]->_id == octave_min)
00419 {
00420 f_gaussian_skip0 = f_gaussian_skip0_v[i];
00421 return ;
00422 }
00423 }
00424 FilterCL * filter = CreateGaussianFilter(sigma);
00425 filter->_id = octave_min;
00426 f_gaussian_skip0_v.push_back(filter);
00427 f_gaussian_skip0 = filter;
00428 }
00429
00430 }
00431
00432 void ProgramBagCL::CreateGaussianFilters(SiftParam¶m)
00433 {
00434 if(param._sigma_skip0>0.0f)
00435 {
00436 f_gaussian_skip0 = CreateGaussianFilter(param._sigma_skip0);
00437 f_gaussian_skip0->_id = GlobalUtil::_octave_min_default;
00438 f_gaussian_skip0_v.push_back(f_gaussian_skip0);
00439 }
00440 if(param._sigma_skip1>0.0f)
00441 {
00442 f_gaussian_skip1 = CreateGaussianFilter(param._sigma_skip1);
00443 }
00444
00445 f_gaussian_step = new FilterCL*[param._sigma_num];
00446 for(int i = 0; i< param._sigma_num; i++)
00447 {
00448 f_gaussian_step[i] = CreateGaussianFilter(param._sigma[i]);
00449 }
00450 _gaussian_step_num = param._sigma_num;
00451 }
00452
00453
00454 FilterCL* ProgramBagCL::CreateGaussianFilter(float sigma)
00455 {
00456
00457 int sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;
00458 int width = 2*sz + 1;
00459
00460
00461 if(GlobalUtil::_MaxFilterWidth >0 && width > GlobalUtil::_MaxFilterWidth)
00462 {
00463 std::cout<<"Filter size truncated from "<<width<<" to "<<GlobalUtil::_MaxFilterWidth<<endl;
00464 sz = GlobalUtil::_MaxFilterWidth>>1;
00465 width = 2 * sz + 1;
00466 }
00467
00468 int i;
00469 float * kernel = new float[width];
00470 float rv = 1.0f/(sigma*sigma);
00471 float v, ksum =0;
00472
00473
00474 for( i = -sz ; i <= sz ; ++i)
00475 {
00476 kernel[i+sz] = v = exp(-0.5f * i * i *rv) ;
00477 ksum += v;
00478 }
00479
00480
00481 rv = 1.0f / ksum;
00482 for(i = 0; i< width ;i++) kernel[i]*=rv;
00483
00484 FilterCL * filter = CreateFilter(kernel, width);
00485 delete [] kernel;
00486 if(GlobalUtil::_verbose && GlobalUtil::_timingL) std::cout<<"Filter: sigma = "<<sigma<<", size = "<<width<<"x"<<width<<endl;
00487 return filter;
00488 }
00489
00490 FilterCL* ProgramBagCL::CreateFilter(float kernel[], int width)
00491 {
00492 FilterCL * filter = new FilterCL;
00493 filter->s_shader_h = CreateFilterH(kernel, width);
00494 filter->s_shader_v = CreateFilterV(kernel, width);
00495 filter->_size = width;
00496 filter->_id = 0;
00497 return filter;
00498 }
00499
00500 ProgramCL* ProgramBagCL::CreateFilterH(float kernel[], int width)
00501 {
00502 int halfwidth = width >>1;
00503 float * pf = kernel + halfwidth;
00504 int nhpixel = (halfwidth+1)>>1;
00505 int npixel = (nhpixel<<1)+1;
00506 float weight[3];
00507
00509 char buffer[10240];
00510 ostrstream out(buffer, 10240);
00511 out<<setprecision(8);
00512
00513
00514
00515 out<<
00516 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
00517 "__kernel void filter_h(__read_only image2d_t input, \n"
00518 " __write_only image2d_t output, int width_, int height_) {\n"
00519 "int x = get_global_id(0);\n"
00520 "int y = get_global_id(1); \n"
00521 "if( x > width_ || y > height_) return; \n"
00522 "float4 pc; int2 coord; \n"
00523 "float4 result = (float4)(0.0);\n";
00524 for(int i = 0 ; i < npixel ; i++)
00525 {
00526 out<<"coord = (int2)(x + ("<< (i - nhpixel) << "), y);\n";
00527 out<<"pc= read_imagef(input, sampler, coord);\n";
00528 if(GlobalUtil::_PreciseBorder)
00529 out<<"if(coord.x < 0) pc = pc.xxzz; else if (coord.x > width_) pc = pc.yyww; \n";
00530
00531 int xw = (i - nhpixel)*2;
00532 for(int j = 0; j < 3; j++)
00533 {
00534 int xwn = xw + j -1;
00535 weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
00536 }
00537 if(weight[1] == 0.0)
00538 {
00539 out<<"result += (float4)("<<weight[2]<<","<<weight[0]<<","<<weight[2]<<","<<weight[0]<<") * pc.yxwz;\n";
00540 }
00541 else
00542 {
00543 out<<"result += (float4)("<<weight[1]<<", "<<weight[0]<<", "<<weight[1]<<", "<<weight[0]<<") * pc.xxzz;\n";
00544 out<<"result += (float4)("<<weight[2]<<", "<<weight[1]<<", "<<weight[2]<<", "<<weight[1]<<") * pc.yyww;\n";
00545 }
00546 }
00547 out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
00548 return new ProgramCL("filter_h", buffer, _context, _device);
00549 }
00550
00551
00552
00553 ProgramCL* ProgramBagCL::CreateFilterV(float kernel[], int width)
00554 {
00555
00556 int halfwidth = width >>1;
00557 float * pf = kernel + halfwidth;
00558 int nhpixel = (halfwidth+1)>>1;
00559 int npixel = (nhpixel<<1)+1;
00560 float weight[3];
00561
00563 char buffer[10240];
00564 ostrstream out(buffer, 10240);
00565 out<<setprecision(8);
00566
00567
00568
00569 out<<
00570 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
00571 "__kernel void filter_v(__read_only image2d_t input, \n"
00572 " __write_only image2d_t output, int width_, int height_) {\n"
00573 "int x = get_global_id(0);\n"
00574 "int y = get_global_id(1); \n"
00575 "if( x > width_ || y >= height_) return; \n"
00576 "float4 pc; int2 coord; \n"
00577 "float4 result = (float4)(0.0);\n";
00578 for(int i = 0 ; i < npixel ; i++)
00579 {
00580 out<<"coord = (int2)(x, y + ("<< (i - nhpixel) << "));\n";
00581 out<<"pc= read_imagef(input, sampler, coord);\n";
00582 if(GlobalUtil::_PreciseBorder)
00583 out<<"if(coord.y < 0) pc = pc.xyxy; else if (coord.y > height_) pc = pc.zwzw; \n";
00584
00585 int xw = (i - nhpixel)*2;
00586 for(int j = 0; j < 3; j++)
00587 {
00588 int xwn = xw + j -1;
00589 weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
00590 }
00591 if(weight[1] == 0.0)
00592 {
00593 out<<"result += (float4)("<<weight[2]<<","<<weight[2]<<","<<weight[0]<<","<<weight[0]<<") * pc.zwxy;\n";
00594 }
00595 else
00596 {
00597 out<<"result += (float4)("<<weight[1]<<", "<<weight[1]<<", "<<weight[0]<<", "<<weight[0]<<") * pc.xyxy;\n";
00598 out<<"result += (float4)("<<weight[2]<<", "<<weight[2]<<", "<<weight[1]<<", "<<weight[1]<<") * pc.zwzw;\n";
00599 }
00600 }
00601 out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
00602 return new ProgramCL("filter_v", buffer, _context, _device);
00603
00604 }
00605
00606 void ProgramBagCL::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
00607 {
00608 cl_kernel kernelh = filter->s_shader_h->_kernel;
00609 cl_kernel kernelv = filter->s_shader_v->_kernel;
00611
00612 cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
00613 cl_int w_ = w - 1, h_ = h - 1;
00614
00615 size_t dim0 = 16, dim1 = 16;
00616 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00617
00618 clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
00619 clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
00620 clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
00621 clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
00622 status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz, lsz, 0, NULL, NULL);
00623 CheckErrorCL(status, "ProgramBagCL::FilterImageH");
00624 if(status != CL_SUCCESS) return;
00625
00626 clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
00627 clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
00628 clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
00629 clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
00630 size_t gsz2[2] = {(w + dim1 - 1) / dim1 * dim1, (h + dim0 - 1) / dim0 * dim0}, lsz2[2] = {dim1, dim0};
00631 status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
00632 CheckErrorCL(status, "ProgramBagCL::FilterImageV");
00633
00634 }
00635
00636 void ProgramBagCL::SampleImageU(CLTexImage *dst, CLTexImage *src, int log_scale)
00637 {
00638 cl_kernel kernel= s_sampling_u->_kernel;
00639 float scale = 1.0f / (1 << log_scale);
00640 float offset = scale * 0.5f;
00641 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00642 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00643 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00644 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00645 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00646 clSetKernelArg(kernel, 4, sizeof(cl_float), &(scale));
00647 clSetKernelArg(kernel, 5, sizeof(cl_float), &(offset));
00648
00649 size_t dim0 = 16, dim1 = 16;
00650
00651 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00652 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00653 CheckErrorCL(status, "ProgramBagCL::SampleImageU");
00654 }
00655
00656 void ProgramBagCL::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
00657 {
00658 cl_kernel kernel;
00659 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00660 if(log_scale == 1)
00661 {
00662 kernel = s_sampling->_kernel;
00663 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00664 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00665 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00666 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00667 }else
00668 {
00669 cl_int fullstep = (1 << log_scale);
00670 cl_int halfstep = fullstep >> 1;
00671 kernel = s_sampling_k->_kernel;
00672 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00673 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00674 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00675 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00676 clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
00677 clSetKernelArg(kernel, 5, sizeof(cl_int), &(halfstep));
00678 }
00679 size_t dim0 = 128, dim1 = 1;
00680
00681 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00682 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00683 CheckErrorCL(status, "ProgramBagCL::SampleImageD");
00684 }
00685
00686 void ProgramBagCL::FilterInitialImage(CLTexImage* tex, CLTexImage* buf)
00687 {
00688 if(f_gaussian_skip0) FilterImage(f_gaussian_skip0, tex, tex, buf);
00689 }
00690
00691 void ProgramBagCL::FilterSampledImage(CLTexImage* tex, CLTexImage* buf)
00692 {
00693 if(f_gaussian_skip1) FilterImage(f_gaussian_skip1, tex, tex, buf);
00694 }
00695
00696 void ProgramBagCL::ComputeDOG(CLTexImage*tex, CLTexImage* texp, CLTexImage* dog, CLTexImage* grad, CLTexImage* rot)
00697 {
00698 int margin = 0, use_gm2 = 1;
00699 bool both_grad_dog = rot->_clData && grad->_clData;
00700 cl_int w = tex->GetImgWidth(), h = tex->GetImgHeight();
00701 cl_kernel kernel ; size_t dim0, dim1;
00702 if(!both_grad_dog) {kernel = s_dog_pass->_kernel; dim0 = 16; dim1 = 12; }
00703 else if(use_gm2) {kernel = s_grad_pass2->_kernel; dim0 = 32; dim1 = 14; margin = 2; }
00704 else {kernel = s_grad_pass->_kernel; dim0 = 16; dim1 = 20; }
00705 size_t gsz[2] = { (w + dim0 - 1 - margin) / (dim0 - margin) * dim0,
00706 (h + dim1 - 1 - margin) / (dim1 - margin) * dim1};
00707 size_t lsz[2] = {dim0, dim1};
00708 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(tex->_clData));
00709 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(texp->_clData));
00710 clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dog->_clData));
00711 clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
00712 clSetKernelArg(kernel, 4, sizeof(cl_int), &(h));
00713 if(both_grad_dog)
00714 {
00715 clSetKernelArg(kernel, 5, sizeof(cl_mem), &(grad->_clData));
00716 clSetKernelArg(kernel, 6, sizeof(cl_mem), &(rot->_clData));
00717 }
00719 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00720 CheckErrorCL(status, "ProgramBagCL::ComputeDOG");
00721 }
00722
00723
00724 void ProgramBagCL::ComputeKEY(CLTexImage*dog, CLTexImage* key, float Tdog, float Tedge)
00725 {
00726 cl_kernel kernel = s_keypoint->_kernel;
00727 cl_int w = key->GetImgWidth(), h = key->GetImgHeight();
00728 float threshold0 = Tdog* (GlobalUtil::_SubpixelLocalization?0.8f:1.0f);
00729 float threshold1 = Tdog;
00730 float threshold2 = (Tedge+1)*(Tedge+1)/Tedge;
00731
00732 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
00733 clSetKernelArg(kernel, 1, sizeof(cl_mem), &((dog + 1)->_clData));
00734 clSetKernelArg(kernel, 2, sizeof(cl_mem), &((dog - 1)->_clData));
00735 clSetKernelArg(kernel, 3, sizeof(cl_mem), &(key->_clData));
00736 clSetKernelArg(kernel, 4, sizeof(cl_float), &(threshold0));
00737 clSetKernelArg(kernel, 5, sizeof(cl_float), &(threshold1));
00738 clSetKernelArg(kernel, 6, sizeof(cl_float), &(threshold2));
00739 clSetKernelArg(kernel, 7, sizeof(cl_int), &(w));
00740 clSetKernelArg(kernel, 8, sizeof(cl_int), &(h));
00741
00742 size_t dim0 = 8, dim1 = 8;
00743
00744 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00745 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00746 CheckErrorCL(status, "ProgramBagCL::ComputeKEY");
00747 }
00748
00749 void ProgramBagCL::UnpackImage(CLTexImage*src, CLTexImage* dst)
00750 {
00751 cl_kernel kernel = s_unpack->_kernel;
00752 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00753 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00754 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00755 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00756 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00757 const size_t dim0 = 16, dim1 = 16;
00758 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00759 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00760
00761 CheckErrorCL(status, "ProgramBagCL::UnpackImage");
00762 FinishCL();
00763
00764 }
00765
00766 void ProgramBagCL::UnpackImageDOG(CLTexImage*src, CLTexImage* dst)
00767 {
00768 if(s_unpack_dog == NULL) return;
00769 cl_kernel kernel = s_unpack_dog->_kernel;
00770 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00771 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00772 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00773 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00774 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00775 const size_t dim0 = 16, dim1 = 16;
00776 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00777 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00778
00779 CheckErrorCL(status, "ProgramBagCL::UnpackImage");
00780 FinishCL();
00781 }
00782
00783 void ProgramBagCL::UnpackImageGRD(CLTexImage*src, CLTexImage* dst)
00784 {
00785 if(s_unpack_grd == NULL) return;
00786 cl_kernel kernel = s_unpack_grd->_kernel;
00787 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00788 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00789 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00790 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00791 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00792 const size_t dim0 = 16, dim1 = 16;
00793 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00794 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00795
00796 CheckErrorCL(status, "ProgramBagCL::UnpackImage");
00797 FinishCL();
00798 }
00799 void ProgramBagCL::UnpackImageKEY(CLTexImage*src, CLTexImage* dog, CLTexImage* dst)
00800 {
00801 if(s_unpack_key == NULL) return;
00802 cl_kernel kernel = s_unpack_key->_kernel;
00803 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00804 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
00805 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(src->_clData));
00806 clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dst->_clData));
00807 clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
00808 clSetKernelArg(kernel, 4, sizeof(cl_int), &(h));
00809 const size_t dim0 = 16, dim1 = 16;
00810 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00811 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00812
00813 CheckErrorCL(status, "ProgramBagCL::UnpackImageKEY");
00814 FinishCL();
00815 }
00816 void ProgramBagCL::LoadDescriptorShader()
00817 {
00818 GlobalUtil::_DescriptorPPT = 16;
00819 LoadDescriptorShaderF2();
00820 }
00821
00822 void ProgramBagCL::LoadDescriptorShaderF2()
00823 {
00824
00825 }
00826
00827 void ProgramBagCL::LoadOrientationShader(void)
00828 {
00829
00830 }
00831
00832 void ProgramBagCL::LoadGenListShader(int ndoglev,int nlev)
00833 {
00834
00835 }
00836
00837 void ProgramBagCL::LoadKeypointShader()
00838 {
00839 int i; char buffer[20240];
00840 ostrstream out(buffer, 20240);
00841 streampos pos;
00842
00843
00844
00845
00846 out<<
00847 "__kernel void keypoint(__read_only image2d_t tex, __read_only image2d_t texU,\n"
00848 " __read_only image2d_t texD, __write_only image2d_t texK,\n"
00849 " float THRESHOLD0, float THRESHOLD1, \n"
00850 " float THRESHOLD2, int width, int height)\n"
00851 "{\n"
00852 " sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
00853 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
00854 " int x = get_global_id(0), y = get_global_id(1);\n"
00855 " if(x >= width || y >= height) return; \n"
00856 " int xp = x - 1, xn = x + 1;\n"
00857 " int yp = y - 1, yn = y + 1;\n"
00858 " int2 coord0 = (int2) (x, y); \n"
00859 " int2 coord1 = (int2) (xp, y); \n"
00860 " int2 coord2 = (int2) (xn, y); \n"
00861 " int2 coord3 = (int2) (x, yp); \n"
00862 " int2 coord4 = (int2) (x, yn); \n"
00863 " int2 coord5 = (int2) (xp, yp); \n"
00864 " int2 coord6 = (int2) (xp, yn); \n"
00865 " int2 coord7 = (int2) (xn, yp); \n"
00866 " int2 coord8 = (int2) (xn, yn); \n"
00867 " float4 ccc = read_imagef(tex, sampler,coord0);\n"
00868 " float4 clc = read_imagef(tex, sampler,coord1);\n"
00869 " float4 crc = read_imagef(tex, sampler,coord2);\n"
00870 " float4 ccd = read_imagef(tex, sampler,coord3);\n"
00871 " float4 ccu = read_imagef(tex, sampler,coord4);\n"
00872 " float4 cld = read_imagef(tex, sampler,coord5);\n"
00873 " float4 clu = read_imagef(tex, sampler,coord6);\n"
00874 " float4 crd = read_imagef(tex, sampler,coord7);\n"
00875 " float4 cru = read_imagef(tex, sampler,coord8);\n"
00876 " float4 cc = ccc;\n"
00877 " float4 v1[4], v2[4];\n"
00878 " v1[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
00879 " v1[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
00880 " v1[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
00881 " v1[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
00882 " v2[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
00883 " v2[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
00884 " v2[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
00885 " v2[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n"
00886 " float4 key4 = (float4)(0); \n";
00887
00888
00889
00890 for(i = 0; i < 4; ++i)
00891 out<<
00892 " if(cc.s"<<i<<" > THRESHOLD0){ \n"
00893 " if(all(isgreater((float4)(cc.s"<<i<<"), max(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = 1.0;\n"
00894 " }else if(cc.s"<<i<<" < -THRESHOLD0){ \n"
00895 " if(all(isless((float4)(cc.s"<<i<<"), min(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = -1.0;\n"
00896 " }";
00897
00898 out<<
00899 " if(x ==0) {key4.x = key4.z= 0; }\n"
00900 " else if(x + 1 == width) {key4.y = key4.w = 0;}\n"
00901 " if(y ==0) {key4.x = key4.y = 0; }\n"
00902 " else if(y + 1 == height) {key4.z = key4.w = 0;}\n"
00903 " float4 ak = fabs(key4); \n"
00904 " float keysum = ak.x + ak.y + ak.z + ak.w; \n"
00905 " float4 result = (float4)(0.0);\n"
00906 " if(keysum == 1.0) {\n"
00907 " float fxx[4], fyy[4], fxy[4], fx[4], fy[4];\n";
00908
00909
00910
00911
00912 for(i = 0; i < 4; ++i)
00913 out <<
00914 " if(key4.s"<<i<<" != 0)\n"
00915 " {\n"
00916 " float4 D2 = v1["<<i<<"].xyzw - cc.s"<<i<<";\n"
00917 " float2 D4 = v2["<<i<<"].xw - v2["<<i<<"].yz;\n"
00918 " float2 D5 = 0.5*(v1["<<i<<"].yw-v1["<<i<<"].xz); \n"
00919 " fx["<<i<<"] = D5.x; fy["<<i<<"] = D5.y ;\n"
00920 " fxx["<<i<<"] = D2.x + D2.y;\n"
00921 " fyy["<<i<<"] = D2.z + D2.w;\n"
00922 " fxy["<<i<<"] = 0.25*(D4.x + D4.y);\n"
00923 " float fxx_plus_fyy = fxx["<<i<<"] + fyy["<<i<<"];\n"
00924 " float score_up = fxx_plus_fyy*fxx_plus_fyy; \n"
00925 " float score_down = (fxx["<<i<<"]*fyy["<<i<<"] - fxy["<<i<<"]*fxy["<<i<<"]);\n"
00926 " if( score_down <= 0 || score_up > THRESHOLD2 * score_down)keysum = 0;\n"
00927 " }\n";
00928
00929 out <<
00930 " if(keysum == 1) {\n";
00932
00933 out<<
00934 " float4 v4[4], v5[4], v6[4];\n"
00935 " ccc = read_imagef(texU, sampler,coord0);\n"
00936 " clc = read_imagef(texU, sampler,coord1);\n"
00937 " crc = read_imagef(texU, sampler,coord2);\n"
00938 " ccd = read_imagef(texU, sampler,coord3);\n"
00939 " ccu = read_imagef(texU, sampler,coord4);\n"
00940 " cld = read_imagef(texU, sampler,coord5);\n"
00941 " clu = read_imagef(texU, sampler,coord6);\n"
00942 " crd = read_imagef(texU, sampler,coord7);\n"
00943 " cru = read_imagef(texU, sampler,coord8);\n"
00944 " float4 cu = ccc;\n"
00945 " v4[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
00946 " v4[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
00947 " v4[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
00948 " v4[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
00949 " v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
00950 " v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
00951 " v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
00952 " v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
00953
00954 for(i = 0; i < 4; ++i)
00955 out <<
00956 " if(key4.s"<<i<<" == 1.0)\n"
00957 " {\n"
00958 " if(cc.s"<<i<<" < cu.s"<<i<<" || \n"
00959 " any(isless((float4)(cc.s"<<i<<"), max(v4["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
00960 " }else if(key4.s"<<i<<" == -1.0)\n"
00961 " {\n"
00962 " if(cc.s"<<i<<" > cu.s"<<i<<" || \n"
00963 " any(isgreater((float4)(cc.s"<<i<<"), min(v4["<<i<<"], v6["<<i<<"]))) )keysum = 0; \n"
00964 " }\n";
00965
00966 out <<
00967 " if(keysum == 1.0) { \n";
00968 out <<
00969 " ccc = read_imagef(texD, sampler,coord0);\n"
00970 " clc = read_imagef(texD, sampler,coord1);\n"
00971 " crc = read_imagef(texD, sampler,coord2);\n"
00972 " ccd = read_imagef(texD, sampler,coord3);\n"
00973 " ccu = read_imagef(texD, sampler,coord4);\n"
00974 " cld = read_imagef(texD, sampler,coord5);\n"
00975 " clu = read_imagef(texD, sampler,coord6);\n"
00976 " crd = read_imagef(texD, sampler,coord7);\n"
00977 " cru = read_imagef(texD, sampler,coord8);\n"
00978 " float4 cd = ccc;\n"
00979 " v5[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
00980 " v5[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
00981 " v5[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
00982 " v5[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
00983 " v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
00984 " v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
00985 " v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
00986 " v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
00987 for(i = 0; i < 4; ++i)
00988 out <<
00989 " if(key4.s"<<i<<" == 1.0)\n"
00990 " {\n"
00991 " if(cc.s"<<i<<" < cd.s"<<i<<" ||\n"
00992 " any(isless((float4)(cc.s"<<i<<"), max(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
00993 " }else if(key4.s"<<i<<" == -1.0)\n"
00994 " {\n"
00995 " if(cc.s"<<i<<" > cd.s"<<i<<" ||\n"
00996 " any(isgreater((float4)(cc.s"<<i<<"), min(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
00997 " }\n";
00998
00999 out <<
01000 " if(keysum==1.0) {\n";
01002 if(GlobalUtil::_SubpixelLocalization)
01003 {
01004 out <<
01005 " float4 offset = (float4)(0); \n";
01006 for(i = 1; i < 4; ++i)
01007 out <<
01008 " if(key4.s"<<i<<" != 0) \n"
01009 " {\n"
01010 " cu.s0 = cu.s"<<i<<"; cd.s0 = cd.s"<<i<<"; cc.s0 = cc.s"<<i<<"; \n"
01011 " v4[0] = v4["<<i<<"]; v5[0] = v5["<<i<<"]; \n"
01012 " fxy[0] = fxy["<<i<<"]; fxx[0] = fxx["<<i<<"]; fyy[0] = fyy["<<i<<"]; \n"
01013 " fx[0] = fx["<<i<<"]; fy[0] = fy["<<i<<"]; \n"
01014 " }\n";
01015
01016 out <<
01017 " float fs = 0.5*( cu.s0 - cd.s0 ); \n"
01018 " float fss = cu.s0 + cd.s0 - cc.s0 - cc.s0;\n"
01019 " float fxs = 0.25 * (v4[0].y + v5[0].x - v4[0].x - v5[0].y);\n"
01020 " float fys = 0.25 * (v4[0].w + v5[0].z - v4[0].z - v5[0].w);\n"
01021 " float4 A0, A1, A2 ; \n"
01022 " A0 = (float4)(fxx[0], fxy[0], fxs, -fx[0]); \n"
01023 " A1 = (float4)(fxy[0], fyy[0], fys, -fy[0]); \n"
01024 " A2 = (float4)(fxs, fys, fss, -fs); \n"
01025 " float4 x3 = fabs((float4)(fxx[0], fxy[0], fxs, 0)); \n"
01026 " float maxa = max(max(x3.x, x3.y), x3.z); \n"
01027 " if(maxa >= 1e-10 ) \n"
01028 " { \n"
01029 " if(x3.y ==maxa ) \n"
01030 " { \n"
01031 " float4 TEMP = A1; A1 = A0; A0 = TEMP; \n"
01032 " }else if( x3.z == maxa ) \n"
01033 " { \n"
01034 " float4 TEMP = A2; A2 = A0; A0 = TEMP; \n"
01035 " } \n"
01036 " A0 /= A0.x; \n"
01037 " A1 -= A1.x * A0; \n"
01038 " A2 -= A2.x * A0; \n"
01039 " float2 x2 = fabs((float2)(A1.y, A2.y)); \n"
01040 " if( x2.y > x2.x ) \n"
01041 " { \n"
01042 " float4 TEMP = A2.yzwx; \n"
01043 " A2.yzw = A1.yzw; \n"
01044 " A1.yzw = TEMP.xyz; \n"
01045 " x2.x = x2.y; \n"
01046 " } \n"
01047 " if(x2.x >= 1e-10) { \n"
01048 " A1.yzw /= A1.y; \n"
01049 " A2.yzw -= A2.y * A1.yzw; \n"
01050 " if(fabs(A2.z) >= 1e-10) {\n"
01051 " offset.z = A2.w /A2.z; \n"
01052 " offset.y = A1.w - offset.z*A1.z; \n"
01053 " offset.x = A0.w - offset.z*A0.z - offset.y*A0.y; \n"
01054 " if(fabs(cc.s0 + 0.5*dot((float4)(fx[0], fy[0], fs, 0), offset ))<=THRESHOLD1\n"
01055 " || any( isgreater(fabs(offset), (float4)(1.0)))) key4 = (float4)(0.0);\n"
01056 " }\n"
01057 " }\n"
01058 " }\n"
01059 <<"\n"
01060 " float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
01061 " result = (float4)(keyv, offset.xyz);\n"
01062 " }}}}\n"
01063 " write_imagef(texK, coord0, result);\n "
01064 "}\n" <<'\0';
01065 }
01066 else
01067 {
01068 out << "\n"
01069 " float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
01070 " result = (float4)(keyv, 0, 0, 0);\n"
01071 " }}}}\n"
01072 " write_imagef(texK, coord0, result);\n "
01073 "}\n" <<'\0';
01074 }
01075
01076 s_keypoint = new ProgramCL("keypoint", buffer, _context, _device);
01077 }
01078
01079 void ProgramBagCL::LoadDisplayShaders()
01080 {
01081
01082
01083
01084 s_unpack = new ProgramCL("main",
01085 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
01086 " int width, int height) {\n"
01087 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01088 "int x = get_global_id(0), y = get_global_id(1); \n"
01089 "if(x >= width || y >= height) return;\n"
01090 "int xx = x / 2, yy = y / 2; \n"
01091 "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
01092 "float v1 = (x & 1 ? vv.w : vv.z); \n"
01093 "float v2 = (x & 1 ? vv.y : vv.x);\n"
01094 "float v = y & 1 ? v1 : v2;\n"
01095 "float4 result = (float4) (v, v, v, 1);"
01096 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
01097
01098 s_unpack_dog = new ProgramCL("main",
01099 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
01100 " int width, int height) {\n"
01101 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01102 "int x = get_global_id(0), y = get_global_id(1); \n"
01103 "if(x >= width || y >= height) return;\n"
01104 "int xx = x / 2, yy = y / 2; \n"
01105 "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
01106 "float v1 = (x & 1 ? vv.w : vv.z); \n"
01107 "float v2 = (x & 1 ? vv.y : vv.x);\n"
01108 "float v0 = y & 1 ? v1 : v2;\n"
01109 "float v = 0.5 + 20.0 * v0;\n "
01110 "float4 result = (float4) (v, v, v, 1);"
01111 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
01112
01113 s_unpack_grd = new ProgramCL("main",
01114 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
01115 " int width, int height) {\n"
01116 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01117 "int x = get_global_id(0), y = get_global_id(1); \n"
01118 "if(x >= width || y >= height) return;\n"
01119 "int xx = x / 2, yy = y / 2; \n"
01120 "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
01121 "float v1 = (x & 1 ? vv.w : vv.z); \n"
01122 "float v2 = (x & 1 ? vv.y : vv.x);\n"
01123 "float v0 = y & 1 ? v1 : v2;\n"
01124 "float v = 5.0 * v0;\n "
01125 "float4 result = (float4) (v, v, v, 1);"
01126 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
01127
01128 s_unpack_key = new ProgramCL("main",
01129 "__kernel void main(__read_only image2d_t dog,\n"
01130 " __read_only image2d_t key,\n"
01131 " __write_only image2d_t output,\n"
01132 " int width, int height) {\n"
01133 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01134 "int x = get_global_id(0), y = get_global_id(1); \n"
01135 "if(x >= width || y >= height) return;\n"
01136 "int xx = x / 2, yy = y / 2; \n"
01137 "float4 kk = read_imagef(key, sampler, (int2) (xx, yy));\n"
01138 "int4 cc = isequal(fabs(kk.xxxx), (float4)(1.0, 2.0, 3.0, 4.0));\n"
01139 "int k1 = (x & 1 ? cc.w : cc.z); \n"
01140 "int k2 = (x & 1 ? cc.y : cc.x);\n"
01141 "int k0 = y & 1 ? k1 : k2;\n"
01142 "float4 result;\n"
01143 "if(k0 != 0){\n"
01144 " //result = kk.x > 0 ? ((float4)(1.0, 0, 0, 1.0)) : ((float4) (0.0, 1.0, 0.0, 1.0)); \n"
01145 " result = kk.x < 0 ? ((float4)(0, 1.0, 0, 1.0)) : ((float4) (1.0, 0.0, 0.0, 1.0)); \n"
01146 "}else{"
01147 "float4 vv = read_imagef(dog, sampler, (int2) (xx, yy));\n"
01148 "float v1 = (x & 1 ? vv.w : vv.z); \n"
01149 "float v2 = (x & 1 ? vv.y : vv.x);\n"
01150 "float v0 = y & 1 ? v1 : v2;\n"
01151 "float v = 0.5 + 20.0 * v0;\n "
01152 "result = (float4) (v, v, v, 1);"
01153 "}\n"
01154 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
01155 }
01156
01157
01158 void ProgramBagCL::SetMarginCopyParam(int xmax, int ymax)
01159 {
01160
01161 }
01162
01163 void ProgramBagCL::SetGradPassParam(int texP)
01164 {
01165
01166 }
01167
01168 void ProgramBagCL::SetGenListEndParam(int ktex)
01169 {
01170
01171 }
01172
01173 void ProgramBagCL::SetDogTexParam(int texU, int texD)
01174 {
01175
01176 }
01177
01178 void ProgramBagCL::SetGenListInitParam(int w, int h)
01179 {
01180 float bbox[4] = {(w -1.0f) * 0.5f +0.25f, (w-1.0f) * 0.5f - 0.25f, (h - 1.0f) * 0.5f + 0.25f, (h-1.0f) * 0.5f - 0.25f};
01181
01182 }
01183
01184
01185 void ProgramBagCL::SetGenListStartParam(float width, int tex0)
01186 {
01187
01188 }
01189
01190
01191
01192 void ProgramBagCL::SetGenListStepParam(int tex, int tex0)
01193 {
01194
01195 }
01196
01197 void ProgramBagCL::SetGenVBOParam(float width, float fwidth, float size)
01198 {
01199
01200 }
01201
01202 void ProgramBagCL::SetSimpleOrientationInput(int oTex, float sigma, float sigma_step)
01203 {
01204
01205 }
01206
01207
01208 void ProgramBagCL::SetFeatureOrientationParam(int gtex, int width, int height, float sigma, int otex, float step)
01209 {
01210
01211
01212 }
01213
01214 void ProgramBagCL::SetFeatureDescirptorParam(int gtex, int otex, float dwidth, float fwidth, float width, float height, float sigma)
01215 {
01216
01217 }
01218
01219
01220
01221 const char* ProgramBagCL::GetErrorString(cl_int error)
01222 {
01223 static const char* errorString[] = {
01224 "CL_SUCCESS",
01225 "CL_DEVICE_NOT_FOUND",
01226 "CL_DEVICE_NOT_AVAILABLE",
01227 "CL_COMPILER_NOT_AVAILABLE",
01228 "CL_MEM_OBJECT_ALLOCATION_FAILURE",
01229 "CL_OUT_OF_RESOURCES",
01230 "CL_OUT_OF_HOST_MEMORY",
01231 "CL_PROFILING_INFO_NOT_AVAILABLE",
01232 "CL_MEM_COPY_OVERLAP",
01233 "CL_IMAGE_FORMAT_MISMATCH",
01234 "CL_IMAGE_FORMAT_NOT_SUPPORTED",
01235 "CL_BUILD_PROGRAM_FAILURE",
01236 "CL_MAP_FAILURE",
01237 "",
01238 "",
01239 "",
01240 "",
01241 "",
01242 "",
01243 "",
01244 "",
01245 "",
01246 "",
01247 "",
01248 "",
01249 "",
01250 "",
01251 "",
01252 "",
01253 "",
01254 "CL_INVALID_VALUE",
01255 "CL_INVALID_DEVICE_TYPE",
01256 "CL_INVALID_PLATFORM",
01257 "CL_INVALID_DEVICE",
01258 "CL_INVALID_CONTEXT",
01259 "CL_INVALID_QUEUE_PROPERTIES",
01260 "CL_INVALID_COMMAND_QUEUE",
01261 "CL_INVALID_HOST_PTR",
01262 "CL_INVALID_MEM_OBJECT",
01263 "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
01264 "CL_INVALID_IMAGE_SIZE",
01265 "CL_INVALID_SAMPLER",
01266 "CL_INVALID_BINARY",
01267 "CL_INVALID_BUILD_OPTIONS",
01268 "CL_INVALID_PROGRAM",
01269 "CL_INVALID_PROGRAM_EXECUTABLE",
01270 "CL_INVALID_KERNEL_NAME",
01271 "CL_INVALID_KERNEL_DEFINITION",
01272 "CL_INVALID_KERNEL",
01273 "CL_INVALID_ARG_INDEX",
01274 "CL_INVALID_ARG_VALUE",
01275 "CL_INVALID_ARG_SIZE",
01276 "CL_INVALID_KERNEL_ARGS",
01277 "CL_INVALID_WORK_DIMENSION",
01278 "CL_INVALID_WORK_GROUP_SIZE",
01279 "CL_INVALID_WORK_ITEM_SIZE",
01280 "CL_INVALID_GLOBAL_OFFSET",
01281 "CL_INVALID_EVENT_WAIT_LIST",
01282 "CL_INVALID_EVENT",
01283 "CL_INVALID_OPERATION",
01284 "CL_INVALID_GL_OBJECT",
01285 "CL_INVALID_BUFFER_SIZE",
01286 "CL_INVALID_MIP_LEVEL",
01287 "CL_INVALID_GLOBAL_WORK_SIZE",
01288 };
01289
01290 const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
01291
01292 const int index = -error;
01293
01294 return (index >= 0 && index < errorCount) ? errorString[index] : "";
01295 }
01296
01297 bool ProgramBagCL::CheckErrorCL(cl_int error, const char* location)
01298 {
01299 if(error == CL_SUCCESS) return true;
01300 const char *errstr = GetErrorString(error);
01301 if(errstr && errstr[0]) std::cerr << errstr;
01302 else std::cerr << "Error " << error;
01303 if(location) std::cerr << " at " << location;
01304 std::cerr << "\n";
01305 exit(0);
01306 return false;
01307
01308 }
01309
01310
01313
01314 void ProgramBagCLN::LoadFixedShaders()
01315 {
01316 s_sampling = new ProgramCL("sampling",
01317 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01318 "__kernel void sampling(__read_only image2d_t input, __write_only image2d_t output, "
01319 " int width, int height) {\n"
01320 "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
01321 "if( coord.x >= width || coord.y >= height) return;\n"
01322 "write_imagef(output, coord, read_imagef(input, sampler, coord << 1)); }" , _context, _device);
01323
01324 s_sampling_k = new ProgramCL("sampling_k",
01325 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01326 "__kernel void sampling_k(__read_only image2d_t input, __write_only image2d_t output, "
01327 " int width, int height, int step) {\n"
01328 "int x = get_global_id(0), y = get_global_id(1); \n"
01329 "if( x >= width || y >= height) return;\n"
01330 "int xa = x * step, ya = y *step; \n"
01331 "float4 v1 = read_imagef(input, sampler, (int2) (xa, ya)); \n"
01332 "write_imagef(output, (int2) (x, y), v1); }" , _context, _device);
01333
01334
01335 s_sampling_u = new ProgramCL("sampling_u",
01336 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
01337 "__kernel void sampling_u(__read_only image2d_t input, \n"
01338 " __write_only image2d_t output,\n"
01339 " int width, int height, float step) {\n"
01340 "int x = get_global_id(0), y = get_global_id(1); \n"
01341 "if( x >= width || y >= height) return;\n"
01342 "float xa = x * step, ya = y *step; \n"
01343 "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
01344 "write_imagef(output, (int2) (x, y), (float4)(v1)); }" , _context, _device);
01345
01346 s_dog_pass = new ProgramCL("dog_pass",
01347 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01348 "__kernel void dog_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
01349 " __write_only image2d_t dog, int width, int height) {\n"
01350 "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
01351 "if( coord.x >= width || coord.y >= height) return;\n"
01352 "float cc = read_imagef(tex , sampler, coord).x; \n"
01353 "float cp = read_imagef(texp, sampler, coord).x;\n"
01354 "write_imagef(dog, coord, (float4)(cc - cp)); }\n", _context, _device);
01355
01356 s_grad_pass = new ProgramCL("grad_pass",
01357 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01358 "__kernel void grad_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
01359 " __write_only image2d_t dog, int width, int height, \n"
01360 " __write_only image2d_t grad, __write_only image2d_t rot) {\n"
01361 "int x = get_global_id(0), y = get_global_id(1); \n"
01362 "if( x >= width || y >= height) return;\n"
01363 "int2 coord = (int2) (x, y);\n"
01364 "float cl = read_imagef(tex, sampler, (int2)(x - 1, y)).x;\n"
01365 "float cc = read_imagef(tex , sampler, coord).x; \n"
01366 "float cr = read_imagef(tex, sampler, (int2)(x + 1, y)).x;\n"
01367 "float cp = read_imagef(texp, sampler, coord).x;\n"
01368 "write_imagef(dog, coord, (float4)(cc - cp)); \n"
01369 "float cd = read_imagef(tex, sampler, (int2)(x, y - 1)).x;\n"
01370 "float cu = read_imagef(tex, sampler, (int2)(x, y + 1)).x;\n"
01371 "float dx = cr - cl, dy = cu - cd; \n"
01372 "float gg = 0.5 * sqrt(dx*dx + dy * dy);\n"
01373 "write_imagef(grad, coord, (float4)(gg));\n"
01374 "float oo = atan2(dy, dx + FLT_MIN);\n"
01375 "write_imagef(rot, coord, (float4)(oo));}\n", _context, _device);
01376
01377 s_grad_pass2 = new ProgramCL("grad_pass2",
01378 "#define BLOCK_DIMX 32\n"
01379 "#define BLOCK_DIMY 14\n"
01380 "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
01381 "__kernel void grad_pass2(__read_only image2d_t tex, __read_only image2d_t texp,\n"
01382 " __write_only image2d_t dog, int width, int height,\n"
01383 " __write_only image2d_t grd, __write_only image2d_t rot){\n"
01384 "__local float block[BLOCK_SIZE]; \n"
01385 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
01386 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01387 "int2 coord = (int2) ( get_global_id(0) - get_group_id(0) * 2 - 1, \n"
01388 " get_global_id(1) - get_group_id(1) * 2 - 1); \n"
01389 "int idx = mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
01390 "float cc = read_imagef(tex, sampler, coord).x;\n"
01391 "block[idx] = cc;\n"
01392 "barrier(CLK_LOCAL_MEM_FENCE);\n"
01393 "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
01394 "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
01395 "if( coord.x >= width) return; \n"
01396 "if( coord.y >= height) return;\n"
01397 "float cp = read_imagef(texp, sampler, coord).x;\n"
01398 "float dx = block[idx + 1] - block[idx - 1];\n"
01399 "float dy = block[idx + BLOCK_DIMX ] - block[idx - BLOCK_DIMX];\n"
01400 "write_imagef(dog, coord, (float4)(cc - cp)); \n"
01401 "write_imagef(grd, coord, (float4)(0.5 * sqrt(dx*dx + dy * dy)));\n"
01402 "write_imagef(rot, coord, (float4)(atan2(dy, dx + FLT_MIN)));}\n", _context, _device);
01403 }
01404
01405 void ProgramBagCLN::LoadDisplayShaders()
01406 {
01407 s_unpack = new ProgramCL("main",
01408 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01409 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
01410 " int width, int height) {\n"
01411 "int x = get_global_id(0), y = get_global_id(1); \n"
01412 "if(x >= width || y >= height) return;\n"
01413 "float v = read_imagef(input, sampler, (int2) (x, y)).x; \n"
01414 "float4 result = (float4) (v, v, v, 1);"
01415 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
01416
01417 s_unpack_grd = new ProgramCL("main",
01418 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
01419 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01420 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
01421 " int width, int height) {\n"
01422 "int x = get_global_id(0), y = get_global_id(1); \n"
01423 "if(x >= width || y >= height) return;\n"
01424 "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
01425 "float v = 5.0 * v0; float4 result = (float4) (v, v, v, 1);"
01426 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
01427
01428 s_unpack_dog = new ProgramCL("main",
01429 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01430 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
01431 " int width, int height) {\n"
01432 "int x = get_global_id(0), y = get_global_id(1); \n"
01433 "if(x >= width || y >= height) return;\n"
01434 "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
01435 "float v = 0.5 + 20.0 * v0; float4 result = (float4) (v, v, v, 1);"
01436 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
01437 }
01438
01439 ProgramCL* ProgramBagCLN::CreateFilterH(float kernel[], int width)
01440 {
01442 char buffer[10240];
01443 ostrstream out(buffer, 10240);
01444 out << "#define KERNEL_WIDTH " << width << "\n"
01445 << "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n"
01446 "#define BLOCK_WIDTH 128\n"
01447 "#define BLOCK_HEIGHT 1\n"
01448 "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
01449 "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
01450 "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
01451 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01452 "__kernel void filter_h(__read_only image2d_t input, \n"
01453 " __write_only image2d_t output, int width_, int height_, \n"
01454 " __constant float* weight) {\n"
01455 "__local float data[CACHE_WIDTH]; \n"
01456 "int x = get_global_id(0), y = get_global_id(1);\n"
01457 "#pragma unroll\n"
01458 "for(int j = 0; j < CACHE_COUNT; ++j)\n"
01459 "{\n"
01460 " if(get_local_id(0) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
01461 " {\n"
01462 " int fetch_index = min(x + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, width_);\n"
01463 " data[get_local_id(0) + j * BLOCK_WIDTH] = read_imagef(input, sampler, (int2)(fetch_index, y)).x;\n"
01464 " }\n"
01465 "}\n"
01466 "barrier(CLK_LOCAL_MEM_FENCE); \n"
01467 "if( x > width_ || y > height_) return; \n"
01468 "float result = 0; \n"
01469 "#pragma unroll\n"
01470 "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
01471 "{\n"
01472 " result += data[get_local_id(0) + i] * weight[i];\n"
01473 "}\n"
01474 << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
01475 return new ProgramCL("filter_h", buffer, _context, _device);
01476 }
01477
01478
01479
01480 ProgramCL* ProgramBagCLN::CreateFilterV(float kernel[], int width)
01481 {
01483 char buffer[10240];
01484 ostrstream out(buffer, 10240);
01485 out << "#define KERNEL_WIDTH " << width << "\n"
01486 << "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n"
01487 "#define BLOCK_WIDTH 128\n"
01488 "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
01489 "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
01490 "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
01491 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
01492 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01493 "__kernel void filter_v(__read_only image2d_t input, \n"
01494 " __write_only image2d_t output, int width_, int height_, \n"
01495 " __constant float* weight) {\n"
01496 "__local float data[CACHE_WIDTH]; \n"
01497 "int x = get_global_id(0), y = get_global_id(1);\n"
01498 "#pragma unroll\n"
01499 "for(int j = 0; j < CACHE_COUNT; ++j)\n"
01500 "{\n"
01501 " if(get_local_id(1) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
01502 " {\n"
01503 " int fetch_index = min(y + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, height_);\n"
01504 " data[get_local_id(1) + j * BLOCK_WIDTH ] = read_imagef(input, sampler, (int2)(x, fetch_index)).x;\n"
01505 " }\n"
01506 "}\n"
01507 "barrier(CLK_LOCAL_MEM_FENCE); \n"
01508 "if( x > width_ || y > height_) return; \n"
01509 "float result = 0; \n"
01510 "#pragma unroll\n"
01511 "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
01512 "{\n"
01513 " result += data[get_local_id(1) + i] * weight[i];\n"
01514 "}\n"
01515 << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
01516
01517 return new ProgramCL("filter_v", buffer, _context, _device);
01518 }
01519
01520 FilterCL* ProgramBagCLN::CreateFilter(float kernel[], int width)
01521 {
01522 FilterCL * filter = new FilterCL;
01523 filter->s_shader_h = CreateFilterH(kernel, width);
01524 filter->s_shader_v = CreateFilterV(kernel, width);
01525 filter->_weight = new CLTexImage(_context, _queue);
01526 filter->_weight->InitBufferTex(width, 1, 1);
01527 filter->_weight->CopyFromHost(kernel);
01528 filter->_size = width;
01529 return filter;
01530 }
01531
01532
01533 void ProgramBagCLN::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
01534 {
01535 cl_kernel kernelh = filter->s_shader_h->_kernel;
01536 cl_kernel kernelv = filter->s_shader_v->_kernel;
01538
01539 cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
01540 cl_mem weight = (cl_mem) filter->_weight->_clData;
01541 cl_int w_ = w - 1, h_ = h - 1;
01542
01543
01544 clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
01545 clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
01546 clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
01547 clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
01548 clSetKernelArg(kernelh, 4, sizeof(cl_mem), &weight);
01549
01550 size_t dim00 = 128, dim01 = 1;
01551 size_t gsz1[2] = {(w + dim00 - 1) / dim00 * dim00, (h + dim01 - 1) / dim01 * dim01}, lsz1[2] = {dim00, dim01};
01552 status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz1, lsz1, 0, NULL, NULL);
01553 CheckErrorCL(status, "ProgramBagCLN::FilterImageH");
01554 if(status != CL_SUCCESS) return;
01555
01556
01557 clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
01558 clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
01559 clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
01560 clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
01561 clSetKernelArg(kernelv, 4, sizeof(cl_mem), &weight);
01562
01563 size_t dim10 = 1, dim11 = 128;
01564 size_t gsz2[2] = {(w + dim10 - 1) / dim10 * dim10, (h + dim11 - 1) / dim11 * dim11}, lsz2[2] = {dim10, dim11};
01565 status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
01566 CheckErrorCL(status, "ProgramBagCLN::FilterImageV");
01567
01568 }
01569
01570 void ProgramBagCLN::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
01571 {
01572 cl_kernel kernel;
01573 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
01574
01575 cl_int fullstep = (1 << log_scale);
01576 kernel = log_scale == 1? s_sampling->_kernel : s_sampling_k->_kernel;
01577 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
01578 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
01579 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
01580 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
01581 if(log_scale > 1) clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
01582
01583 size_t dim0 = 128, dim1 = 1;
01584
01585 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
01586 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
01587 CheckErrorCL(status, "ProgramBagCLN::SampleImageD");
01588 }
01589
01590
01591 #endif
01592