rgbd_registration: ProgramCL.cpp Source File

Go to the documentation of this file.
00001 
00002 //      File:           ProgramCL.cpp
00003 //      Author:         Changchang Wu
00004 //      Description :   implementation of CL related class.
00005 //                          class ProgramCL                     A simple wrapper of Cg programs
00006 //
00007 //      Copyright (c) 2007 University of North Carolina at Chapel Hill
00008 //      All Rights Reserved
00009 //
00010 //      Permission to use, copy, modify and distribute this software and its
00011 //      documentation for educational, research and non-profit purposes, without
00012 //      fee, and without a written agreement is hereby granted, provided that the
00013 //      above copyright notice and the following paragraph appear in all copies.
00014 //      
00015 //      The University of North Carolina at Chapel Hill make no representations
00016 //      about the suitability of this software for any purpose. It is provided
00017 //      'as is' without express or implied warranty. 
00018 //
00019 //      Please send BUG REPORTS to ccwu@cs.unc.edu
00020 //
00022 
00023 #if defined(CL_SIFTGPU_ENABLED) 
00024 
00025 #include <CL/opencl.h>
00026 #include <GL/glew.h>
00027 
00028 #include <iostream>
00029 #include <iomanip>
00030 #include <vector>
00031 #include <strstream>
00032 #include <algorithm>
00033 #include <stdlib.h>
00034 #include <math.h>
00035 #include <string.h>
00036 using namespace std;
00037 
00038 #include "GlobalUtil.h"
00039 #include "CLTexImage.h"
00040 #include "ProgramCL.h"
00041 #include "SiftGPU.h"
00042 
00043 
00044 #if  defined(_WIN32) 
00045         #pragma comment (lib, "OpenCL.lib")
00046 #endif
00047 
00048 #ifndef _INC_WINDOWS
00049 #ifndef WIN32_LEAN_AND_MEAN
00050         #define WIN32_LEAN_AND_MEAN
00051 #endif
00052 #include <windows.h>
00053 #endif 
00054 
00056 // Construction/Destruction
00058 
00059 ProgramCL::ProgramCL()
00060 {
00061         _program = NULL;
00062     _kernel = NULL;
00063     _valid = 0;
00064 }
00065 
00066 ProgramCL::~ProgramCL()
00067 {
00068     if(_kernel)  clReleaseKernel(_kernel);
00069     if(_program) clReleaseProgram(_program);
00070 }
00071 
00072 ProgramCL::ProgramCL(const char* name, const char * code, cl_context context, cl_device_id device) : _valid(1)
00073 {
00074     const char * src[1] = {code};     cl_int status;
00075 
00076     _program = clCreateProgramWithSource(context, 1, src, NULL, &status);
00077     if(status != CL_SUCCESS) _valid = 0;
00078 
00079     status = clBuildProgram(_program, 0, NULL, 
00080         GlobalUtil::_debug ? 
00081         "-cl-fast-relaxed-math -cl-single-precision-constant -cl-nv-verbose" : 
00082         "-cl-fast-relaxed-math -cl-single-precision-constant", NULL, NULL);
00083 
00084     if(status != CL_SUCCESS) {PrintBuildLog(device, 1); _valid = 0;}
00085     else if(GlobalUtil::_debug) PrintBuildLog(device, 0); 
00086 
00087     _kernel = clCreateKernel(_program, name, &status); 
00088     if(status != CL_SUCCESS) _valid = 0;
00089 }
00090 
00091 void ProgramCL::PrintBuildLog(cl_device_id device, int all)
00092 {
00093     char buffer[10240] = "\0"; 
00094     cl_int status = clGetProgramBuildInfo(
00095         _program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
00096     if(all )  
00097     {
00098         std::cerr << buffer  << endl; 
00099     }else
00100     {
00101         const char * pos = strstr(buffer, "ptxas");
00102         if(pos) std::cerr << pos << endl; 
00103     }
00104 }
00105 
00108 
00109 ProgramBagCL::ProgramBagCL()
00110 {
00112     _context = NULL;   _queue = NULL;
00113     s_gray = s_sampling = NULL;
00114     s_packup = s_zero_pass = NULL;
00115     s_gray_pack = s_unpack = NULL;
00116     s_sampling_u = NULL;
00117     s_dog_pass   = NULL;
00118     s_grad_pass  = NULL;
00119     s_grad_pass2 = NULL;
00120     s_unpack_dog = NULL;
00121     s_unpack_grd = NULL;
00122     s_unpack_key = NULL;
00123     s_keypoint = NULL;
00124     f_gaussian_skip0 = NULL;
00125     f_gaussian_skip1 = NULL;
00126     f_gaussian_step = 0;
00127 
00129         GlobalUtil::StartTimer("Initialize OpenCL");
00130     if(!InitializeContext()) return;
00131     GlobalUtil::StopTimer();
00132 
00133 }
00134 
00135 
00136 
00137 ProgramBagCL::~ProgramBagCL()
00138 {
00139     if(s_gray)      delete s_gray;
00140         if(s_sampling)  delete s_sampling;
00141         if(s_zero_pass) delete s_zero_pass;
00142     if(s_packup)    delete s_packup;
00143     if(s_unpack)    delete s_unpack;
00144     if(s_gray_pack) delete s_gray_pack;
00145     if(s_sampling_u)  delete s_sampling_u;
00146     if(s_dog_pass)  delete s_dog_pass;
00147     if(s_grad_pass)  delete s_grad_pass;
00148     if(s_grad_pass2)  delete s_grad_pass2;
00149     if(s_unpack_dog) delete s_unpack_dog;
00150     if(s_unpack_grd) delete s_unpack_grd;
00151     if(s_unpack_key) delete s_unpack_key;
00152     if(s_keypoint)   delete s_keypoint;
00153 
00154     if(f_gaussian_skip1) delete f_gaussian_skip1;
00155 
00156     for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
00157     {
00158             if(f_gaussian_skip0_v[i]) delete f_gaussian_skip0_v[i];
00159     }
00160     if(f_gaussian_step && _gaussian_step_num > 0) 
00161     {
00162             for(int i = 0; i< _gaussian_step_num; i++)
00163             {
00164                     delete f_gaussian_step[i];
00165             }
00166             delete[] f_gaussian_step;
00167     }
00168 
00170     if(_context) clReleaseContext(_context);
00171     if(_queue) clReleaseCommandQueue(_queue);
00172 }
00173 
00174 bool ProgramBagCL::InitializeContext()
00175 {
00176     cl_uint num_platform, num_device;
00177     cl_int status;
00178     // Get OpenCL platform count
00179     status = clGetPlatformIDs (0, NULL, &num_platform);
00180     if (status != CL_SUCCESS || num_platform == 0) return false; 
00181 
00182     cl_platform_id platforms[16];
00183     if(num_platform > 16 ) num_platform = 16;
00184     status = clGetPlatformIDs (num_platform, platforms, NULL);
00185     _platform = platforms[0];
00186 
00188     status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_device);
00189     if(status != CL_SUCCESS || num_device == 0) return false;
00190 
00191     // Create the device list
00192     cl_device_id* devices = new cl_device_id [num_device];
00193     status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, num_device, devices, NULL);
00194     _device = (status == CL_SUCCESS? devices[0] : 0);  delete[] devices;   
00195     if(status != CL_SUCCESS)  return false;  
00196 
00197 
00198     if(GlobalUtil::_verbose)
00199     {
00200         cl_device_mem_cache_type is_gcache; 
00201         clGetDeviceInfo(_device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(is_gcache), &is_gcache, NULL);
00202         if(is_gcache == CL_NONE) std::cout << "No cache for global memory\n";
00203         //else if(is_gcache == CL_READ_ONLY_CACHE) std::cout << "Read only cache for global memory\n";
00204         //else std::cout << "Read/Write cache for global memory\n";
00205     }
00206 
00207     //context;
00208     if(GlobalUtil::_UseSiftGPUEX)
00209     {
00210         cl_context_properties prop[] = {
00211         CL_CONTEXT_PLATFORM, (cl_context_properties)_platform,
00212         CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
00213         CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),  0 };
00214         _context = clCreateContext(prop, 1, &_device, NULL, NULL, &status);    
00215         if(status != CL_SUCCESS) return false;
00216     }else
00217     {
00218         _context = clCreateContext(0, 1, &_device, NULL, NULL, &status);    
00219         if(status != CL_SUCCESS) return false;
00220     }
00221 
00222     //command queue
00223     _queue = clCreateCommandQueue(_context, _device, 0, &status);
00224     return status == CL_SUCCESS;
00225 }
00226 
00227 void ProgramBagCL::InitProgramBag(SiftParam&param)
00228 {
00229         GlobalUtil::StartTimer("Load Programs");
00230     LoadFixedShaders();
00231     LoadDynamicShaders(param);
00232     if(GlobalUtil::_UseSiftGPUEX) LoadDisplayShaders();
00233     GlobalUtil::StopTimer();
00234 }
00235 
00236 
00237 void ProgramBagCL::UnloadProgram()
00238 {
00239 
00240 }
00241 
00242 void ProgramBagCL::FinishCL()
00243 {
00244     clFinish(_queue);
00245 }
00246 
00247 void ProgramBagCL::LoadFixedShaders()
00248 {
00249 
00250  
00251     s_gray = new ProgramCL( "gray", 
00252         "__kernel void gray(__read_only  image2d_t input, __write_only image2d_t output) {\n"
00253         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
00254         "int2 coord = (int2)(get_global_id(0),  get_global_id(1));\n"
00255         "float4 weight = (float4)(0.299, 0.587, 0.114, 0.0);\n"
00256         "float intensity = dot(weight, read_imagef(input,sampler, coord ));\n"
00257             "float4 result= (float4)(intensity, intensity, intensity, 1.0);\n"
00258         "write_imagef(output, coord, result); }", _context, _device     );
00259 
00260 
00261         s_sampling = new ProgramCL("sampling",
00262         "__kernel void sampling(__read_only  image2d_t input, __write_only image2d_t output,\n"
00263         "                   int width, int height) {\n"
00264         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
00265         "int x = get_global_id(0), y =  get_global_id(1); \n"
00266         "if( x >= width || y >= height) return;\n"
00267         "int xa = x + x,   ya = y + y; \n"
00268         "int xb = xa + 1,  yb = ya + 1; \n"
00269         "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
00270         "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
00271         "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
00272         "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
00273         "float4 result = (float4) (v1, v2, v3, v4);"
00274         "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
00275 
00276         s_sampling_k = new ProgramCL("sampling_k",
00277         "__kernel void sampling_k(__read_only  image2d_t input, __write_only image2d_t output, "
00278         "                   int width, int height,\n"
00279         "                   int step,  int halfstep) {\n"
00280         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
00281         "int x = get_global_id(0), y =  get_global_id(1); \n"
00282         "if( x >= width || y >= height) return;\n"
00283         "int xa = x * step,   ya = y *step; \n"
00284         "int xb = xa + halfstep,  yb = ya + halfstep; \n"
00285         "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
00286         "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
00287         "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
00288         "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
00289         "float4 result = (float4) (v1, v2, v3, v4);"
00290         "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
00291 
00292 
00293         s_sampling_u = new ProgramCL("sampling_u",
00294         "__kernel void sampling_u(__read_only  image2d_t input, \n"
00295         "                   __write_only image2d_t output,\n"
00296         "                   int width, int height,\n"
00297         "                   float step, float halfstep) {\n"
00298         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
00299         "int x = get_global_id(0), y =  get_global_id(1); \n"
00300         "if( x >= width || y >= height) return;\n"
00301         "float xa = x * step,       ya = y *step; \n"
00302         "float xb = xa + halfstep,  yb = ya + halfstep; \n"
00303         "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
00304         "float v2 = read_imagef(input, sampler, (float2) (xb, ya)).x; \n"
00305         "float v3 = read_imagef(input, sampler, (float2) (xa, yb)).x; \n"
00306         "float v4 = read_imagef(input, sampler, (float2) (xb, yb)).x; \n"
00307         "float4 result = (float4) (v1, v2, v3, v4);"
00308         "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
00309 
00310 
00311     s_zero_pass = new ProgramCL("zero_pass",
00312         "__kernel void zero_pass(__write_only image2d_t output){\n"
00313         "int2 coord = (int2)(get_global_id(0),  get_global_id(1));\n"
00314         "write_imagef(output, coord, (float4)(0.0));}", _context, _device);
00315 
00316     s_packup = new ProgramCL("packup",
00317         "__kernel void packup(__global float* input, __write_only image2d_t output,\n"
00318         "                   int twidth, int theight, int width){\n"
00319         "int2 coord = (int2)(get_global_id(0),  get_global_id(1));\n"
00320         "if(coord.x >= twidth || coord.y >= theight) return;\n"
00321         "int index0 = (coord.y + coord.y) * width; \n"
00322         "int index1 = index0 + coord.x;\n"
00323         "int x2 = min(width -1, coord.x); \n"
00324         "float v1 = input[index1 + coord.x], v2 = input[index1 + x2]; \n"
00325         "int index2 = index1 + width; \n"
00326         "float v3 = input[index2 + coord.x], v4 = input[index2 + x2]; \n "
00327         "write_imagef(output, coord, (float4) (v1, v2, v3, v4));}", _context, _device);
00328 
00329    s_dog_pass = new ProgramCL("dog_pass",
00330         "__kernel void dog_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
00331         "                   __write_only image2d_t dog, int width, int height) {\n"
00332         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
00333         "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" 
00334         "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
00335         "if( coord.x >= width || coord.y >= height) return;\n"
00336         "float4 cc = read_imagef(tex , sampler, coord); \n"
00337         "float4 cp = read_imagef(texp, sampler, coord);\n"
00338         "write_imagef(dog, coord, cc - cp); }\n", _context, _device); 
00339 
00340    s_grad_pass = new ProgramCL("grad_pass",
00341         "__kernel void grad_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
00342         "                   __write_only image2d_t dog, int width, int height,\n"
00343         "                    __write_only image2d_t grad, __write_only image2d_t rot) {\n"
00344         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
00345         "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" 
00346         "int x = get_global_id(0), y =  get_global_id(1); \n"
00347         "if( x >= width || y >= height) return;\n"
00348         "int2 coord = (int2) (x, y);\n"
00349         "float4 cc = read_imagef(tex , sampler, coord); \n"
00350         "float4 cp = read_imagef(texp, sampler, coord);\n"
00351         "float2 cl = read_imagef(tex, sampler, (int2)(x - 1, y)).yw;\n"
00352         "float2 cr = read_imagef(tex, sampler, (int2)(x + 1, y)).xz;\n"
00353             "float2 cd = read_imagef(tex, sampler, (int2)(x, y - 1)).zw;\n"
00354         "float2 cu = read_imagef(tex, sampler, (int2)(x, y + 1)).xy;\n"
00355         "write_imagef(dog, coord, cc - cp); \n"
00356         "float4 dx = (float4)(cc.y - cl.x,  cr.x - cc.x, cc.w - cl.y, cr.y - cc.z);\n"
00357         "float4 dy = (float4)(cc.zw - cd.xy, cu.xy - cc.xy);\n"
00358         "write_imagef(grad, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
00359         "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device); 
00360 
00361     s_grad_pass2 = new ProgramCL("grad_pass2",
00362         "#define BLOCK_DIMX 32\n"
00363         "#define BLOCK_DIMY 14\n"
00364         "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
00365         "__kernel void grad_pass2(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
00366         "                   __write_only image2d_t dog, int width, int height,\n"
00367         "                   __write_only image2d_t grd, __write_only image2d_t rot){\n"//,  __local float* block) {\n"
00368         "__local float block[BLOCK_SIZE * 4]; \n"
00369         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
00370         "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" 
00371         "int2 coord = (int2) (  get_global_id(0) - get_group_id(0) * 2 - 1, \n"
00372         "                       get_global_id(1) - get_group_id(1) * 2- 1); \n"
00373         "int idx =  mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
00374         "float4 cc = read_imagef(tex, sampler, coord);\n"
00375         "block[idx                 ] = cc.x;\n"
00376         "block[idx + BLOCK_SIZE    ] = cc.y;\n"
00377         "block[idx + BLOCK_SIZE * 2] = cc.z;\n"
00378         "block[idx + BLOCK_SIZE * 3] = cc.w;\n"
00379         "barrier(CLK_LOCAL_MEM_FENCE);\n"
00380         "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
00381         "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
00382         "if( coord.x >= width) return; \n"
00383         "if( coord.y >= height) return;\n"
00384         "float4 cp = read_imagef(texp, sampler, coord);\n"
00385         "float4 dx = (float4)(  cc.y - block[idx - 1 + BLOCK_SIZE], \n"
00386         "                       block[idx + 1] - cc.x, \n"
00387         "                       cc.w - block[idx - 1 + 3 * BLOCK_SIZE], \n"
00388         "                       block[idx + 1 + 2 * BLOCK_SIZE] - cc.z);\n"
00389         "float4 dy = (float4)(  cc.z - block[idx - BLOCK_DIMX + 2 * BLOCK_SIZE], \n"
00390         "                       cc.w - block[idx - BLOCK_DIMX + 3 * BLOCK_SIZE],"
00391         //"                       cc.zw - block[idx - BLOCK_DIMX].zw, \n"
00392         "                       block[idx + BLOCK_DIMX] - cc.x,\n "
00393         "                       block[idx + BLOCK_DIMX + BLOCK_SIZE] - cc.y);\n"
00394         //"                       block[idx + BLOCK_DIMX].xy - cc.xy);\n"
00395         "write_imagef(dog, coord, cc - cp); \n"
00396         "write_imagef(grd, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
00397         "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device); 
00398 }
00399 
00400 void ProgramBagCL::LoadDynamicShaders(SiftParam& param)
00401 {
00402     LoadKeypointShader();
00403     LoadGenListShader(param._dog_level_num, 0);
00404     CreateGaussianFilters(param);
00405 }
00406 
00407 
00408 void ProgramBagCL::SelectInitialSmoothingFilter(int octave_min, SiftParam&param)
00409 {
00410     float sigma = param.GetInitialSmoothSigma(octave_min);
00411     if(sigma == 0) 
00412     {
00413        f_gaussian_skip0 = NULL; 
00414     }else
00415     {
00416             for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
00417             {
00418                     if(f_gaussian_skip0_v[i]->_id == octave_min)
00419                     {
00420                             f_gaussian_skip0 = f_gaussian_skip0_v[i];
00421                             return ;
00422                     }
00423             }
00424             FilterCL * filter = CreateGaussianFilter(sigma); 
00425             filter->_id = octave_min;
00426             f_gaussian_skip0_v.push_back(filter);
00427             f_gaussian_skip0 = filter; 
00428     }
00429 
00430 }
00431 
00432 void ProgramBagCL::CreateGaussianFilters(SiftParam&param)
00433 {
00434         if(param._sigma_skip0>0.0f) 
00435         {
00436                 f_gaussian_skip0 = CreateGaussianFilter(param._sigma_skip0);
00437                 f_gaussian_skip0->_id = GlobalUtil::_octave_min_default; 
00438                 f_gaussian_skip0_v.push_back(f_gaussian_skip0);
00439         }
00440         if(param._sigma_skip1>0.0f) 
00441         {
00442                 f_gaussian_skip1 = CreateGaussianFilter(param._sigma_skip1);
00443         }
00444 
00445         f_gaussian_step = new FilterCL*[param._sigma_num];
00446         for(int i = 0; i< param._sigma_num; i++)
00447         {
00448                 f_gaussian_step[i] =  CreateGaussianFilter(param._sigma[i]);
00449         }
00450     _gaussian_step_num = param._sigma_num;
00451 }
00452 
00453 
00454 FilterCL* ProgramBagCL::CreateGaussianFilter(float sigma)
00455 {
00456         //pixel inside 3*sigma box
00457         int sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
00458         int width = 2*sz + 1;
00459 
00460         //filter size truncation
00461         if(GlobalUtil::_MaxFilterWidth >0 && width > GlobalUtil::_MaxFilterWidth)
00462         {
00463                 std::cout<<"Filter size truncated from "<<width<<" to "<<GlobalUtil::_MaxFilterWidth<<endl;
00464                 sz = GlobalUtil::_MaxFilterWidth>>1;
00465                 width = 2 * sz + 1;
00466         }
00467 
00468         int i;
00469         float * kernel = new float[width];
00470         float   rv = 1.0f/(sigma*sigma);
00471         float   v, ksum =0; 
00472 
00473         // pre-compute filter
00474         for( i = -sz ; i <= sz ; ++i) 
00475         {
00476                 kernel[i+sz] =  v = exp(-0.5f * i * i *rv) ;
00477                 ksum += v;
00478         }
00479 
00480         //normalize the kernel
00481         rv = 1.0f / ksum;
00482         for(i = 0; i< width ;i++) kernel[i]*=rv;
00483 
00484     FilterCL * filter = CreateFilter(kernel, width);
00485     delete [] kernel;
00486     if(GlobalUtil::_verbose && GlobalUtil::_timingL) std::cout<<"Filter: sigma = "<<sigma<<", size = "<<width<<"x"<<width<<endl;
00487     return filter;
00488 }
00489 
00490 FilterCL*  ProgramBagCL::CreateFilter(float kernel[], int width)
00491 {
00492     FilterCL * filter = new FilterCL;
00493     filter->s_shader_h = CreateFilterH(kernel, width); 
00494     filter->s_shader_v = CreateFilterV(kernel, width);
00495     filter->_size = width; 
00496     filter->_id  = 0;
00497     return filter;
00498 }
00499 
00500 ProgramCL* ProgramBagCL::CreateFilterH(float kernel[], int width)
00501 {
00502         int halfwidth  = width >>1;
00503         float * pf = kernel + halfwidth;
00504         int nhpixel = (halfwidth+1)>>1; //how many neighbour pixels need to be looked up
00505         int npixel  = (nhpixel<<1)+1;//
00506         float weight[3];
00507 
00509         char buffer[10240];
00510         ostrstream out(buffer, 10240);
00511         out<<setprecision(8);
00512 
00513 
00514     //CL_DEVICE_IMAGE2D_MAX_WIDTH
00515         out<<
00516           "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
00517           "__kernel void filter_h(__read_only  image2d_t input, \n"
00518           "          __write_only image2d_t output, int width_, int height_) {\n"
00519           "int x = get_global_id(0);\n"
00520           "int y = get_global_id(1); \n"
00521           "if( x > width_ || y > height_) return; \n"
00522           "float4 pc; int2 coord; \n"
00523           "float4 result = (float4)(0.0);\n";
00524     for(int i = 0 ; i < npixel ; i++)
00525         {
00526                 out<<"coord = (int2)(x + ("<< (i - nhpixel) << "), y);\n";
00527                 out<<"pc= read_imagef(input, sampler, coord);\n";
00528                 if(GlobalUtil::_PreciseBorder)  
00529         out<<"if(coord.x < 0) pc = pc.xxzz; else if (coord.x > width_) pc = pc.yyww; \n";
00530                 //for each sub-pixel j  in center, the weight of sub-pixel k 
00531                 int xw = (i - nhpixel)*2;
00532                 for(int j = 0; j < 3; j++)
00533                 {
00534                         int xwn = xw  + j  -1;
00535                         weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
00536                 }
00537                 if(weight[1] == 0.0)
00538                 {
00539                         out<<"result += (float4)("<<weight[2]<<","<<weight[0]<<","<<weight[2]<<","<<weight[0]<<") * pc.yxwz;\n";
00540                 }
00541                 else
00542                 {
00543                         out<<"result += (float4)("<<weight[1]<<", "<<weight[0]<<", "<<weight[1]<<", "<<weight[0]<<") * pc.xxzz;\n";
00544                         out<<"result += (float4)("<<weight[2]<<", "<<weight[1]<<", "<<weight[2]<<", "<<weight[1]<<") * pc.yyww;\n";
00545                 }       
00546         }
00547     out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
00548         return new ProgramCL("filter_h", buffer, _context, _device); 
00549 }
00550 
00551 
00552 
00553 ProgramCL* ProgramBagCL::CreateFilterV(float kernel[], int width)
00554 {
00555 
00556         int halfwidth  = width >>1;
00557         float * pf = kernel + halfwidth;
00558         int nhpixel = (halfwidth+1)>>1; //how many neighbour pixels need to be looked up
00559         int npixel  = (nhpixel<<1)+1;//
00560         float weight[3];
00561 
00563         char buffer[10240];
00564         ostrstream out(buffer, 10240);
00565         out<<setprecision(8);
00566 
00567 
00568     //CL_DEVICE_IMAGE2D_MAX_WIDTH
00569         out<< 
00570           "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
00571           "__kernel void filter_v(__read_only  image2d_t input, \n"
00572           "          __write_only image2d_t output, int width_, int height_) {\n"
00573           "int x = get_global_id(0);\n"
00574           "int y = get_global_id(1); \n"
00575           "if( x > width_ || y >= height_) return; \n"
00576           "float4 pc; int2 coord; \n"
00577           "float4 result = (float4)(0.0);\n";
00578     for(int i = 0 ; i < npixel ; i++)
00579         {
00580                 out<<"coord = (int2)(x, y + ("<< (i - nhpixel) << "));\n";
00581                 out<<"pc= read_imagef(input, sampler, coord);\n";
00582                 if(GlobalUtil::_PreciseBorder)  
00583         out<<"if(coord.y < 0) pc = pc.xyxy; else if (coord.y > height_) pc = pc.zwzw; \n";
00584                 //for each sub-pixel j  in center, the weight of sub-pixel k 
00585                 int xw = (i - nhpixel)*2;
00586                 for(int j = 0; j < 3; j++)
00587                 {
00588                         int xwn = xw  + j  -1;
00589                         weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
00590                 }
00591                 if(weight[1] == 0.0)
00592                 {
00593                         out<<"result += (float4)("<<weight[2]<<","<<weight[2]<<","<<weight[0]<<","<<weight[0]<<") * pc.zwxy;\n";
00594                 }
00595                 else
00596                 {
00597                         out<<"result += (float4)("<<weight[1]<<", "<<weight[1]<<", "<<weight[0]<<", "<<weight[0]<<") * pc.xyxy;\n";
00598                         out<<"result += (float4)("<<weight[2]<<", "<<weight[2]<<", "<<weight[1]<<", "<<weight[1]<<") * pc.zwzw;\n";
00599                 }       
00600         }
00601     out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
00602         return new ProgramCL("filter_v", buffer, _context, _device); 
00603 
00604 }
00605 
00606 void ProgramBagCL::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
00607 {
00608     cl_kernel kernelh = filter->s_shader_h->_kernel;
00609     cl_kernel kernelv = filter->s_shader_v->_kernel;
00611 
00612     cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
00613     cl_int w_ = w - 1, h_ = h - 1; 
00614 
00615     size_t dim0 = 16, dim1 = 16;
00616     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00617 
00618     clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
00619     clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
00620     clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
00621     clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
00622     status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz, lsz, 0, NULL, NULL);
00623     CheckErrorCL(status, "ProgramBagCL::FilterImageH");
00624     if(status != CL_SUCCESS) return;
00625 
00626     clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
00627     clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
00628     clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
00629     clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
00630     size_t gsz2[2] = {(w + dim1 - 1) / dim1 * dim1, (h + dim0 - 1) / dim0 * dim0}, lsz2[2] = {dim1, dim0};
00631     status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
00632     CheckErrorCL(status, "ProgramBagCL::FilterImageV");
00633     //clReleaseEvent(event);
00634 }
00635 
00636 void ProgramBagCL::SampleImageU(CLTexImage *dst, CLTexImage *src, int log_scale)
00637 {
00638     cl_kernel  kernel= s_sampling_u->_kernel; 
00639     float scale  = 1.0f / (1 << log_scale);
00640     float offset = scale * 0.5f; 
00641     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00642     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00643     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00644     clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00645     clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00646     clSetKernelArg(kernel, 4, sizeof(cl_float), &(scale));
00647     clSetKernelArg(kernel, 5, sizeof(cl_float), &(offset));
00648 
00649     size_t dim0 = 16, dim1 = 16;
00650     //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2; 
00651     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00652     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00653     CheckErrorCL(status, "ProgramBagCL::SampleImageU");
00654 }
00655 
00656 void ProgramBagCL::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
00657 {
00658     cl_kernel  kernel; 
00659     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); 
00660     if(log_scale == 1)
00661     {
00662         kernel = s_sampling->_kernel;
00663         clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00664         clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00665         clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00666         clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00667     }else
00668     {
00669         cl_int fullstep = (1 << log_scale);
00670         cl_int halfstep = fullstep >> 1;
00671         kernel = s_sampling_k->_kernel;
00672         clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00673         clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00674         clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00675         clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00676         clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
00677         clSetKernelArg(kernel, 5, sizeof(cl_int), &(halfstep));
00678     }
00679     size_t dim0 = 128, dim1 = 1;
00680     //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2; 
00681     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00682     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00683     CheckErrorCL(status, "ProgramBagCL::SampleImageD");
00684 }
00685 
00686 void ProgramBagCL::FilterInitialImage(CLTexImage* tex, CLTexImage* buf)
00687 {
00688     if(f_gaussian_skip0) FilterImage(f_gaussian_skip0, tex, tex, buf);
00689 }
00690 
00691 void ProgramBagCL::FilterSampledImage(CLTexImage* tex, CLTexImage* buf)
00692 {
00693     if(f_gaussian_skip1) FilterImage(f_gaussian_skip1, tex, tex, buf);
00694 }
00695 
00696 void ProgramBagCL::ComputeDOG(CLTexImage*tex, CLTexImage* texp, CLTexImage* dog, CLTexImage* grad, CLTexImage* rot)
00697 {
00698     int margin = 0, use_gm2 = 1; 
00699     bool both_grad_dog = rot->_clData && grad->_clData;
00700     cl_int w = tex->GetImgWidth(), h = tex->GetImgHeight();
00701     cl_kernel  kernel ; size_t dim0, dim1; 
00702     if(!both_grad_dog)  {kernel = s_dog_pass->_kernel; dim0 = 16; dim1 = 12; }
00703     else if(use_gm2)    {kernel = s_grad_pass2->_kernel; dim0 = 32; dim1 = 14; margin = 2; }
00704     else                {kernel = s_grad_pass->_kernel; dim0 = 16; dim1 = 20; }
00705     size_t gsz[2] = {   (w + dim0 - 1 - margin) / (dim0 - margin) * dim0,
00706                         (h + dim1 - 1 - margin) / (dim1 - margin) * dim1};
00707     size_t lsz[2] = {dim0, dim1};
00708     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(tex->_clData));
00709     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(texp->_clData));
00710     clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dog->_clData));
00711     clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
00712     clSetKernelArg(kernel, 4, sizeof(cl_int), &(h)); 
00713     if(both_grad_dog)
00714     {
00715         clSetKernelArg(kernel, 5, sizeof(cl_mem), &(grad->_clData));
00716         clSetKernelArg(kernel, 6, sizeof(cl_mem), &(rot->_clData));
00717     }
00719     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00720     CheckErrorCL(status, "ProgramBagCL::ComputeDOG");
00721 }
00722 
00723 
00724 void ProgramBagCL::ComputeKEY(CLTexImage*dog, CLTexImage* key, float Tdog, float Tedge)
00725 {
00726     cl_kernel  kernel = s_keypoint->_kernel; 
00727     cl_int w = key->GetImgWidth(), h = key->GetImgHeight();
00728         float threshold0 = Tdog* (GlobalUtil::_SubpixelLocalization?0.8f:1.0f);
00729         float threshold1 = Tdog;
00730         float threshold2 = (Tedge+1)*(Tedge+1)/Tedge;
00731         
00732     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
00733     clSetKernelArg(kernel, 1, sizeof(cl_mem), &((dog + 1)->_clData));
00734     clSetKernelArg(kernel, 2, sizeof(cl_mem), &((dog - 1)->_clData));
00735     clSetKernelArg(kernel, 3, sizeof(cl_mem), &(key->_clData));
00736     clSetKernelArg(kernel, 4, sizeof(cl_float), &(threshold0));
00737     clSetKernelArg(kernel, 5, sizeof(cl_float), &(threshold1));
00738     clSetKernelArg(kernel, 6, sizeof(cl_float), &(threshold2));
00739     clSetKernelArg(kernel, 7, sizeof(cl_int), &(w));
00740     clSetKernelArg(kernel, 8, sizeof(cl_int), &(h)); 
00741 
00742     size_t dim0 = 8, dim1 = 8;
00743     //if( w * h / dim0 / dim1 < 16) dim1 /= 2; 
00744     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00745     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00746     CheckErrorCL(status, "ProgramBagCL::ComputeKEY");
00747 }
00748 
00749 void ProgramBagCL::UnpackImage(CLTexImage*src, CLTexImage* dst)
00750 {
00751     cl_kernel  kernel = s_unpack->_kernel; 
00752     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00753     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00754     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00755     clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00756     clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00757     const size_t dim0 = 16, dim1 = 16;
00758     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00759     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00760 
00761     CheckErrorCL(status, "ProgramBagCL::UnpackImage");
00762     FinishCL();
00763 
00764 }
00765 
00766 void ProgramBagCL::UnpackImageDOG(CLTexImage*src, CLTexImage* dst)
00767 {
00768     if(s_unpack_dog == NULL) return; 
00769     cl_kernel  kernel = s_unpack_dog->_kernel; 
00770     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00771     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00772     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00773     clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00774     clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00775     const size_t dim0 = 16, dim1 = 16;
00776     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00777     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00778 
00779     CheckErrorCL(status, "ProgramBagCL::UnpackImage");
00780     FinishCL();
00781 }
00782 
00783 void ProgramBagCL::UnpackImageGRD(CLTexImage*src, CLTexImage* dst)
00784 {
00785     if(s_unpack_grd == NULL) return; 
00786     cl_kernel  kernel = s_unpack_grd->_kernel; 
00787     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00788     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
00789     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
00790     clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
00791     clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
00792     const size_t dim0 = 16, dim1 = 16;
00793     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00794     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00795 
00796     CheckErrorCL(status, "ProgramBagCL::UnpackImage");
00797     FinishCL();
00798 }
00799 void ProgramBagCL::UnpackImageKEY(CLTexImage*src, CLTexImage* dog, CLTexImage* dst)
00800 {
00801     if(s_unpack_key == NULL) return;
00802     cl_kernel  kernel = s_unpack_key->_kernel; 
00803     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
00804     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
00805     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(src->_clData));
00806     clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dst->_clData));
00807     clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
00808     clSetKernelArg(kernel, 4, sizeof(cl_int), &(h));
00809     const size_t dim0 = 16, dim1 = 16;
00810     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
00811     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
00812 
00813     CheckErrorCL(status, "ProgramBagCL::UnpackImageKEY");
00814     FinishCL();
00815 }
00816 void ProgramBagCL::LoadDescriptorShader()
00817 {
00818         GlobalUtil::_DescriptorPPT = 16;
00819         LoadDescriptorShaderF2();
00820 }
00821 
00822 void ProgramBagCL::LoadDescriptorShaderF2()
00823 {
00824 
00825 }
00826 
00827 void ProgramBagCL::LoadOrientationShader(void)
00828 {
00829 
00830 }
00831 
00832 void ProgramBagCL::LoadGenListShader(int ndoglev,int nlev)
00833 {
00834 
00835 }
00836 
00837 void ProgramBagCL::LoadKeypointShader()
00838 {
00839     int i;    char buffer[20240];
00840         ostrstream out(buffer, 20240);
00841         streampos pos;
00842 
00843         //tex(X)(Y)
00844         //X: (CLR) (CENTER 0, LEFT -1, RIGHT +1)  
00845         //Y: (CDU) (CENTER 0, DOWN -1, UP    +1) 
00846         out<<
00847     "__kernel void keypoint(__read_only image2d_t tex, __read_only image2d_t texU,\n"
00848     "           __read_only image2d_t texD, __write_only image2d_t texK,\n"
00849     "          float THRESHOLD0, float THRESHOLD1, \n"
00850     "          float THRESHOLD2, int width, int height)\n"
00851         "{\n"
00852     "   sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
00853     "         CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
00854     "   int x = get_global_id(0), y = get_global_id(1);\n"
00855     "   if(x  >= width || y >= height) return; \n"
00856     "   int  xp = x - 1, xn = x + 1;\n"
00857     "   int  yp = y - 1, yn = y + 1;\n"
00858     "   int2 coord0 = (int2) (x, y); \n"
00859     "   int2 coord1 = (int2) (xp, y); \n"
00860     "   int2 coord2 = (int2) (xn, y); \n"
00861     "   int2 coord3 = (int2) (x, yp); \n"
00862     "   int2 coord4 = (int2) (x, yn); \n"
00863     "   int2 coord5 = (int2) (xp, yp); \n"
00864     "   int2 coord6 = (int2) (xp, yn); \n"
00865     "   int2 coord7 = (int2) (xn, yp); \n"
00866     "   int2 coord8 = (int2) (xn, yn); \n"
00867     "   float4 ccc = read_imagef(tex, sampler,coord0);\n"
00868         "       float4 clc = read_imagef(tex, sampler,coord1);\n"
00869         "       float4 crc = read_imagef(tex, sampler,coord2);\n"
00870         "       float4 ccd = read_imagef(tex, sampler,coord3);\n"
00871         "       float4 ccu = read_imagef(tex, sampler,coord4);\n"
00872         "       float4 cld = read_imagef(tex, sampler,coord5);\n"
00873         "       float4 clu = read_imagef(tex, sampler,coord6);\n"
00874         "       float4 crd = read_imagef(tex, sampler,coord7);\n"
00875         "       float4 cru = read_imagef(tex, sampler,coord8);\n"
00876     "   float4   cc = ccc;\n"
00877         "       float4  v1[4], v2[4];\n"
00878         "       v1[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
00879         "       v1[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
00880         "       v1[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
00881         "       v1[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
00882         "       v2[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
00883         "       v2[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
00884         "       v2[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
00885         "       v2[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n"
00886     "   float4 key4 = (float4)(0); \n";
00887         //test against 8 neighbours
00888         //use variable to identify type of extremum
00889         //1.0 for local maximum and -1.0 for minimum
00890     for(i = 0; i < 4; ++i)
00891         out<<
00892     "   if(cc.s"<<i<<" > THRESHOLD0){ \n"
00893     "           if(all(isgreater((float4)(cc.s"<<i<<"), max(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = 1.0;\n"
00894     "   }else if(cc.s"<<i<<" < -THRESHOLD0){ \n"
00895     "           if(all(isless((float4)(cc.s"<<i<<"), min(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = -1.0;\n"
00896     "   }";
00897 
00898         out<<
00899     "   if(x ==0) {key4.x =  key4.z= 0; }\n"
00900     "   else if(x + 1 == width) {key4.y =  key4.w = 0;}\n"
00901     "   if(y ==0) {key4.x =   key4.y = 0; }\n"
00902     "   else if(y + 1 == height) {key4.z = key4.w = 0;}\n"
00903     "   float4 ak = fabs(key4); \n"
00904     "   float keysum = ak.x + ak.y + ak.z + ak.w; \n"
00905         "       float4 result = (float4)(0.0);\n"
00906         "       if(keysum == 1.0) {\n"
00907         "       float fxx[4], fyy[4], fxy[4], fx[4], fy[4];\n";
00908         
00909     //do edge supression first.. 
00910         //vector v1 is < (-1, 0), (1, 0), (0,-1), (0, 1)>
00911         //vector v2 is < (-1,-1), (-1,1), (1,-1), (1, 1)>
00912     for(i = 0; i < 4; ++i)
00913         out <<
00914         "       if(key4.s"<<i<<" != 0)\n"
00915         "       {\n"
00916         "               float4 D2 = v1["<<i<<"].xyzw - cc.s"<<i<<";\n"
00917         "               float2 D4 = v2["<<i<<"].xw - v2["<<i<<"].yz;\n"
00918         "               float2 D5 = 0.5*(v1["<<i<<"].yw-v1["<<i<<"].xz); \n"
00919         "               fx["<<i<<"] = D5.x;     fy["<<i<<"] = D5.y ;\n"
00920         "               fxx["<<i<<"] = D2.x + D2.y;\n"
00921         "               fyy["<<i<<"] = D2.z + D2.w;\n"
00922         "               fxy["<<i<<"] = 0.25*(D4.x + D4.y);\n"
00923         "               float fxx_plus_fyy = fxx["<<i<<"] + fyy["<<i<<"];\n"
00924         "               float score_up = fxx_plus_fyy*fxx_plus_fyy; \n"
00925         "               float score_down = (fxx["<<i<<"]*fyy["<<i<<"] - fxy["<<i<<"]*fxy["<<i<<"]);\n"
00926         "               if( score_down <= 0 || score_up > THRESHOLD2 * score_down)keysum = 0;\n"
00927         "       }\n";
00928 
00929     out << 
00930         "       if(keysum == 1) {\n";
00932         //read 9 pixels of upper/lower level
00933         out<<
00934         "       float4  v4[4], v5[4], v6[4];\n"
00935         "       ccc = read_imagef(texU, sampler,coord0);\n"
00936         "       clc = read_imagef(texU, sampler,coord1);\n"
00937         "       crc = read_imagef(texU, sampler,coord2);\n"
00938         "       ccd = read_imagef(texU, sampler,coord3);\n"
00939         "       ccu = read_imagef(texU, sampler,coord4);\n"
00940         "       cld = read_imagef(texU, sampler,coord5);\n"
00941         "       clu = read_imagef(texU, sampler,coord6);\n"
00942         "       crd = read_imagef(texU, sampler,coord7);\n"
00943         "       cru = read_imagef(texU, sampler,coord8);\n"
00944     "   float4 cu = ccc;\n"
00945         "       v4[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
00946         "       v4[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
00947         "       v4[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
00948         "       v4[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
00949         "       v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
00950         "       v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
00951         "       v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
00952         "       v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
00953 
00954     for(i = 0; i < 4; ++i)
00955         out <<
00956         "       if(key4.s"<<i<<" == 1.0)\n"
00957         "       {\n"
00958         "               if(cc.s"<<i<<" < cu.s"<<i<<" || \n"
00959     "           any(isless((float4)(cc.s"<<i<<"), max(v4["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
00960         "       }else if(key4.s"<<i<<" == -1.0)\n"
00961         "       {\n"
00962         "               if(cc.s"<<i<<" > cu.s"<<i<<" || \n"
00963     "           any(isgreater((float4)(cc.s"<<i<<"), min(v4["<<i<<"], v6["<<i<<"]))) )keysum = 0; \n"
00964         "       }\n";
00965 
00966     out <<
00967         "       if(keysum == 1.0) { \n";
00968     out <<
00969         "       ccc = read_imagef(texD, sampler,coord0);\n"
00970         "       clc = read_imagef(texD, sampler,coord1);\n"
00971         "       crc = read_imagef(texD, sampler,coord2);\n"
00972         "       ccd = read_imagef(texD, sampler,coord3);\n"
00973         "       ccu = read_imagef(texD, sampler,coord4);\n"
00974         "       cld = read_imagef(texD, sampler,coord5);\n"
00975         "       clu = read_imagef(texD, sampler,coord6);\n"
00976         "       crd = read_imagef(texD, sampler,coord7);\n"
00977         "       cru = read_imagef(texD, sampler,coord8);\n"
00978     "   float4 cd = ccc;\n"
00979         "       v5[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
00980         "       v5[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
00981         "       v5[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
00982         "       v5[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
00983         "       v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
00984         "       v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
00985         "       v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
00986         "       v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
00987     for(i = 0; i < 4; ++i)
00988     out <<
00989         "       if(key4.s"<<i<<" == 1.0)\n"
00990         "       {\n"
00991         "               if(cc.s"<<i<<" < cd.s"<<i<<" ||\n"
00992     "           any(isless((float4)(cc.s"<<i<<"), max(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
00993         "       }else if(key4.s"<<i<<" == -1.0)\n"
00994         "       {\n"
00995         "               if(cc.s"<<i<<" > cd.s"<<i<<" ||\n"
00996     "           any(isgreater((float4)(cc.s"<<i<<"), min(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
00997         "       }\n";
00998 
00999     out << 
01000         "       if(keysum==1.0) {\n";
01002         if(GlobalUtil::_SubpixelLocalization)
01003     {
01004             out <<
01005             "   float4 offset = (float4)(0); \n";
01006         for(i = 1; i < 4; ++i)
01007         out <<
01008             "   if(key4.s"<<i<<" != 0) \n"
01009             "   {\n"
01010             "           cu.s0 = cu.s"<<i<<";    cd.s0 = cd.s"<<i<<";    cc.s0 = cc.s"<<i<<";    \n"
01011             "           v4[0] = v4["<<i<<"];    v5[0] = v5["<<i<<"];                                            \n"
01012             "           fxy[0] = fxy["<<i<<"];  fxx[0] = fxx["<<i<<"];  fyy[0] = fyy["<<i<<"];  \n"
01013             "           fx[0] = fx["<<i<<"];    fy[0] = fy["<<i<<"];                                            \n"
01014             "   }\n";
01015 
01016         out <<  
01017             "   float fs = 0.5*( cu.s0 - cd.s0 );                               \n"
01018             "   float fss = cu.s0 + cd.s0 - cc.s0 - cc.s0;\n"
01019             "   float fxs = 0.25 * (v4[0].y + v5[0].x - v4[0].x - v5[0].y);\n"
01020             "   float fys = 0.25 * (v4[0].w + v5[0].z - v4[0].z - v5[0].w);\n"
01021             "   float4 A0, A1, A2 ;                     \n"
01022             "   A0 = (float4)(fxx[0], fxy[0], fxs, -fx[0]);     \n"
01023             "   A1 = (float4)(fxy[0], fyy[0], fys, -fy[0]);     \n"
01024             "   A2 = (float4)(fxs, fys, fss, -fs);      \n"
01025         "       float4 x3 = fabs((float4)(fxx[0], fxy[0], fxs, 0));             \n"
01026             "   float maxa = max(max(x3.x, x3.y), x3.z);        \n"
01027             "   if(maxa >= 1e-10 ) \n"
01028             "   {                                                                                               \n"
01029             "           if(x3.y ==maxa )                                                        \n"
01030             "           {                                                                                       \n"
01031             "                   float4 TEMP = A1; A1 = A0; A0 = TEMP;   \n"
01032             "           }else if( x3.z == maxa )                                        \n"
01033             "           {                                                                                       \n"
01034             "                   float4 TEMP = A2; A2 = A0; A0 = TEMP;   \n"
01035             "           }                                                                                       \n"
01036             "           A0 /= A0.x;                                                                     \n"
01037             "           A1 -= A1.x * A0;                                                        \n"
01038             "           A2 -= A2.x * A0;                                                        \n"
01039         "               float2 x2 = fabs((float2)(A1.y, A2.y));         \n"
01040             "           if( x2.y > x2.x )                                                       \n"
01041             "           {                                                                                       \n"
01042             "                   float4 TEMP = A2.yzwx;                                  \n"
01043             "                   A2.yzw = A1.yzw;                                                \n"
01044             "                   A1.yzw = TEMP.xyz;                                                      \n"
01045             "                   x2.x = x2.y;                                                    \n"
01046             "           }                                                                                       \n"
01047             "           if(x2.x >= 1e-10) {                                                             \n"
01048             "                   A1.yzw /= A1.y;                                                         \n"
01049             "                   A2.yzw -= A2.y * A1.yzw;                                        \n"
01050             "                   if(fabs(A2.z) >= 1e-10) {\n"
01051             "                           offset.z = A2.w /A2.z;                              \n"
01052             "                           offset.y = A1.w - offset.z*A1.z;                            \n"
01053             "                           offset.x = A0.w - offset.z*A0.z - offset.y*A0.y;        \n"
01054         "                               if(fabs(cc.s0 + 0.5*dot((float4)(fx[0], fy[0], fs, 0), offset ))<=THRESHOLD1\n"
01055         "                   || any( isgreater(fabs(offset), (float4)(1.0)))) key4 = (float4)(0.0);\n"
01056             "                   }\n"
01057             "           }\n"
01058             "   }\n"
01059             <<"\n"
01060         "       float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
01061             "   result = (float4)(keyv,  offset.xyz);\n"
01062             "   }}}}\n"
01063         "   write_imagef(texK, coord0, result);\n "
01064             "}\n"       <<'\0';
01065     }
01066         else 
01067     {
01068         out << "\n"
01069         "       float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
01070         "       result =  (float4)(keyv, 0, 0, 0);\n"
01071         "       }}}}\n"
01072         "   write_imagef(texK, coord0, result);\n "
01073         "}\n"   <<'\0';
01074     }
01075 
01076     s_keypoint = new ProgramCL("keypoint", buffer, _context, _device);
01077 }
01078 
01079 void ProgramBagCL::LoadDisplayShaders()
01080 {
01081     //"uniform sampler2DRect tex; void main(){\n"
01082     //"vec4 pc = texture2DRect(tex, gl_TexCoord[0].xy); bvec2 ff = lessThan(fract(gl_TexCoord[0].xy), vec2(0.5));\n"
01083     //"float v = ff.y?(ff.x? pc.r : pc.g):(ff.x?pc.b:pc.a); gl_FragColor = vec4(vec3(v), 1.0);}");
01084         s_unpack = new ProgramCL("main", 
01085     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
01086     "                   int width, int height) {\n"
01087     "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01088     "int x = get_global_id(0), y =  get_global_id(1); \n"
01089     "if(x >= width || y >= height) return;\n"
01090     "int xx = x / 2, yy = y / 2; \n"
01091     "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
01092     "float v1 = (x & 1 ? vv.w : vv.z); \n"
01093     "float v2 = (x & 1 ? vv.y : vv.x);\n"
01094     "float v = y & 1 ? v1 : v2;\n"
01095     "float4 result = (float4) (v, v, v, 1);"
01096     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
01097 
01098         s_unpack_dog = new ProgramCL("main", 
01099     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
01100     "                   int width, int height) {\n"
01101     "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01102     "int x = get_global_id(0), y =  get_global_id(1); \n"
01103     "if(x >= width || y >= height) return;\n"
01104     "int xx = x / 2, yy = y / 2; \n"
01105     "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
01106     "float v1 = (x & 1 ? vv.w : vv.z); \n"
01107     "float v2 = (x & 1 ? vv.y : vv.x);\n"
01108     "float v0 = y & 1 ? v1 : v2;\n"
01109     "float v = 0.5 + 20.0 * v0;\n "
01110     "float4 result = (float4) (v, v, v, 1);"
01111     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
01112         
01113     s_unpack_grd = new ProgramCL("main", 
01114     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
01115     "                   int width, int height) {\n"
01116     "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01117     "int x = get_global_id(0), y =  get_global_id(1); \n"
01118     "if(x >= width || y >= height) return;\n"
01119     "int xx = x / 2, yy = y / 2; \n"
01120     "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
01121     "float v1 = (x & 1 ? vv.w : vv.z); \n"
01122     "float v2 = (x & 1 ? vv.y : vv.x);\n"
01123     "float v0 = y & 1 ? v1 : v2;\n"
01124     "float v = 5.0 * v0;\n "
01125     "float4 result = (float4) (v, v, v, 1);"
01126     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
01127 
01128         s_unpack_key = new ProgramCL("main", 
01129     "__kernel void main(__read_only  image2d_t dog,\n"
01130     "                   __read_only image2d_t key,\n"
01131     "                   __write_only image2d_t output,\n"
01132     "                   int width, int height) {\n"
01133     "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01134     "int x = get_global_id(0), y =  get_global_id(1); \n"
01135     "if(x >= width || y >= height) return;\n"
01136     "int xx = x / 2, yy = y / 2; \n"
01137     "float4 kk = read_imagef(key, sampler, (int2) (xx, yy));\n"
01138     "int4 cc = isequal(fabs(kk.xxxx), (float4)(1.0, 2.0, 3.0, 4.0));\n"
01139     "int k1 = (x & 1 ? cc.w : cc.z); \n"
01140     "int k2 = (x & 1 ? cc.y : cc.x);\n"
01141     "int k0 = y & 1 ? k1 : k2;\n"
01142     "float4 result;\n"
01143     "if(k0 != 0){\n"
01144     "   //result = kk.x > 0 ? ((float4)(1.0, 0, 0, 1.0)) : ((float4) (0.0, 1.0, 0.0, 1.0)); \n"
01145     "   result = kk.x < 0 ? ((float4)(0, 1.0, 0, 1.0)) : ((float4) (1.0, 0.0,  0.0, 1.0)); \n"
01146     "}else{"
01147         "float4 vv = read_imagef(dog, sampler, (int2) (xx, yy));\n"
01148         "float v1 = (x & 1 ? vv.w : vv.z); \n"
01149         "float v2 = (x & 1 ? vv.y : vv.x);\n"
01150         "float v0 = y & 1 ? v1 : v2;\n"
01151         "float v = 0.5 + 20.0 * v0;\n "
01152         "result = (float4) (v, v, v, 1);"
01153     "}\n"
01154     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
01155 }
01156 
01157 
01158 void ProgramBagCL::SetMarginCopyParam(int xmax, int ymax)
01159 {
01160 
01161 }
01162 
01163 void ProgramBagCL::SetGradPassParam(int texP)
01164 {
01165 
01166 }
01167 
01168 void ProgramBagCL::SetGenListEndParam(int ktex)
01169 {
01170 
01171 }
01172 
01173 void ProgramBagCL::SetDogTexParam(int texU, int texD)
01174 {
01175 
01176 }
01177 
01178 void ProgramBagCL::SetGenListInitParam(int w, int h)
01179 {
01180         float bbox[4] = {(w -1.0f) * 0.5f +0.25f, (w-1.0f) * 0.5f - 0.25f,  (h - 1.0f) * 0.5f + 0.25f, (h-1.0f) * 0.5f - 0.25f};
01181 
01182 }
01183 
01184 
01185 void ProgramBagCL::SetGenListStartParam(float width, int tex0)
01186 {
01187 
01188 }
01189 
01190 
01191 
01192 void ProgramBagCL::SetGenListStepParam(int tex, int tex0)
01193 {
01194 
01195 }
01196 
01197 void ProgramBagCL::SetGenVBOParam(float width, float fwidth, float size)
01198 {
01199 
01200 }
01201 
01202 void ProgramBagCL::SetSimpleOrientationInput(int oTex, float sigma, float sigma_step)
01203 {
01204 
01205 }
01206 
01207 
01208 void ProgramBagCL::SetFeatureOrientationParam(int gtex, int width, int height, float sigma, int otex, float step)
01209 {
01210 
01211 
01212 }
01213 
01214 void ProgramBagCL::SetFeatureDescirptorParam(int gtex, int otex, float dwidth, float fwidth,  float width, float height, float sigma)
01215 {
01216 
01217 }
01218 
01219 
01220 
01221 const char* ProgramBagCL::GetErrorString(cl_int error)
01222 {
01223     static const char* errorString[] = {
01224         "CL_SUCCESS",
01225         "CL_DEVICE_NOT_FOUND",
01226         "CL_DEVICE_NOT_AVAILABLE",
01227         "CL_COMPILER_NOT_AVAILABLE",
01228         "CL_MEM_OBJECT_ALLOCATION_FAILURE",
01229         "CL_OUT_OF_RESOURCES",
01230         "CL_OUT_OF_HOST_MEMORY",
01231         "CL_PROFILING_INFO_NOT_AVAILABLE",
01232         "CL_MEM_COPY_OVERLAP",
01233         "CL_IMAGE_FORMAT_MISMATCH",
01234         "CL_IMAGE_FORMAT_NOT_SUPPORTED",
01235         "CL_BUILD_PROGRAM_FAILURE",
01236         "CL_MAP_FAILURE",
01237         "",
01238         "",
01239         "",
01240         "",
01241         "",
01242         "",
01243         "",
01244         "",
01245         "",
01246         "",
01247         "",
01248         "",
01249         "",
01250         "",
01251         "",
01252         "",
01253         "",
01254         "CL_INVALID_VALUE",
01255         "CL_INVALID_DEVICE_TYPE",
01256         "CL_INVALID_PLATFORM",
01257         "CL_INVALID_DEVICE",
01258         "CL_INVALID_CONTEXT",
01259         "CL_INVALID_QUEUE_PROPERTIES",
01260         "CL_INVALID_COMMAND_QUEUE",
01261         "CL_INVALID_HOST_PTR",
01262         "CL_INVALID_MEM_OBJECT",
01263         "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
01264         "CL_INVALID_IMAGE_SIZE",
01265         "CL_INVALID_SAMPLER",
01266         "CL_INVALID_BINARY",
01267         "CL_INVALID_BUILD_OPTIONS",
01268         "CL_INVALID_PROGRAM",
01269         "CL_INVALID_PROGRAM_EXECUTABLE",
01270         "CL_INVALID_KERNEL_NAME",
01271         "CL_INVALID_KERNEL_DEFINITION",
01272         "CL_INVALID_KERNEL",
01273         "CL_INVALID_ARG_INDEX",
01274         "CL_INVALID_ARG_VALUE",
01275         "CL_INVALID_ARG_SIZE",
01276         "CL_INVALID_KERNEL_ARGS",
01277         "CL_INVALID_WORK_DIMENSION",
01278         "CL_INVALID_WORK_GROUP_SIZE",
01279         "CL_INVALID_WORK_ITEM_SIZE",
01280         "CL_INVALID_GLOBAL_OFFSET",
01281         "CL_INVALID_EVENT_WAIT_LIST",
01282         "CL_INVALID_EVENT",
01283         "CL_INVALID_OPERATION",
01284         "CL_INVALID_GL_OBJECT",
01285         "CL_INVALID_BUFFER_SIZE",
01286         "CL_INVALID_MIP_LEVEL",
01287         "CL_INVALID_GLOBAL_WORK_SIZE",
01288     };
01289 
01290     const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
01291 
01292     const int index = -error;
01293 
01294     return (index >= 0 && index < errorCount) ? errorString[index] : "";
01295 }
01296 
01297 bool ProgramBagCL::CheckErrorCL(cl_int error, const char* location)
01298 {
01299     if(error == CL_SUCCESS) return true;
01300         const char *errstr = GetErrorString(error);
01301         if(errstr && errstr[0]) std::cerr << errstr; 
01302         else std::cerr  << "Error " << error;
01303         if(location) std::cerr  << " at " << location;          
01304         std::cerr  << "\n";
01305     exit(0);
01306     return false;
01307 
01308 }
01309 
01310 
01313 
01314 void ProgramBagCLN::LoadFixedShaders()
01315 {
01316         s_sampling = new ProgramCL("sampling",
01317         "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01318         "__kernel void sampling(__read_only  image2d_t input, __write_only image2d_t output, "
01319         "                   int width, int height) {\n"
01320         "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
01321         "if( coord.x >= width || coord.y >= height) return;\n"
01322         "write_imagef(output, coord, read_imagef(input, sampler, coord << 1)); }"  , _context, _device); 
01323     
01324     s_sampling_k = new ProgramCL("sampling_k",
01325         "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01326         "__kernel void sampling_k(__read_only  image2d_t input, __write_only image2d_t output, "
01327         "                   int width, int height, int step) {\n"
01328         "int x = get_global_id(0), y =  get_global_id(1); \n"
01329         "if( x >= width || y >= height) return;\n"
01330         "int xa = x * step,   ya = y *step; \n"
01331         "float4 v1 = read_imagef(input, sampler, (int2) (xa, ya)); \n"
01332         "write_imagef(output, (int2) (x, y), v1); }"  , _context, _device);
01333 
01334 
01335         s_sampling_u = new ProgramCL("sampling_u",
01336         "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
01337         "__kernel void sampling_u(__read_only  image2d_t input, \n"
01338         "                   __write_only image2d_t output,\n"
01339         "                   int width, int height, float step) {\n"
01340         "int x = get_global_id(0), y =  get_global_id(1); \n"
01341         "if( x >= width || y >= height) return;\n"
01342         "float xa = x * step,       ya = y *step; \n"
01343         "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
01344         "write_imagef(output, (int2) (x, y), (float4)(v1)); }"  , _context, _device);
01345 
01346    s_dog_pass = new ProgramCL("dog_pass",
01347         "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01348         "__kernel void dog_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
01349         "                   __write_only image2d_t dog, int width, int height) {\n"
01350         "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
01351         "if( coord.x >= width || coord.y >= height) return;\n"
01352         "float cc = read_imagef(tex , sampler, coord).x; \n"
01353         "float cp = read_imagef(texp, sampler, coord).x;\n"
01354         "write_imagef(dog, coord, (float4)(cc - cp)); }\n", _context, _device); 
01355 
01356    s_grad_pass = new ProgramCL("grad_pass",
01357         "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01358         "__kernel void grad_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
01359         "                   __write_only image2d_t dog, int width, int height, \n"
01360         "                   __write_only image2d_t grad, __write_only image2d_t rot) {\n"
01361         "int x = get_global_id(0), y =  get_global_id(1); \n"
01362         "if( x >= width || y >= height) return;\n"
01363         "int2 coord = (int2) (x, y);\n"
01364         "float cl = read_imagef(tex, sampler, (int2)(x - 1, y)).x;\n"
01365         "float cc = read_imagef(tex , sampler, coord).x; \n"
01366         "float cr = read_imagef(tex, sampler, (int2)(x + 1, y)).x;\n"
01367         "float cp = read_imagef(texp, sampler, coord).x;\n"
01368         "write_imagef(dog, coord, (float4)(cc - cp)); \n"
01369             "float cd = read_imagef(tex, sampler, (int2)(x, y - 1)).x;\n"
01370         "float cu = read_imagef(tex, sampler, (int2)(x, y + 1)).x;\n"
01371         "float dx = cr - cl, dy = cu - cd; \n"
01372             "float gg = 0.5 * sqrt(dx*dx + dy * dy);\n"
01373         "write_imagef(grad, coord, (float4)(gg));\n"
01374         "float oo = atan2(dy, dx + FLT_MIN);\n"
01375         "write_imagef(rot, coord, (float4)(oo));}\n", _context, _device); 
01376 
01377    s_grad_pass2 = new ProgramCL("grad_pass2",
01378         "#define BLOCK_DIMX 32\n"
01379         "#define BLOCK_DIMY 14\n"
01380         "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
01381         "__kernel void grad_pass2(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
01382         "                   __write_only image2d_t dog, int width, int height,\n"
01383         "                   __write_only image2d_t grd, __write_only image2d_t rot){\n"//,  __local float* block) {\n"
01384         "__local float block[BLOCK_SIZE]; \n"
01385         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
01386         "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" 
01387         "int2 coord = (int2) (  get_global_id(0) - get_group_id(0) * 2 - 1, \n"
01388         "                       get_global_id(1) - get_group_id(1) * 2 - 1); \n"
01389         "int idx =  mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
01390         "float cc = read_imagef(tex, sampler, coord).x;\n"
01391         "block[idx] = cc;\n"
01392         "barrier(CLK_LOCAL_MEM_FENCE);\n"
01393         "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
01394         "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
01395         "if( coord.x >= width) return; \n"
01396         "if( coord.y >= height) return;\n"
01397         "float cp = read_imagef(texp, sampler, coord).x;\n"
01398         "float dx = block[idx + 1] - block[idx - 1];\n"
01399         "float dy = block[idx + BLOCK_DIMX ] - block[idx - BLOCK_DIMX];\n"
01400         "write_imagef(dog, coord, (float4)(cc - cp)); \n"
01401         "write_imagef(grd, coord, (float4)(0.5 * sqrt(dx*dx + dy * dy)));\n"
01402         "write_imagef(rot, coord, (float4)(atan2(dy, dx + FLT_MIN)));}\n", _context, _device); 
01403 }
01404 
01405 void ProgramBagCLN::LoadDisplayShaders()
01406 {
01407         s_unpack = new ProgramCL("main", 
01408     "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01409     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
01410     "                   int width, int height) {\n"
01411     "int x = get_global_id(0), y =  get_global_id(1); \n"
01412     "if(x >= width || y >= height) return;\n"
01413     "float v = read_imagef(input, sampler, (int2) (x, y)).x; \n"
01414     "float4 result = (float4) (v, v, v, 1);"
01415     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
01416 
01417     s_unpack_grd = new ProgramCL("main", 
01418     "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
01419     "           CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01420     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
01421     "                   int width, int height) {\n"
01422     "int x = get_global_id(0), y =  get_global_id(1); \n"
01423     "if(x >= width || y >= height) return;\n"
01424     "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
01425     "float v = 5.0 * v0;  float4 result = (float4) (v, v, v, 1);"
01426     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
01427 
01428         s_unpack_dog = new ProgramCL("main", 
01429     "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01430     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
01431     "                   int width, int height) {\n"
01432     "int x = get_global_id(0), y =  get_global_id(1); \n"
01433     "if(x >= width || y >= height) return;\n"
01434     "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
01435     "float v = 0.5 + 20.0 * v0; float4 result = (float4) (v, v, v, 1);"
01436     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
01437 }
01438 
01439 ProgramCL* ProgramBagCLN::CreateFilterH(float kernel[], int width)
01440 {
01442         char buffer[10240];
01443         ostrstream out(buffer, 10240);
01444     out <<  "#define KERNEL_WIDTH " << width << "\n"
01445         <<  "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n" 
01446             "#define BLOCK_WIDTH 128\n"
01447             "#define BLOCK_HEIGHT 1\n"
01448             "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
01449             "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
01450             "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
01451             "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01452             "__kernel void filter_h(__read_only  image2d_t input, \n"
01453             "          __write_only image2d_t output, int width_, int height_, \n"
01454             "          __constant float* weight) {\n"
01455             "__local float data[CACHE_WIDTH]; \n"
01456             "int x = get_global_id(0), y = get_global_id(1);\n"
01457             "#pragma unroll\n"
01458                 "for(int j = 0; j < CACHE_COUNT; ++j)\n"
01459                 "{\n"
01460                     "    if(get_local_id(0) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
01461                     "    {\n"
01462                         "        int fetch_index = min(x + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, width_);\n"
01463             "        data[get_local_id(0) + j * BLOCK_WIDTH] = read_imagef(input, sampler, (int2)(fetch_index, y)).x;\n"
01464                     "    }\n"
01465                 "}\n"
01466             "barrier(CLK_LOCAL_MEM_FENCE); \n"
01467             "if( x > width_ || y > height_) return; \n"
01468             "float result = 0; \n"
01469             "#pragma unroll\n"
01470             "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
01471             "{\n"
01472             "   result += data[get_local_id(0) + i] * weight[i];\n"
01473             "}\n"
01474          << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
01475         return new ProgramCL("filter_h", buffer, _context, _device); 
01476 }
01477 
01478 
01479 
01480 ProgramCL* ProgramBagCLN::CreateFilterV(float kernel[], int width)
01481 {
01483         char buffer[10240];
01484         ostrstream out(buffer, 10240);
01485     out <<  "#define KERNEL_WIDTH " << width << "\n"
01486         <<  "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n" 
01487             "#define BLOCK_WIDTH 128\n"
01488             "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
01489             "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
01490             "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
01491             "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
01492             "           CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
01493             "__kernel void filter_v(__read_only  image2d_t input, \n"
01494             "          __write_only image2d_t output, int width_, int height_, \n"
01495             "          __constant float* weight) {\n"
01496             "__local float data[CACHE_WIDTH]; \n"
01497             "int x = get_global_id(0), y = get_global_id(1);\n"
01498             "#pragma unroll\n"
01499                 "for(int j = 0; j < CACHE_COUNT; ++j)\n"
01500                 "{\n"
01501                     "    if(get_local_id(1) + j * BLOCK_WIDTH  < CACHE_WIDTH)\n"
01502                     "    {\n"
01503                         "        int fetch_index = min(y + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, height_);\n"
01504             "        data[get_local_id(1) + j * BLOCK_WIDTH ] = read_imagef(input, sampler, (int2)(x, fetch_index)).x;\n"
01505                     "    }\n"
01506                 "}\n"
01507             "barrier(CLK_LOCAL_MEM_FENCE); \n"
01508             "if( x > width_ || y > height_) return; \n"
01509             "float result = 0; \n"
01510             "#pragma unroll\n"
01511             "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
01512             "{\n"
01513             "   result += data[get_local_id(1) + i] * weight[i];\n"
01514             "}\n"
01515          << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
01516         
01517         return new ProgramCL("filter_v", buffer, _context, _device); 
01518 }
01519 
01520 FilterCL*  ProgramBagCLN::CreateFilter(float kernel[], int width)
01521 {
01522     FilterCL * filter = new FilterCL;
01523     filter->s_shader_h = CreateFilterH(kernel, width); 
01524     filter->s_shader_v = CreateFilterV(kernel, width);
01525     filter->_weight = new CLTexImage(_context, _queue);
01526     filter->_weight->InitBufferTex(width, 1, 1);
01527     filter->_weight->CopyFromHost(kernel);
01528     filter->_size = width; 
01529     return filter;
01530 }
01531 
01532 
01533 void ProgramBagCLN::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
01534 {
01535     cl_kernel kernelh = filter->s_shader_h->_kernel;
01536     cl_kernel kernelv = filter->s_shader_v->_kernel;
01538 
01539     cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
01540     cl_mem weight = (cl_mem) filter->_weight->_clData;
01541     cl_int w_ = w - 1, h_ = h - 1; 
01542 
01543 
01544     clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
01545     clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
01546     clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
01547     clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
01548     clSetKernelArg(kernelh, 4, sizeof(cl_mem), &weight);
01549 
01550     size_t dim00 = 128, dim01 = 1;
01551     size_t gsz1[2] = {(w + dim00 - 1) / dim00 * dim00, (h + dim01 - 1) / dim01 * dim01}, lsz1[2] = {dim00, dim01};
01552     status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz1, lsz1, 0, NULL, NULL);
01553     CheckErrorCL(status, "ProgramBagCLN::FilterImageH");
01554     if(status != CL_SUCCESS) return;
01555 
01556 
01557     clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
01558     clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
01559     clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
01560     clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
01561     clSetKernelArg(kernelv, 4, sizeof(cl_mem), &weight); 
01562 
01563     size_t dim10 = 1, dim11 = 128;
01564     size_t gsz2[2] = {(w + dim10 - 1) / dim10 * dim10, (h + dim11 - 1) / dim11 * dim11}, lsz2[2] = {dim10, dim11};
01565     status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
01566     CheckErrorCL(status, "ProgramBagCLN::FilterImageV");
01567     //clReleaseEvent(event);
01568 }
01569 
01570 void ProgramBagCLN::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
01571 {
01572     cl_kernel  kernel; 
01573     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); 
01574 
01575     cl_int fullstep = (1 << log_scale);
01576     kernel = log_scale == 1? s_sampling->_kernel : s_sampling_k->_kernel;
01577     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
01578     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
01579     clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
01580     clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
01581     if(log_scale > 1) clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
01582 
01583     size_t dim0 = 128, dim1 = 1;
01584     //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2; 
01585     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
01586     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
01587     CheckErrorCL(status, "ProgramBagCLN::SampleImageD");
01588 }
01589 
01590 
01591 #endif
01592