opencl_kernels.cpp
Go to the documentation of this file.
00001 #include "opencl_kernels.h"
00002 
00003 #include <opencl_utils.h>
00004 #include <parallel_common.h>
00005 #include <parallel_reduce.h>
00006 #include <parallel_math.h>
00007 
00008 namespace parallel_ode
00009 {
00010 
00011 static cl_program cpSolver;             
00021 static size_t uSnap(size_t a, size_t b){
00022   return ((a % b) == 0) ? a : (a - (a % b) + b);
00023 }
00024 
00025 static cl_kernel
00026   ckReduce,                             
00027   ckZero,                               
00028   ckZero4,                              
00029   ckSolve;                              
00031 static cl_command_queue cqDefaultCommandQueue; 
00040 std::string extractDirectory( const std::string& path )
00041 {
00042   return path.substr( 0, path.find_last_of( '/' )+1 );
00043 }
00044 
00045 static const std::string FILENAME =  __FILE__; 
00046 static const std::string SRC_DIR = extractDirectory( FILENAME ); 
00047 static const std::string INC_DIR = extractDirectory( FILENAME ) + "../include/" + PROJ_NAME + "/"; 
00049 extern "C" void oclInitializeKernels(cl_context cxGPUContext, cl_command_queue cqParamCommandQue)
00050 {
00051   cl_int ciErrNum;
00052   size_t kernelLength;
00053 
00054   char *cDefines1 = oclLoadProgSource((INC_DIR+"parallel_defines.h").c_str(), KERNEL_PREFIX, &kernelLength);
00055   oclCheckError(cDefines1 != NULL, true);
00056   char *cDefines2 = oclLoadProgSource((INC_DIR+"opencl_common.h").c_str(), cDefines1, &kernelLength);
00057   oclCheckError(cDefines2 != NULL, true);
00058   char *cKernels = oclLoadProgSource((SRC_DIR+"parallel_kernels_nontemplate.h").c_str(), cDefines2, &kernelLength);
00059   //char *cKernels = oclLoadProgSource((SRC_DIR+"parallel_kernels.cl").c_str(), cDefines1, &kernelLength);
00060   oclCheckError(cKernels != NULL, true);
00061 
00062   cpSolver = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cKernels, &kernelLength, &ciErrNum);
00063   oclCheckError(ciErrNum, CL_SUCCESS);
00064 
00065   ciErrNum = clBuildProgram(cpSolver, 0, NULL, KERNEL_OPTIONS, NULL, NULL);
00066   if (ciErrNum != CL_SUCCESS)
00067   {
00068     oclLogBuildInfo(cpSolver, oclGetFirstDev(cxGPUContext));
00069     oclLogPtx(cpSolver, oclGetFirstDev(cxGPUContext), "clSolverError.ptx");
00070     oclCheckError(ciErrNum, CL_SUCCESS);
00071   }
00072 
00073   ckReduce = clCreateKernel(cpSolver, "parallelReduce", &ciErrNum);
00074   oclCheckError(ciErrNum, CL_SUCCESS);
00075   ckZero = clCreateKernel(cpSolver, "parallelZero", &ciErrNum);
00076   oclCheckError(ciErrNum, CL_SUCCESS);
00077   ckZero4 = clCreateKernel(cpSolver, "parallelZero4", &ciErrNum);
00078   oclCheckError(ciErrNum, CL_SUCCESS);
00079   ckSolve = clCreateKernel(cpSolver, "parallelSORLCP", &ciErrNum);
00080   oclCheckError(ciErrNum, CL_SUCCESS);
00081 
00082   //Save default command queue
00083   cqDefaultCommandQueue = cqParamCommandQue;
00084 
00085   //Discard temp storage
00086   free(cDefines1);
00087   free(cDefines2);
00088   free(cKernels);
00089 
00090   //Save ptx code to separate file
00091   oclLogPtx(cpSolver, oclGetFirstDev(cxGPUContext), "oclSolver.ptx");
00092 }
00093 
00094 extern "C" void oclShutdownKernels(void) {
00095   cl_int ciErrNum = CL_SUCCESS;
00096   ciErrNum |= clReleaseKernel(ckReduce);
00097   ciErrNum |= clReleaseKernel(ckSolve);
00098   ciErrNum |= clReleaseKernel(ckZero);
00099   ciErrNum |= clReleaseKernel(ckZero4);
00100   ciErrNum |= clReleaseProgram(cpSolver);
00101   oclCheckError(ciErrNum, CL_SUCCESS);
00102 }
00103 
00104 void oclPGSReduce( cl_mem fc0_reduction,
00105                    cl_mem fc1_reduction,
00106                    ReduceStrategy* reduceStrategy )
00107 {
00108   cl_int ciErrNum = CL_SUCCESS;
00109 
00110   const int bodyReductionSize = reduceStrategy->getBodySizeWithReduction( );
00111   const size_t bodySize = reduceStrategy->getBodySize( );
00112   const size_t localWorkSize = 16;
00113   const size_t globalWorkSize = uSnap(bodySize, localWorkSize);
00114 
00115   switch( reduceStrategy->getType( ) )
00116   {
00117     case ReduceTypes::REDUCE_STRIDED:
00118       {
00119         ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckReduce, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
00120         oclCheckError(ciErrNum, CL_SUCCESS);
00121       }
00122       break;
00123     case ReduceTypes::REDUCE_SEQUENTIAL:
00124     case ReduceTypes::REDUCE_COMPACT:
00125     case ReduceTypes::REDUCE_NONE:
00126     default:
00127       oclCheckError(0, 1);
00128       break;
00129 
00130   };
00131 
00132   if( reduceStrategy->clearReduceBuffers( ) ) {
00133     oclZeroVector(fc0_reduction, bodyReductionSize, false);
00134     oclZeroVector(fc1_reduction, bodyReductionSize, false);
00135   }
00136 }
00137 
00138 void oclPGSReduce( cl_mem fc0,
00139                    cl_mem fc1,
00140                    cl_mem fc0_reduction,
00141                    cl_mem fc1_reduction,
00142                    ReduceStrategy* reduceStrategy )
00143 {
00144   cl_int ciErrNum = CL_SUCCESS;
00145 
00146   const int bodySize = reduceStrategy->getBodySize( );
00147   const int bodyReductionSize = reduceStrategy->getBodySizeWithReduction( );
00148   const int bodyOffsetStride = reduceStrategy->getBodyOffsetStride( );
00149 
00150   ciErrNum  = clSetKernelArg(ckReduce, 0, sizeof(cl_mem), (void *)&fc0);
00151   ciErrNum |= clSetKernelArg(ckReduce, 1, sizeof(cl_mem), (void *)&fc1);
00152   ciErrNum |= clSetKernelArg(ckReduce, 2, sizeof(cl_mem), (void *)&fc0_reduction);
00153   ciErrNum |= clSetKernelArg(ckReduce, 3, sizeof(cl_mem), (void *)&fc1_reduction);
00154   ciErrNum |= clSetKernelArg(ckReduce, 4, sizeof(int), (void *)&bodyOffsetStride);
00155   ciErrNum |= clSetKernelArg(ckReduce, 5, sizeof(int), (void *)&bodySize);
00156   ciErrNum |= clSetKernelArg(ckReduce, 6, sizeof(int), (void *)&bodyReductionSize);
00157   oclCheckError(ciErrNum, CL_SUCCESS);
00158 
00159   oclPGSReduce(fc0_reduction, fc1_reduction, reduceStrategy);
00160 }
00161 
00162 void oclZeroVector( cl_mem buffer, int bufferSize, bool bScalarType )
00163 {
00164   const size_t localWorkSize = ParallelOptions::BSIZE;
00165   const size_t globalWorkSize = uSnap(bufferSize, localWorkSize);
00166   cl_int ciErrNum = CL_SUCCESS;
00167 
00168   if( bScalarType ) {
00169     ciErrNum  = clSetKernelArg(ckZero, 0, sizeof(cl_mem), (void *)&buffer);
00170     ciErrNum |= clSetKernelArg(ckZero, 1, sizeof(int), (void *)&bufferSize);
00171     oclCheckError(ciErrNum, CL_SUCCESS);
00172 
00173     ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckZero, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
00174     oclCheckError(ciErrNum, CL_SUCCESS);
00175   } else {
00176     ciErrNum  = clSetKernelArg(ckZero4, 0, sizeof(cl_mem), (void *)&buffer);
00177     ciErrNum |= clSetKernelArg(ckZero4, 1, sizeof(int), (void *)&bufferSize);
00178     oclCheckError(ciErrNum, CL_SUCCESS);
00179 
00180     ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckZero4, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
00181     oclCheckError(ciErrNum, CL_SUCCESS);
00182   }
00183 }
00184 
00185 void oclPGSSolve( int offset, int numConstraints, bool bUseAtomics )
00186 {
00187   cl_int ciErrNum = CL_SUCCESS;
00188   size_t globalWorkSizeUnaligned = numConstraints;
00189   const size_t localWorkSize = ParallelOptions::BSIZE;
00190   const size_t globalWorkSize = uSnap(globalWorkSizeUnaligned, localWorkSize);
00191 
00192   ciErrNum |= clSetKernelArg(ckSolve, 12, sizeof(int), (void *)&offset);
00193   ciErrNum |= clSetKernelArg(ckSolve, 13, sizeof(int), (void *)&numConstraints);
00194   oclCheckError(ciErrNum, CL_SUCCESS);
00195 
00196   ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckSolve, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
00197   oclCheckError(ciErrNum, CL_SUCCESS);
00198 }
00199 
00200 void oclPGSSolveInit( cl_mem bodyIDs,
00201                       cl_mem fIDs,
00202                       cl_mem j,
00203                       cl_mem ij,
00204                       cl_mem fc0,
00205                       cl_mem fc1,
00206                       cl_mem fc0_reduction,
00207                       cl_mem fc1_reduction,
00208                       cl_mem lambda,
00209                       cl_mem adcfm,
00210                       cl_mem rhs,
00211                       cl_mem hilo,
00212                       int bStride, int cStride,
00213                       ReduceStrategy* reduceStrategy )
00214 {
00215   cl_int ciErrNum = CL_SUCCESS;
00216 
00217   // Initialize ckSolve kernel arguments
00218   ciErrNum |= clSetKernelArg(ckSolve, 0, sizeof(cl_mem), (void *)&fc0_reduction);
00219   ciErrNum |= clSetKernelArg(ckSolve, 1, sizeof(cl_mem), (void *)&fc1_reduction);
00220   ciErrNum |= clSetKernelArg(ckSolve, 2, sizeof(cl_mem), (void *)&lambda);
00221   ciErrNum |= clSetKernelArg(ckSolve, 3, sizeof(cl_mem), (void *)&bodyIDs);
00222   ciErrNum |= clSetKernelArg(ckSolve, 4, sizeof(cl_mem), (void *)&fIDs);
00223   ciErrNum |= clSetKernelArg(ckSolve, 5, sizeof(cl_mem), (void *)&j);
00224   ciErrNum |= clSetKernelArg(ckSolve, 6, sizeof(cl_mem), (void *)&ij);
00225   ciErrNum |= clSetKernelArg(ckSolve, 7, sizeof(cl_mem), (void *)&fc0);
00226   ciErrNum |= clSetKernelArg(ckSolve, 8, sizeof(cl_mem), (void *)&fc1);
00227   ciErrNum |= clSetKernelArg(ckSolve, 9, sizeof(cl_mem), (void *)&adcfm);
00228   ciErrNum |= clSetKernelArg(ckSolve, 10, sizeof(cl_mem), (void *)&rhs);
00229   ciErrNum |= clSetKernelArg(ckSolve, 11, sizeof(cl_mem), (void *)&hilo);
00230   ciErrNum |= clSetKernelArg(ckSolve, 14, sizeof(int), (void *)&bStride);
00231   ciErrNum |= clSetKernelArg(ckSolve, 15, sizeof(int), (void *)&cStride);
00232   oclCheckError(ciErrNum, CL_SUCCESS);
00233 
00234   // Initialize ckReduce kernel arguments
00235   const int bodySize = reduceStrategy->getBodySize( );
00236   const int bodyReductionSize = reduceStrategy->getBodySizeWithReduction( );
00237   const int bodyOffsetStride = reduceStrategy->getBodyOffsetStride( );
00238 
00239   ciErrNum  = clSetKernelArg(ckReduce, 0, sizeof(cl_mem), (void *)&fc0);
00240   ciErrNum |= clSetKernelArg(ckReduce, 1, sizeof(cl_mem), (void *)&fc1);
00241   ciErrNum |= clSetKernelArg(ckReduce, 2, sizeof(cl_mem), (void *)&fc0_reduction);
00242   ciErrNum |= clSetKernelArg(ckReduce, 3, sizeof(cl_mem), (void *)&fc1_reduction);
00243   ciErrNum |= clSetKernelArg(ckReduce, 4, sizeof(int), (void *)&bodyOffsetStride);
00244   ciErrNum |= clSetKernelArg(ckReduce, 5, sizeof(int), (void *)&bodySize);
00245   ciErrNum |= clSetKernelArg(ckReduce, 6, sizeof(int), (void *)&bodyReductionSize);
00246   oclCheckError(ciErrNum, CL_SUCCESS);
00247 }
00248 
00249 }


parallel_quickstep
Author(s): Jared Duke
autogenerated on Wed Apr 23 2014 10:23:51