00001 #include "opencl_kernels.h"
00002
00003 #include <opencl_utils.h>
00004 #include <parallel_common.h>
00005 #include <parallel_reduce.h>
00006 #include <parallel_math.h>
00007
00008 namespace parallel_ode
00009 {
00010
00011 static cl_program cpSolver;
00021 static size_t uSnap(size_t a, size_t b){
00022 return ((a % b) == 0) ? a : (a - (a % b) + b);
00023 }
00024
00025 static cl_kernel
00026 ckReduce,
00027 ckZero,
00028 ckZero4,
00029 ckSolve;
00031 static cl_command_queue cqDefaultCommandQueue;
00040 std::string extractDirectory( const std::string& path )
00041 {
00042 return path.substr( 0, path.find_last_of( '/' )+1 );
00043 }
00044
00045 static const std::string FILENAME = __FILE__;
00046 static const std::string SRC_DIR = extractDirectory( FILENAME );
00047 static const std::string INC_DIR = extractDirectory( FILENAME ) + "../include/" + PROJ_NAME + "/";
00049 extern "C" void oclInitializeKernels(cl_context cxGPUContext, cl_command_queue cqParamCommandQue)
00050 {
00051 cl_int ciErrNum;
00052 size_t kernelLength;
00053
00054 char *cDefines1 = oclLoadProgSource((INC_DIR+"parallel_defines.h").c_str(), KERNEL_PREFIX, &kernelLength);
00055 oclCheckError(cDefines1 != NULL, true);
00056 char *cDefines2 = oclLoadProgSource((INC_DIR+"opencl_common.h").c_str(), cDefines1, &kernelLength);
00057 oclCheckError(cDefines2 != NULL, true);
00058 char *cKernels = oclLoadProgSource((SRC_DIR+"parallel_kernels_nontemplate.h").c_str(), cDefines2, &kernelLength);
00059
00060 oclCheckError(cKernels != NULL, true);
00061
00062 cpSolver = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cKernels, &kernelLength, &ciErrNum);
00063 oclCheckError(ciErrNum, CL_SUCCESS);
00064
00065 ciErrNum = clBuildProgram(cpSolver, 0, NULL, KERNEL_OPTIONS, NULL, NULL);
00066 if (ciErrNum != CL_SUCCESS)
00067 {
00068 oclLogBuildInfo(cpSolver, oclGetFirstDev(cxGPUContext));
00069 oclLogPtx(cpSolver, oclGetFirstDev(cxGPUContext), "clSolverError.ptx");
00070 oclCheckError(ciErrNum, CL_SUCCESS);
00071 }
00072
00073 ckReduce = clCreateKernel(cpSolver, "parallelReduce", &ciErrNum);
00074 oclCheckError(ciErrNum, CL_SUCCESS);
00075 ckZero = clCreateKernel(cpSolver, "parallelZero", &ciErrNum);
00076 oclCheckError(ciErrNum, CL_SUCCESS);
00077 ckZero4 = clCreateKernel(cpSolver, "parallelZero4", &ciErrNum);
00078 oclCheckError(ciErrNum, CL_SUCCESS);
00079 ckSolve = clCreateKernel(cpSolver, "parallelSORLCP", &ciErrNum);
00080 oclCheckError(ciErrNum, CL_SUCCESS);
00081
00082
00083 cqDefaultCommandQueue = cqParamCommandQue;
00084
00085
00086 free(cDefines1);
00087 free(cDefines2);
00088 free(cKernels);
00089
00090
00091 oclLogPtx(cpSolver, oclGetFirstDev(cxGPUContext), "oclSolver.ptx");
00092 }
00093
00094 extern "C" void oclShutdownKernels(void) {
00095 cl_int ciErrNum = CL_SUCCESS;
00096 ciErrNum |= clReleaseKernel(ckReduce);
00097 ciErrNum |= clReleaseKernel(ckSolve);
00098 ciErrNum |= clReleaseKernel(ckZero);
00099 ciErrNum |= clReleaseKernel(ckZero4);
00100 ciErrNum |= clReleaseProgram(cpSolver);
00101 oclCheckError(ciErrNum, CL_SUCCESS);
00102 }
00103
00104 void oclPGSReduce( cl_mem fc0_reduction,
00105 cl_mem fc1_reduction,
00106 ReduceStrategy* reduceStrategy )
00107 {
00108 cl_int ciErrNum = CL_SUCCESS;
00109
00110 const int bodyReductionSize = reduceStrategy->getBodySizeWithReduction( );
00111 const size_t bodySize = reduceStrategy->getBodySize( );
00112 const size_t localWorkSize = 16;
00113 const size_t globalWorkSize = uSnap(bodySize, localWorkSize);
00114
00115 switch( reduceStrategy->getType( ) )
00116 {
00117 case ReduceTypes::REDUCE_STRIDED:
00118 {
00119 ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckReduce, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
00120 oclCheckError(ciErrNum, CL_SUCCESS);
00121 }
00122 break;
00123 case ReduceTypes::REDUCE_SEQUENTIAL:
00124 case ReduceTypes::REDUCE_COMPACT:
00125 case ReduceTypes::REDUCE_NONE:
00126 default:
00127 oclCheckError(0, 1);
00128 break;
00129
00130 };
00131
00132 if( reduceStrategy->clearReduceBuffers( ) ) {
00133 oclZeroVector(fc0_reduction, bodyReductionSize, false);
00134 oclZeroVector(fc1_reduction, bodyReductionSize, false);
00135 }
00136 }
00137
00138 void oclPGSReduce( cl_mem fc0,
00139 cl_mem fc1,
00140 cl_mem fc0_reduction,
00141 cl_mem fc1_reduction,
00142 ReduceStrategy* reduceStrategy )
00143 {
00144 cl_int ciErrNum = CL_SUCCESS;
00145
00146 const int bodySize = reduceStrategy->getBodySize( );
00147 const int bodyReductionSize = reduceStrategy->getBodySizeWithReduction( );
00148 const int bodyOffsetStride = reduceStrategy->getBodyOffsetStride( );
00149
00150 ciErrNum = clSetKernelArg(ckReduce, 0, sizeof(cl_mem), (void *)&fc0);
00151 ciErrNum |= clSetKernelArg(ckReduce, 1, sizeof(cl_mem), (void *)&fc1);
00152 ciErrNum |= clSetKernelArg(ckReduce, 2, sizeof(cl_mem), (void *)&fc0_reduction);
00153 ciErrNum |= clSetKernelArg(ckReduce, 3, sizeof(cl_mem), (void *)&fc1_reduction);
00154 ciErrNum |= clSetKernelArg(ckReduce, 4, sizeof(int), (void *)&bodyOffsetStride);
00155 ciErrNum |= clSetKernelArg(ckReduce, 5, sizeof(int), (void *)&bodySize);
00156 ciErrNum |= clSetKernelArg(ckReduce, 6, sizeof(int), (void *)&bodyReductionSize);
00157 oclCheckError(ciErrNum, CL_SUCCESS);
00158
00159 oclPGSReduce(fc0_reduction, fc1_reduction, reduceStrategy);
00160 }
00161
00162 void oclZeroVector( cl_mem buffer, int bufferSize, bool bScalarType )
00163 {
00164 const size_t localWorkSize = ParallelOptions::BSIZE;
00165 const size_t globalWorkSize = uSnap(bufferSize, localWorkSize);
00166 cl_int ciErrNum = CL_SUCCESS;
00167
00168 if( bScalarType ) {
00169 ciErrNum = clSetKernelArg(ckZero, 0, sizeof(cl_mem), (void *)&buffer);
00170 ciErrNum |= clSetKernelArg(ckZero, 1, sizeof(int), (void *)&bufferSize);
00171 oclCheckError(ciErrNum, CL_SUCCESS);
00172
00173 ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckZero, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
00174 oclCheckError(ciErrNum, CL_SUCCESS);
00175 } else {
00176 ciErrNum = clSetKernelArg(ckZero4, 0, sizeof(cl_mem), (void *)&buffer);
00177 ciErrNum |= clSetKernelArg(ckZero4, 1, sizeof(int), (void *)&bufferSize);
00178 oclCheckError(ciErrNum, CL_SUCCESS);
00179
00180 ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckZero4, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
00181 oclCheckError(ciErrNum, CL_SUCCESS);
00182 }
00183 }
00184
00185 void oclPGSSolve( int offset, int numConstraints, bool bUseAtomics )
00186 {
00187 cl_int ciErrNum = CL_SUCCESS;
00188 size_t globalWorkSizeUnaligned = numConstraints;
00189 const size_t localWorkSize = ParallelOptions::BSIZE;
00190 const size_t globalWorkSize = uSnap(globalWorkSizeUnaligned, localWorkSize);
00191
00192 ciErrNum |= clSetKernelArg(ckSolve, 12, sizeof(int), (void *)&offset);
00193 ciErrNum |= clSetKernelArg(ckSolve, 13, sizeof(int), (void *)&numConstraints);
00194 oclCheckError(ciErrNum, CL_SUCCESS);
00195
00196 ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckSolve, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
00197 oclCheckError(ciErrNum, CL_SUCCESS);
00198 }
00199
00200 void oclPGSSolveInit( cl_mem bodyIDs,
00201 cl_mem fIDs,
00202 cl_mem j,
00203 cl_mem ij,
00204 cl_mem fc0,
00205 cl_mem fc1,
00206 cl_mem fc0_reduction,
00207 cl_mem fc1_reduction,
00208 cl_mem lambda,
00209 cl_mem adcfm,
00210 cl_mem rhs,
00211 cl_mem hilo,
00212 int bStride, int cStride,
00213 ReduceStrategy* reduceStrategy )
00214 {
00215 cl_int ciErrNum = CL_SUCCESS;
00216
00217
00218 ciErrNum |= clSetKernelArg(ckSolve, 0, sizeof(cl_mem), (void *)&fc0_reduction);
00219 ciErrNum |= clSetKernelArg(ckSolve, 1, sizeof(cl_mem), (void *)&fc1_reduction);
00220 ciErrNum |= clSetKernelArg(ckSolve, 2, sizeof(cl_mem), (void *)&lambda);
00221 ciErrNum |= clSetKernelArg(ckSolve, 3, sizeof(cl_mem), (void *)&bodyIDs);
00222 ciErrNum |= clSetKernelArg(ckSolve, 4, sizeof(cl_mem), (void *)&fIDs);
00223 ciErrNum |= clSetKernelArg(ckSolve, 5, sizeof(cl_mem), (void *)&j);
00224 ciErrNum |= clSetKernelArg(ckSolve, 6, sizeof(cl_mem), (void *)&ij);
00225 ciErrNum |= clSetKernelArg(ckSolve, 7, sizeof(cl_mem), (void *)&fc0);
00226 ciErrNum |= clSetKernelArg(ckSolve, 8, sizeof(cl_mem), (void *)&fc1);
00227 ciErrNum |= clSetKernelArg(ckSolve, 9, sizeof(cl_mem), (void *)&adcfm);
00228 ciErrNum |= clSetKernelArg(ckSolve, 10, sizeof(cl_mem), (void *)&rhs);
00229 ciErrNum |= clSetKernelArg(ckSolve, 11, sizeof(cl_mem), (void *)&hilo);
00230 ciErrNum |= clSetKernelArg(ckSolve, 14, sizeof(int), (void *)&bStride);
00231 ciErrNum |= clSetKernelArg(ckSolve, 15, sizeof(int), (void *)&cStride);
00232 oclCheckError(ciErrNum, CL_SUCCESS);
00233
00234
00235 const int bodySize = reduceStrategy->getBodySize( );
00236 const int bodyReductionSize = reduceStrategy->getBodySizeWithReduction( );
00237 const int bodyOffsetStride = reduceStrategy->getBodyOffsetStride( );
00238
00239 ciErrNum = clSetKernelArg(ckReduce, 0, sizeof(cl_mem), (void *)&fc0);
00240 ciErrNum |= clSetKernelArg(ckReduce, 1, sizeof(cl_mem), (void *)&fc1);
00241 ciErrNum |= clSetKernelArg(ckReduce, 2, sizeof(cl_mem), (void *)&fc0_reduction);
00242 ciErrNum |= clSetKernelArg(ckReduce, 3, sizeof(cl_mem), (void *)&fc1_reduction);
00243 ciErrNum |= clSetKernelArg(ckReduce, 4, sizeof(int), (void *)&bodyOffsetStride);
00244 ciErrNum |= clSetKernelArg(ckReduce, 5, sizeof(int), (void *)&bodySize);
00245 ciErrNum |= clSetKernelArg(ckReduce, 6, sizeof(int), (void *)&bodyReductionSize);
00246 oclCheckError(ciErrNum, CL_SUCCESS);
00247 }
00248
00249 }