$search
00001 #include "opencl_kernels.h" 00002 00003 #include <opencl_utils.h> 00004 #include <parallel_common.h> 00005 #include <parallel_reduce.h> 00006 #include <parallel_math.h> 00007 00008 namespace parallel_ode 00009 { 00010 00011 static cl_program cpSolver; 00021 static size_t uSnap(size_t a, size_t b){ 00022 return ((a % b) == 0) ? a : (a - (a % b) + b); 00023 } 00024 00025 static cl_kernel 00026 ckReduce, 00027 ckZero, 00028 ckZero4, 00029 ckSolve; 00031 static cl_command_queue cqDefaultCommandQueue; 00040 std::string extractDirectory( const std::string& path ) 00041 { 00042 return path.substr( 0, path.find_last_of( '/' )+1 ); 00043 } 00044 00045 static const std::string FILENAME = __FILE__; 00046 static const std::string SRC_DIR = extractDirectory( FILENAME ); 00047 static const std::string INC_DIR = extractDirectory( FILENAME ) + "../include/" + PROJ_NAME + "/"; 00049 extern "C" void oclInitializeKernels(cl_context cxGPUContext, cl_command_queue cqParamCommandQue) 00050 { 00051 cl_int ciErrNum; 00052 size_t kernelLength; 00053 00054 char *cDefines1 = oclLoadProgSource((INC_DIR+"parallel_defines.h").c_str(), KERNEL_PREFIX, &kernelLength); 00055 oclCheckError(cDefines1 != NULL, true); 00056 char *cDefines2 = oclLoadProgSource((INC_DIR+"opencl_common.h").c_str(), cDefines1, &kernelLength); 00057 oclCheckError(cDefines2 != NULL, true); 00058 char *cKernels = oclLoadProgSource((SRC_DIR+"parallel_kernels_nontemplate.h").c_str(), cDefines2, &kernelLength); 00059 //char *cKernels = oclLoadProgSource((SRC_DIR+"parallel_kernels.cl").c_str(), cDefines1, &kernelLength); 00060 oclCheckError(cKernels != NULL, true); 00061 00062 cpSolver = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cKernels, &kernelLength, &ciErrNum); 00063 oclCheckError(ciErrNum, CL_SUCCESS); 00064 00065 ciErrNum = clBuildProgram(cpSolver, 0, NULL, KERNEL_OPTIONS, NULL, NULL); 00066 if (ciErrNum != CL_SUCCESS) 00067 { 00068 oclLogBuildInfo(cpSolver, oclGetFirstDev(cxGPUContext)); 00069 oclLogPtx(cpSolver, oclGetFirstDev(cxGPUContext), "clSolverError.ptx"); 00070 oclCheckError(ciErrNum, CL_SUCCESS); 00071 } 00072 00073 ckReduce = clCreateKernel(cpSolver, "parallelReduce", &ciErrNum); 00074 oclCheckError(ciErrNum, CL_SUCCESS); 00075 ckZero = clCreateKernel(cpSolver, "parallelZero", &ciErrNum); 00076 oclCheckError(ciErrNum, CL_SUCCESS); 00077 ckZero4 = clCreateKernel(cpSolver, "parallelZero4", &ciErrNum); 00078 oclCheckError(ciErrNum, CL_SUCCESS); 00079 ckSolve = clCreateKernel(cpSolver, "parallelSORLCP", &ciErrNum); 00080 oclCheckError(ciErrNum, CL_SUCCESS); 00081 00082 //Save default command queue 00083 cqDefaultCommandQueue = cqParamCommandQue; 00084 00085 //Discard temp storage 00086 free(cDefines1); 00087 free(cDefines2); 00088 free(cKernels); 00089 00090 //Save ptx code to separate file 00091 oclLogPtx(cpSolver, oclGetFirstDev(cxGPUContext), "oclSolver.ptx"); 00092 } 00093 00094 extern "C" void oclShutdownKernels(void) { 00095 cl_int ciErrNum = CL_SUCCESS; 00096 ciErrNum |= clReleaseKernel(ckReduce); 00097 ciErrNum |= clReleaseKernel(ckSolve); 00098 ciErrNum |= clReleaseKernel(ckZero); 00099 ciErrNum |= clReleaseKernel(ckZero4); 00100 ciErrNum |= clReleaseProgram(cpSolver); 00101 oclCheckError(ciErrNum, CL_SUCCESS); 00102 } 00103 00104 void oclPGSReduce( cl_mem fc0_reduction, 00105 cl_mem fc1_reduction, 00106 ReduceStrategy* reduceStrategy ) 00107 { 00108 cl_int ciErrNum = CL_SUCCESS; 00109 00110 const int bodyReductionSize = reduceStrategy->getBodySizeWithReduction( ); 00111 const size_t bodySize = reduceStrategy->getBodySize( ); 00112 const size_t localWorkSize = 16; 00113 const size_t globalWorkSize = uSnap(bodySize, localWorkSize); 00114 00115 switch( reduceStrategy->getType( ) ) 00116 { 00117 case ReduceTypes::REDUCE_STRIDED: 00118 { 00119 ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckReduce, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); 00120 oclCheckError(ciErrNum, CL_SUCCESS); 00121 } 00122 break; 00123 case ReduceTypes::REDUCE_SEQUENTIAL: 00124 case ReduceTypes::REDUCE_COMPACT: 00125 case ReduceTypes::REDUCE_NONE: 00126 default: 00127 oclCheckError(0, 1); 00128 break; 00129 00130 }; 00131 00132 if( reduceStrategy->clearReduceBuffers( ) ) { 00133 oclZeroVector(fc0_reduction, bodyReductionSize, false); 00134 oclZeroVector(fc1_reduction, bodyReductionSize, false); 00135 } 00136 } 00137 00138 void oclPGSReduce( cl_mem fc0, 00139 cl_mem fc1, 00140 cl_mem fc0_reduction, 00141 cl_mem fc1_reduction, 00142 ReduceStrategy* reduceStrategy ) 00143 { 00144 cl_int ciErrNum = CL_SUCCESS; 00145 00146 const int bodySize = reduceStrategy->getBodySize( ); 00147 const int bodyReductionSize = reduceStrategy->getBodySizeWithReduction( ); 00148 const int bodyOffsetStride = reduceStrategy->getBodyOffsetStride( ); 00149 00150 ciErrNum = clSetKernelArg(ckReduce, 0, sizeof(cl_mem), (void *)&fc0); 00151 ciErrNum |= clSetKernelArg(ckReduce, 1, sizeof(cl_mem), (void *)&fc1); 00152 ciErrNum |= clSetKernelArg(ckReduce, 2, sizeof(cl_mem), (void *)&fc0_reduction); 00153 ciErrNum |= clSetKernelArg(ckReduce, 3, sizeof(cl_mem), (void *)&fc1_reduction); 00154 ciErrNum |= clSetKernelArg(ckReduce, 4, sizeof(int), (void *)&bodyOffsetStride); 00155 ciErrNum |= clSetKernelArg(ckReduce, 5, sizeof(int), (void *)&bodySize); 00156 ciErrNum |= clSetKernelArg(ckReduce, 6, sizeof(int), (void *)&bodyReductionSize); 00157 oclCheckError(ciErrNum, CL_SUCCESS); 00158 00159 oclPGSReduce(fc0_reduction, fc1_reduction, reduceStrategy); 00160 } 00161 00162 void oclZeroVector( cl_mem buffer, int bufferSize, bool bScalarType ) 00163 { 00164 const size_t localWorkSize = ParallelOptions::BSIZE; 00165 const size_t globalWorkSize = uSnap(bufferSize, localWorkSize); 00166 cl_int ciErrNum = CL_SUCCESS; 00167 00168 if( bScalarType ) { 00169 ciErrNum = clSetKernelArg(ckZero, 0, sizeof(cl_mem), (void *)&buffer); 00170 ciErrNum |= clSetKernelArg(ckZero, 1, sizeof(int), (void *)&bufferSize); 00171 oclCheckError(ciErrNum, CL_SUCCESS); 00172 00173 ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckZero, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); 00174 oclCheckError(ciErrNum, CL_SUCCESS); 00175 } else { 00176 ciErrNum = clSetKernelArg(ckZero4, 0, sizeof(cl_mem), (void *)&buffer); 00177 ciErrNum |= clSetKernelArg(ckZero4, 1, sizeof(int), (void *)&bufferSize); 00178 oclCheckError(ciErrNum, CL_SUCCESS); 00179 00180 ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckZero4, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); 00181 oclCheckError(ciErrNum, CL_SUCCESS); 00182 } 00183 } 00184 00185 void oclPGSSolve( int offset, int numConstraints, bool bUseAtomics ) 00186 { 00187 cl_int ciErrNum = CL_SUCCESS; 00188 size_t globalWorkSizeUnaligned = numConstraints; 00189 const size_t localWorkSize = ParallelOptions::BSIZE; 00190 const size_t globalWorkSize = uSnap(globalWorkSizeUnaligned, localWorkSize); 00191 00192 ciErrNum |= clSetKernelArg(ckSolve, 12, sizeof(int), (void *)&offset); 00193 ciErrNum |= clSetKernelArg(ckSolve, 13, sizeof(int), (void *)&numConstraints); 00194 oclCheckError(ciErrNum, CL_SUCCESS); 00195 00196 ciErrNum = clEnqueueNDRangeKernel(cqDefaultCommandQueue, ckSolve, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); 00197 oclCheckError(ciErrNum, CL_SUCCESS); 00198 } 00199 00200 void oclPGSSolveInit( cl_mem bodyIDs, 00201 cl_mem fIDs, 00202 cl_mem j, 00203 cl_mem ij, 00204 cl_mem fc0, 00205 cl_mem fc1, 00206 cl_mem fc0_reduction, 00207 cl_mem fc1_reduction, 00208 cl_mem lambda, 00209 cl_mem adcfm, 00210 cl_mem rhs, 00211 cl_mem hilo, 00212 int bStride, int cStride, 00213 ReduceStrategy* reduceStrategy ) 00214 { 00215 cl_int ciErrNum = CL_SUCCESS; 00216 00217 // Initialize ckSolve kernel arguments 00218 ciErrNum |= clSetKernelArg(ckSolve, 0, sizeof(cl_mem), (void *)&fc0_reduction); 00219 ciErrNum |= clSetKernelArg(ckSolve, 1, sizeof(cl_mem), (void *)&fc1_reduction); 00220 ciErrNum |= clSetKernelArg(ckSolve, 2, sizeof(cl_mem), (void *)&lambda); 00221 ciErrNum |= clSetKernelArg(ckSolve, 3, sizeof(cl_mem), (void *)&bodyIDs); 00222 ciErrNum |= clSetKernelArg(ckSolve, 4, sizeof(cl_mem), (void *)&fIDs); 00223 ciErrNum |= clSetKernelArg(ckSolve, 5, sizeof(cl_mem), (void *)&j); 00224 ciErrNum |= clSetKernelArg(ckSolve, 6, sizeof(cl_mem), (void *)&ij); 00225 ciErrNum |= clSetKernelArg(ckSolve, 7, sizeof(cl_mem), (void *)&fc0); 00226 ciErrNum |= clSetKernelArg(ckSolve, 8, sizeof(cl_mem), (void *)&fc1); 00227 ciErrNum |= clSetKernelArg(ckSolve, 9, sizeof(cl_mem), (void *)&adcfm); 00228 ciErrNum |= clSetKernelArg(ckSolve, 10, sizeof(cl_mem), (void *)&rhs); 00229 ciErrNum |= clSetKernelArg(ckSolve, 11, sizeof(cl_mem), (void *)&hilo); 00230 ciErrNum |= clSetKernelArg(ckSolve, 14, sizeof(int), (void *)&bStride); 00231 ciErrNum |= clSetKernelArg(ckSolve, 15, sizeof(int), (void *)&cStride); 00232 oclCheckError(ciErrNum, CL_SUCCESS); 00233 00234 // Initialize ckReduce kernel arguments 00235 const int bodySize = reduceStrategy->getBodySize( ); 00236 const int bodyReductionSize = reduceStrategy->getBodySizeWithReduction( ); 00237 const int bodyOffsetStride = reduceStrategy->getBodyOffsetStride( ); 00238 00239 ciErrNum = clSetKernelArg(ckReduce, 0, sizeof(cl_mem), (void *)&fc0); 00240 ciErrNum |= clSetKernelArg(ckReduce, 1, sizeof(cl_mem), (void *)&fc1); 00241 ciErrNum |= clSetKernelArg(ckReduce, 2, sizeof(cl_mem), (void *)&fc0_reduction); 00242 ciErrNum |= clSetKernelArg(ckReduce, 3, sizeof(cl_mem), (void *)&fc1_reduction); 00243 ciErrNum |= clSetKernelArg(ckReduce, 4, sizeof(int), (void *)&bodyOffsetStride); 00244 ciErrNum |= clSetKernelArg(ckReduce, 5, sizeof(int), (void *)&bodySize); 00245 ciErrNum |= clSetKernelArg(ckReduce, 6, sizeof(int), (void *)&bodyReductionSize); 00246 oclCheckError(ciErrNum, CL_SUCCESS); 00247 } 00248 00249 }