#ifndef PARALLEL_KERNELS_H
#define PARALLEL_KERNELS_H

// Index helpers; both expect a cStride/bStride variable in the enclosing scope.
#define C_ID(index,i) (index+(cStride*(i)))
#define B_ID(index,i) (index+(bStride*(i)))

static uint2 s_blockIdx, s_blockDim, s_threadIdx;

template <typename T> static inline dxDevice T parallel_inf() { return 0.0; }
template <typename T> static inline dxDevice T parallel_zero() { return 0.0; }

template <> inline dxDevice float parallel_inf<float>() { return dxParallelInfF; }
template <> inline dxDevice float parallel_zero<float>() { return 0.0f; }
template <> inline dxDevice float4 parallel_zero<float4>() { return make_float4( 0.0f ); }
#ifdef CUDA_DOUBLESUPPORT
template <> inline dxDevice double parallel_inf<double>() { return dxParallelInfD; }
template <> inline dxDevice double parallel_zero<double>() { return 0.0; }
template <> inline dxDevice double4 parallel_zero<double4>() { return make_double4( 0.0 ); }
#else
// Without native double support, fall back to the single-precision infinity.
template <> inline dxDevice double parallel_inf<double>() { return dxParallelInfF; }
template <> inline dxDevice double parallel_zero<double>() { return 0.0; }
template <> inline dxDevice double4 parallel_zero<double4>() { return make_double4( 0.0 ); }
#endif

// Fill a buffer with the type-appropriate zero, one element per thread.
template <typename T>
dxGlobal void
cudaZeroT( T *buffer, const int bufferSize )
{
    int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;

    if( index < bufferSize )
        buffer[ index ] = parallel_zero<T>();
}
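/*
  Usage sketch for cudaZeroT (hypothetical host-side code, assuming the dx*
  macros expand to their plain CUDA counterparts: dxGlobal -> __global__,
  dxBlockIdx/dxBlockDim/dxThreadIdx -> blockIdx/blockDim/threadIdx):

      const int threads = 256;
      const int blocks  = ( bufferSize + threads - 1 ) / threads;  // round up
      cudaZeroT<float><<< blocks, threads >>>( devBuffer, bufferSize );

  `devBuffer` stands in for any device allocation of `bufferSize` floats.
*/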
//template<typename T, bool bUseAtomics, bool bUseProjection>
template<typename T>
dxGlobal void
cudaSORLCPT( typename vec4<T>::Type *fc0_reduction,
             typename vec4<T>::Type *fc1_reduction,
             T *lambda,
             const int4 *bodyIDs,
             const int *fIDs,
             const typename vec4<T>::Type *j,
             const typename vec4<T>::Type *ij,
             const typename vec4<T>::Type *fc0,
             const typename vec4<T>::Type *fc1,
             const T *adcfm,
             const T *rhs,
             const T *hilo,
             const int offset,
             const int numConstraints,
             const int bStride,
             const int cStride )
{
    typedef typename vec4<T>::Type Vec4T;

    int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
    //int index = dxGlobalIdxX();

    if( index >= numConstraints )
        return;

    index += offset;

    T old_lambda = lambda[ index ];

    int4 bodyID = bodyIDs[ index ];

    Vec4T fc00 = fc0[ bodyID.x ];
    Vec4T fc01 = fc1[ bodyID.x ];
    Vec4T fc10 = make_vec4( (T)0.0 );
    Vec4T fc11 = make_vec4( (T)0.0 );

    Vec4T j0_temp = j[ index ];
    Vec4T j1_temp = j[ C_ID(index,1) ];
    Vec4T j2_temp = j[ C_ID(index,2) ];
    Vec4T j3_temp = j[ C_ID(index,3) ];

    T delta = rhs[ index ] - old_lambda * adcfm[ index ];

    // The second body is optional; bodyID.y < 0 marks a single-body constraint.
    if( bodyID.y >= 0 ) {
        fc10 = fc0[ bodyID.y ];
        fc11 = fc1[ bodyID.y ];
    }

    {
        delta -= dot( fc00, j0_temp );
        delta -= dot( fc01, j1_temp );
        if( bodyID.y >= 0 ) {
            delta -= dot( fc10, j2_temp );
            delta -= dot( fc11, j3_temp );
        }
    }

    // if( bUseProjection ) {
    T lo_act = hilo[ index ];
    T hi_act = hilo[ C_ID(index,1) ];
    int fID = fIDs[ index ];

    // Friction rows: rescale the box limits by the associated normal row's lambda.
    if( fID >= 0 ) {
        hi_act = abs( hi_act * lambda[ fID ] );
        lo_act = -hi_act;
    }

    T new_lambda = old_lambda + delta;
    T final_lambda = new_lambda;

    if( new_lambda < lo_act ) {
        delta = lo_act - old_lambda;
        final_lambda = lo_act;
    }
    else if( new_lambda > hi_act ) {
        delta = hi_act - old_lambda;
        final_lambda = hi_act;
    }
    lambda[ index ] = final_lambda;
    // } else {
    //     lambda[ index ] = old_lambda + delta;
    // }

    j0_temp = ij[ index ];
    j1_temp = ij[ C_ID(index,1) ];
    j2_temp = ij[ C_ID(index,2) ];
    j3_temp = ij[ C_ID(index,3) ];

    {
        j0_temp *= delta;
        j1_temp *= delta;

        // STORE
        //if( bUseAtomics ) {
        //    myAtomicVecAdd<T>(fc0[ bodyID.x ], j0_temp);
        //    myAtomicVecAdd<T>(fc1[ bodyID.x ], j1_temp);
        //} else {
        fc0_reduction[ bodyID.z ] = j0_temp;
        fc1_reduction[ bodyID.z ] = j1_temp;
        //}

        if( bodyID.y >= 0 ) {
            j2_temp *= delta;
            j3_temp *= delta;

            // STORE
            //if( bUseAtomics ) {
            //    myAtomicVecAdd<T>(fc0[ bodyID.y ], j2_temp);
            //    myAtomicVecAdd<T>(fc1[ bodyID.y ], j3_temp);
            //} else {
            fc0_reduction[ bodyID.w ] = j2_temp;
            fc1_reduction[ bodyID.w ] = j3_temp;
            //}
        }
    }
}
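/*
  What one cudaSORLCPT pass computes per constraint row i, in scalar form (a
  reading aid, not extra code; rhs and the Jacobian rows are pre-scaled by
  cudaComputeAdcfmBT below, and fc0/fc1 hold each body's accumulated
  M^-1 J^T lambda contributions):

      delta_i   = rhs_i - adcfm_i * lambda_i - J_i . fc(body0, body1)
      lambda_i' = clamp( lambda_i + delta_i, lo_i, hi_i )

  The clamped delta, multiplied into the precomputed M^-1 J^T rows (ij), is
  staged into fc0_reduction/fc1_reduction at the per-body slots bodyID.z and
  bodyID.w, to be summed by the reduction kernels that follow. For friction
  rows (fID >= 0) the limits become hi = |hi * lambda_fID|, lo = -hi, a box
  approximation of the friction cone.
*/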
template<typename T>
dxGlobal void
cudaReduceIterativeCompactT( typename vec4<T>::Type *fc0_reduction,
                             typename vec4<T>::Type *fc1_reduction,
                             const int treePower )
{
    typedef typename vec4<T>::Type Vec4T;

    int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;

    // Should we do an atomicExch here to zero out the old memory?
    Vec4T fc0 = fc0_reduction[ index ];
    Vec4T fc1 = fc1_reduction[ index ];

    // fc1.w is a repetition counter: gather if >= 8,4,2,1. Do nothing if == 0.
    if( fc1.w >= treePower )
    {
        fc1_reduction[ index ].w = fc1.w = 0; // mark as processed
        fc0_reduction[ index - treePower ] += fc0;
        fc1_reduction[ index - treePower ] += fc1;
    }
}

template<typename T>
dxGlobal void
cudaReduceLoopedCompactT( typename vec4<T>::Type *fc0_reduction,
                          typename vec4<T>::Type *fc1_reduction,
                          const int size,
                          const int step )
{
    typedef typename vec4<T>::Type Vec4T;

    int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
    // Grid-wide stride for the work loop. NOTE: the original listing computed
    // dxBlockIdx.x * dxBlockDim.x, which is zero in block 0 (an endless loop)
    // and differs per block; the grid's total thread count is what is meant.
    // This assumes a dxGridDim macro analogous to dxBlockDim (gridDim in CUDA).
    const uint stride = dxGridDim.x * dxBlockDim.x;

    while( index < size ) {
        // Should we do an atomicExch here to zero out the old memory?
        Vec4T fc0 = fc0_reduction[ index ];
        Vec4T fc1 = fc1_reduction[ index ];

        const uint offset = 0x1 << (step - 1);
        const uint next = index - offset;

        if( static_cast<int>(fc1.w) & offset ) {
            fc1_reduction[ index ].w = fc1.w = 0;
            fc0_reduction[ next ] += fc0;
            fc1_reduction[ next ] += fc1;
        }
        index += stride;
    }
}
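/*
  Worked example of the bitwise gather in cudaReduceLoopedCompactT, assuming
  one body whose force contributions occupy six consecutive reduction slots
  and fc1.w stores each slot's distance from the head of the run, [0..5]:

      step 1 (offset 1): slots with w & 1 (1,3,5) add into slots 0,2,4
      step 2 (offset 2): slots with w & 2 (2)     add into slot  0
      step 3 (offset 4): slots with w & 4 (4)     add into slot  0

  Each processed slot zeroes its counter, so after ceil(log2(runLength))
  launches the head slot holds the whole sum. cudaReduceIterativeCompactT is
  the same idea driven by a descending threshold (treePower = 8,4,2,1)
  instead of a bit test.
*/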
template<typename T>
dxGlobal void
cudaReduceStridedT( typename vec4<T>::Type *fc0,
                    typename vec4<T>::Type *fc1,
                    const typename vec4<T>::Type *fc0_reduction,
                    const typename vec4<T>::Type *fc1_reduction,
                    const int reductionStride,
                    const int bodySize,
                    const int reductionSize )
{
    typedef typename vec4<T>::Type Vec4T;

    const int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
    if( index >= bodySize ) return;

    int nextIndex = index + reductionStride;

    Vec4T sum0 = fc0_reduction[ index ];
    Vec4T sum1 = fc1_reduction[ index ];

    while( nextIndex < reductionSize ) {
        // Should we do an atomicExch here to zero out the old memory?
        sum0 += fc0_reduction[ nextIndex ];
        sum1 += fc1_reduction[ nextIndex ];
        nextIndex += reductionStride;
    }

    fc0[ index ] += sum0;
    fc1[ index ] += sum1;
}

template <typename T, unsigned int blockSize>
dxGlobal void
cudaReduceSequentialT( typename vec4<T>::Type *fc0,
                       typename vec4<T>::Type *fc1,
                       typename vec4<T>::Type *fc0_reduction,
                       typename vec4<T>::Type *fc1_reduction,
                       int n )
{
    typedef typename vec3<T>::Type Vec3T;
    typedef typename vec4<T>::Type Vec4T;

    dxShared Vec3T sdata0[ blockSize * 2 ];
    dxShared Vec3T sdata1[ blockSize * 2 ];

    unsigned int tid = dxThreadIdx.x;
    unsigned int bid = dxBlockIdx.x;
    unsigned int i = dxBlockIdx.x * blockSize + dxThreadIdx.x;

    // Out-of-range threads load zero instead of returning: an early return
    // here (as in the original listing) would reach dxSyncthreads() with
    // divergent threads, which is undefined behavior.
    // Should we do an atomicExch here to zero out the old memory?
    Vec3T fc0Sum = ( i < n ) ? make_vec3( fc0_reduction[ i ] )
                             : make_vec3( parallel_zero<Vec4T>() );
    Vec3T fc1Sum = ( i < n ) ? make_vec3( fc1_reduction[ i ] )
                             : make_vec3( parallel_zero<Vec4T>() );

    // each thread puts its local sum into shared memory
    sdata0[ tid ] = fc0Sum;
    sdata1[ tid ] = fc1Sum;

    dxSyncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) {
        if (tid < 256) {
            sdata0[tid] = fc0Sum = fc0Sum + sdata0[tid + 256];
            sdata1[tid] = fc1Sum = fc1Sum + sdata1[tid + 256];
        }
        dxSyncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) {
            sdata0[tid] = fc0Sum = fc0Sum + sdata0[tid + 128];
            sdata1[tid] = fc1Sum = fc1Sum + sdata1[tid + 128];
        }
        dxSyncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) {
            sdata0[tid] = fc0Sum = fc0Sum + sdata0[tid + 64];
            sdata1[tid] = fc1Sum = fc1Sum + sdata1[tid + 64];
        }
        dxSyncthreads();
    }

    if (tid < 32)
    {
        // Within a warp we require the volatile specifier for correct storage behavior.
        volatile Vec3T* smem0 = sdata0;
        volatile Vec3T* smem1 = sdata1;

        if (blockSize >= 64) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+32]);
                               add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+32]); }
        if (blockSize >= 32) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+16]);
                               add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+16]); }
        if (blockSize >= 16) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 8]);
                               add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 8]); }
        if (blockSize >=  8) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 4]);
                               add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 4]); }
        if (blockSize >=  4) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 2]);
                               add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 2]); }
        if (blockSize >=  2) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 1]);
                               add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 1]); }
    }

    // write result for this block to global mem
    if( tid == 0 )
    {
        fc0[ bid ] += make_vec4( sdata0[0] );
        fc1[ bid ] += make_vec4( sdata1[0] );
    }
}
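/*
  Launch sketch for cudaReduceSequentialT (hypothetical host-side code). The
  blockSize template argument must equal the launch's threads-per-block,
  since it sizes the shared-memory staging arrays; each block folds its
  blockSize reduction slots into fc0[blockIdx]/fc1[blockIdx]:

      const unsigned int threads = 128;
      const int blocks = ( n + (int)threads - 1 ) / (int)threads;
      cudaReduceSequentialT<float, 128><<< blocks, threads >>>(
          fc0, fc1, fc0_reduction, fc1_reduction, n );
*/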
template<typename T>
dxGlobal void
cudaComputeInvMJTT( int4 *bodyIDs,
                    typename vec4<T>::Type *j0,
                    typename vec4<T>::Type *j1,
                    typename vec4<T>::Type *j2,
                    typename vec4<T>::Type *j3,
                    typename vec4<T>::Type *ij0,
                    typename vec4<T>::Type *ij1,
                    typename vec4<T>::Type *ij2,
                    typename vec4<T>::Type *ij3,
                    T *iMass,
                    int numConstraints,
                    typename vec4<T>::Type *ii0,
                    typename vec4<T>::Type *ii1,
                    typename vec4<T>::Type *ii2 )
{
    typedef typename vec3<T>::Type Vec3T;

    int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;

    if (index >= numConstraints)
        return;

    Vec3T ij_temp, ii_temp, j_temp;

    int4 bodyID = bodyIDs[ index ];
    int body0ID = bodyID.x;
    int body1ID = bodyID.y;

    register T k = iMass[ body0ID ];

    // Store: linear part scaled by the inverse mass.
    ij0[ index ] = j0[ index ] * k;

    // Angular part: dot the angular Jacobian row against each row of the
    // inverse inertia tensor (j_temp is the same row for all three dots).
    ii_temp = make_vec3( ii0[ body0ID ] );
    j_temp = make_vec3( j1[ index ] );
    ij_temp.x = dot( ii_temp, j_temp );

    ii_temp = make_vec3( ii1[ body0ID ] );
    ij_temp.y = dot( ii_temp, j_temp );

    ii_temp = make_vec3( ii2[ body0ID ] );
    ij_temp.z = dot( ii_temp, j_temp );

    // Store
    ij1[ index ] = make_vec4( ij_temp );

    if( body1ID >= 0 ) {
        k = iMass[ body1ID ];

        // Store
        ij2[ index ] = j2[ index ] * k;

        ii_temp = make_vec3( ii0[ body1ID ] );
        j_temp = make_vec3( j3[ index ] );
        ij_temp.x = dot( ii_temp, j_temp );

        ii_temp = make_vec3( ii1[ body1ID ] );
        ij_temp.y = dot( ii_temp, j_temp );

        ii_temp = make_vec3( ii2[ body1ID ] );
        ij_temp.z = dot( ii_temp, j_temp );

        // Store
        ij3[ index ] = make_vec4( ij_temp );

    } else {
        // Store
        ij2[ index ] = make_vec4( (T)0.0 );
        ij3[ index ] = make_vec4( (T)0.0 );
    }
}

template<typename T>
dxGlobal void
cudaComputeAdcfmBT( int4 *bodyIDs,
                    typename vec4<T>::Type *j0,
                    typename vec4<T>::Type *j1,
                    typename vec4<T>::Type *j2,
                    typename vec4<T>::Type *j3,
                    typename vec4<T>::Type *ij0,
                    typename vec4<T>::Type *ij1,
                    typename vec4<T>::Type *ij2,
                    typename vec4<T>::Type *ij3,
                    T *adcfm,
                    T *rhs,
                    T sorParam,
                    int numConstraints )
{
    typedef typename vec3<T>::Type Vec3T;

    int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;

    if (index >= numConstraints)
        return;

    int body1ID = bodyIDs[ index ].y;

    register T adcfm_i = 0.0;
    register T cfm_i = adcfm[ index ];
    register Vec3T j0_temp = make_vec3( j0[ index ] );
    register Vec3T j1_temp = make_vec3( j1[ index ] );
    register Vec3T ij0_temp = make_vec3( ij0[ index ] );
    register Vec3T ij1_temp = make_vec3( ij1[ index ] );

    // Diagonal term J * (M^-1 J^T), plus the second body's share if present.
    {
        adcfm_i += dot( j0_temp, ij0_temp );
        adcfm_i += dot( j1_temp, ij1_temp );

        if( body1ID >= 0 ) {
            j0_temp = make_vec3( j2[ index ] );
            j1_temp = make_vec3( j3[ index ] );
            ij0_temp = make_vec3( ij2[ index ] );
            ij1_temp = make_vec3( ij3[ index ] );

            adcfm_i += dot( j0_temp, ij0_temp );
            adcfm_i += dot( j1_temp, ij1_temp );
        }
        adcfm_i = sorParam / (adcfm_i + cfm_i);
    }

    // Pre-scale the Jacobian rows and right-hand side by the SOR factor.
    {
        j0[ index ] *= adcfm_i;
        j1[ index ] *= adcfm_i;
        j2[ index ] *= adcfm_i;
        j3[ index ] *= adcfm_i;
        rhs[ index ] *= adcfm_i;
        adcfm[ index ] = adcfm_i + cfm_i;
    }
}

template<typename T>
dxGlobal void
cudaIntegrateT( typename vec4<T>::Type *pos,
                typename vec4<T>::Type *lVel,
                typename vec4<T>::Type *aVel,
                float deltaTime,
                int numConstraints )
{
    int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;

    if (index >= numConstraints)
        return;

    // Integration is not implemented yet; the intended body is sketched below.
    //typename vec4<T>::Type position = pos[index];
    //typename vec3<T>::Type accel = computeBodyAccel<T, multithreadBodies>(position, oldPos, numBodies);
}

#endif // PARALLEL_KERNELS_H
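/*
  Per-step kernel ordering, inferred from the signatures above rather than
  copied from the host driver (a sketch, not a specification):

      cudaComputeInvMJTT     // ij = M^-1 J^T per constraint row
      cudaComputeAdcfmBT     // adcfm = sor / (J M^-1 J^T + cfm); pre-scale J, rhs
      repeat for each SOR iteration:
          cudaSORLCPT        // relax lambda, stage per-body force deltas
          cudaReduce*T       // fold staged deltas into fc0/fc1 (one variant)
      cudaIntegrateT         // advance body state (currently a stub)
*/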