00001 #ifndef PARALLEL_KERNELS_H
00002 #define PARALLEL_KERNELS_H
00003
00004 #define C_ID(index,i) (index+(cStride*(i)))
00005 #define B_ID(index,i) (index+(bStride*(i)))
00006
00007 static uint2 s_blockIdx, s_blockDim, s_threadIdx;
00008
00009 template <typename T> static inline dxDevice T parallel_inf() { return 0.0; }
00010 template <typename T> static inline dxDevice T parallel_zero() { return 0.0; }
00011
00012 template <> inline dxDevice float parallel_inf<float>() { return dxParallelInfF; }
00013 template <> inline dxDevice float parallel_zero<float>() { return 0.0f ; }
00014 template <> inline dxDevice float4 parallel_zero<float4>() { return make_float4( 0.0f ); }
00015 #ifdef CUDA_DOUBLESUPPORT
00016 template <> inline dxDevice double parallel_inf<double>() { return dxParallelInfD; }
00017 template <> inline dxDevice double parallel_zero<double>() { return 0.0; }
00018 template <> inline dxDevice double4 parallel_zero<double4>() { return make_double4( 0.0 ); }
00019 #else
00020 template <> inline dxDevice double parallel_inf <double>() { return dxParallelInfF; }
00021 template <> inline dxDevice double parallel_zero<double>() { return 0.0; }
00022 template <> inline dxDevice double4 parallel_zero<double4>() { return make_fdouble4(0.0 ); }
00023 #endif
00024
00025 template <typename T>
00026 dxGlobal void
00027 cudaZeroT( T *buffer, const int bufferSize )
00028 {
00029 int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00030
00031 if( index < bufferSize )
00032 buffer[ index ] = parallel_zero<T>();
00033 }
00034
00035
00036 template<typename T>
00037 dxGlobal void
00038 cudaSORLCPT( typename vec4<T>::Type *fc0_reduction,
00039 typename vec4<T>::Type *fc1_reduction,
00040 T* lambda,
00041 const int4 *bodyIDs,
00042 const int *fIDs,
00043 const typename vec4<T>::Type *j,
00044 const typename vec4<T>::Type *ij,
00045 const typename vec4<T>::Type *fc0,
00046 const typename vec4<T>::Type *fc1,
00047 const T* adcfm,
00048 const T* rhs,
00049 const T* hilo,
00050 const int offset,
00051 const int numConstraints,
00052 const int bStride,
00053 const int cStride )
00054 {
00055 typedef typename vec4<T>::Type Vec4T;
00056
00057 int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00058
00059
00060 if( index >= numConstraints )
00061 return;
00062
00063 index += offset;
00064
00065 T old_lambda = lambda[ index ];
00066
00067 int4 bodyID = bodyIDs[ index ];
00068
00069 Vec4T fc00 = fc0[ bodyID.x ];
00070 Vec4T fc01 = fc1[ bodyID.x ];
00071 Vec4T fc10 = make_vec4( (T)0.0 );
00072 Vec4T fc11 = make_vec4( (T)0.0 );
00073
00074 Vec4T j0_temp = j[ index ];
00075 Vec4T j1_temp = j[ C_ID(index,1) ];
00076 Vec4T j2_temp = j[ C_ID(index,2) ];
00077 Vec4T j3_temp = j[ C_ID(index,3) ];
00078
00079 T delta = rhs[ index ] - old_lambda * adcfm[ index ];
00080
00081 if( bodyID.y >= 0 ) {
00082 fc10 = fc0[ bodyID.y ];
00083 fc11 = fc1[ bodyID.y ];
00084 }
00085
00086 {
00087 delta -= dot( fc00, j0_temp );
00088 delta -= dot( fc01, j1_temp );
00089 if (bodyID.y >= 0) {
00090 delta -= dot( fc10, j2_temp );
00091 delta -= dot( fc11, j3_temp );
00092 }
00093 }
00094
00095
00096 T lo_act = hilo[ index ];
00097 T hi_act = hilo[ C_ID(index,1) ];
00098 int fID = fIDs[ index ];
00099
00100 if (fID >= 0) {
00101 hi_act = abs( hi_act * lambda[ fID ]);
00102 lo_act = -hi_act;
00103 }
00104
00105 T new_lambda = old_lambda + delta;
00106 T final_lambda = new_lambda;
00107
00108 if (new_lambda < lo_act) {
00109 delta = lo_act-old_lambda;
00110 final_lambda = lo_act;
00111 }
00112 else if (new_lambda > hi_act) {
00113 delta = hi_act-old_lambda;
00114 final_lambda = hi_act;
00115 }
00116 lambda[ index ] = final_lambda;
00117
00118
00119
00120
00121 j0_temp = ij[ index ];
00122 j1_temp = ij[ C_ID(index,1) ];
00123 j2_temp = ij[ C_ID(index,2) ];
00124 j3_temp = ij[ C_ID(index,3) ];
00125
00126 {
00127 j0_temp *= delta;
00128 j1_temp *= delta;
00129
00130
00131
00132
00133
00134
00135 fc0_reduction[ bodyID.z ] = j0_temp;
00136 fc1_reduction[ bodyID.z ] = j1_temp;
00137
00138
00139 if( bodyID.y >= 0 ) {
00140 j2_temp *= delta;
00141 j3_temp *= delta;
00142
00143
00144
00145
00146
00147
00148 fc0_reduction[ bodyID.w ] = j2_temp;
00149 fc1_reduction[ bodyID.w ] = j3_temp;
00150
00151 }
00152 }
00153 }
00154
00155 template<typename T>
00156 dxGlobal void
00157 cudaReduceIterativeCompactT( typename vec4<T>::Type *fc0_reduction,
00158 typename vec4<T>::Type *fc1_reduction,
00159 const int treePower )
00160 {
00161 typedef typename vec4<T>::Type Vec4T;
00162
00163 int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00164
00165
00166 Vec4T fc0 = fc0_reduction[ index ];
00167 Vec4T fc1 = fc1_reduction[ index ];
00168
00169
00170 if (fc1.w >= treePower)
00171 {
00172 fc1_reduction[ index ].w = fc1.w = 0;
00173 fc0_reduction[ index - treePower ] += fc0;
00174 fc1_reduction[ index - treePower ] += fc1;
00175 }
00176 }
00177
00178 template<typename T>
00179 dxGlobal void
00180 cudaReduceLoopedCompactT( typename vec4<T>::Type *fc0_reduction,
00181 typename vec4<T>::Type *fc1_reduction,
00182 const int size,
00183 const int step )
00184 {
00185 typedef typename vec4<T>::Type Vec4T;
00186
00187 int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00188 const uint stride = dxBlockIdx.x * dxBlockDim.x;
00189
00190 while(index < size) {
00191
00192 Vec4T fc0 = fc0_reduction[ index ];
00193 Vec4T fc1 = fc1_reduction[ index ];
00194
00195 const uint offset = 0x1 << (step - 1);
00196 const uint next = index - offset;
00197
00198 if( static_cast<int>(fc1.w) & offset) {
00199 fc1_reduction[ index ].w = fc1.w = 0;
00200 fc0_reduction[ next ] += fc0;
00201 fc1_reduction[ next ] += fc1;
00202 }
00203 index += stride;
00204 }
00205 }
00206
00207 template<typename T>
00208 dxGlobal void
00209 cudaReduceStridedT( typename vec4<T>::Type *fc0,
00210 typename vec4<T>::Type *fc1,
00211 const typename vec4<T>::Type *fc0_reduction,
00212 const typename vec4<T>::Type *fc1_reduction,
00213 const int reductionStride,
00214 const int bodySize,
00215 const int reductionSize )
00216 {
00217 typedef typename vec4<T>::Type Vec4T;
00218
00219 const int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00220 if( index >= bodySize ) return;
00221
00222 int nextIndex = index + reductionStride;
00223
00224 Vec4T sum0 = fc0_reduction[ index ];
00225 Vec4T sum1 = fc1_reduction[ index ];
00226
00227 while(nextIndex < reductionSize) {
00228
00229 sum0 += fc0_reduction[ nextIndex ];
00230 sum1 += fc1_reduction[ nextIndex ];
00231 nextIndex += reductionStride;
00232 }
00233
00234 fc0[ index ] += sum0;
00235 fc1[ index ] += sum1;
00236 }
00237
00238 template <typename T, unsigned int blockSize>
00239 dxGlobal void
00240 cudaReduceSequentialT( typename vec4<T>::Type *fc0,
00241 typename vec4<T>::Type *fc1,
00242 typename vec4<T>::Type *fc0_reduction,
00243 typename vec4<T>::Type *fc1_reduction,
00244 int n )
00245 {
00246 typedef typename vec3<T>::Type Vec3T;
00247 typedef typename vec4<T>::Type Vec4T;
00248
00249 dxShared Vec3T sdata0[ blockSize * 2];
00250 dxShared Vec3T sdata1[ blockSize * 2];
00251
00252 unsigned int tid = dxThreadIdx.x;
00253 unsigned int bid = dxBlockIdx.x;
00254 unsigned int i = dxBlockIdx.x*blockSize + dxThreadIdx.x;
00255
00256 if(i >= n ) return;
00257
00258
00259 Vec3T fc0Sum = make_vec3( fc0_reduction[ i ] );
00260 Vec3T fc1Sum = make_vec3( fc1_reduction[ i ] );
00261
00262
00263 sdata0[ tid ] = fc0Sum;
00264 sdata1[ tid ] = fc1Sum;
00265
00266 dxSyncthreads();
00267
00268
00269 if (blockSize >= 512) {
00270 if (tid < 256) {
00271 sdata0[tid] = fc0Sum = fc0Sum + sdata0[tid + 256];
00272 sdata1[tid] = fc1Sum = fc1Sum + sdata1[tid + 256];
00273 }
00274 dxSyncthreads();
00275 }
00276 if (blockSize >= 256) {
00277 if (tid < 128) {
00278 sdata0[tid] = fc0Sum = fc0Sum + sdata0[tid + 128];
00279 sdata1[tid] = fc1Sum = fc1Sum + sdata1[tid + 128];
00280 }
00281 dxSyncthreads();
00282 }
00283 if (blockSize >= 128) {
00284 if (tid < 64) {
00285 sdata0[tid] = fc0Sum = fc0Sum + sdata0[tid + 64];
00286 sdata1[tid] = fc1Sum = fc1Sum + sdata1[tid + 64];
00287 }
00288 dxSyncthreads();
00289 }
00290
00291 if (tid < 32)
00292 {
00293
00294 volatile Vec3T* smem0 = sdata0;
00295 volatile Vec3T* smem1 = sdata1;
00296
00297 if (blockSize >= 64) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+32]);
00298 add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+32]); }
00299 if (blockSize >= 32) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+16]);
00300 add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+16]); }
00301 if (blockSize >= 16) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 8]);
00302 add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 8]); }
00303 if (blockSize >= 8) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 4]);
00304 add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 4]); }
00305 if (blockSize >= 4) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 2]);
00306 add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 2]); }
00307 if (blockSize >= 2) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 1]);
00308 add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 1]); }
00309 }
00310
00311
00312 if(tid == 0)
00313 {
00314 fc0[ bid ] += make_vec4( sdata0[0] );
00315 fc1[ bid ] += make_vec4( sdata1[0] );
00316 }
00317 }
00318
00319 template<typename T>
00320 dxGlobal void
00321 cudaComputeInvMJTT( int4 *bodyIDs,
00322 typename vec4<T>::Type *j0,
00323 typename vec4<T>::Type *j1,
00324 typename vec4<T>::Type *j2,
00325 typename vec4<T>::Type *j3,
00326 typename vec4<T>::Type *ij0,
00327 typename vec4<T>::Type *ij1,
00328 typename vec4<T>::Type *ij2,
00329 typename vec4<T>::Type *ij3,
00330 T *iMass,
00331 int numConstraints,
00332 typename vec4<T>::Type *ii0,
00333 typename vec4<T>::Type *ii1,
00334 typename vec4<T>::Type *ii2)
00335 {
00336 typedef typename vec3<T>::Type Vec3T;
00337
00338 int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00339
00340 if (index >= numConstraints)
00341 return;
00342
00343 Vec3T ij_temp, ii_temp, j_temp;
00344
00345 int4 bodyID = bodyIDs[ index ];
00346 int body0ID = bodyID.x;
00347 int body1ID = bodyID.y;
00348
00349 register T k = iMass[ body0ID ];
00350
00351
00352 ij0[ index ] = j0[ index ] * k;
00353
00354 ii_temp = make_vec3( ii0[ body0ID ] );
00355 j_temp = make_vec3( j1[ index] );
00356 ij_temp.x = dot( ii_temp, j_temp );
00357
00358 ii_temp = make_vec3( ii1[ body0ID ] );
00359 j_temp = make_vec3( j1[ index] );
00360 ij_temp.y = dot( ii_temp, j_temp );
00361
00362 ii_temp = make_vec3( ii2[ body0ID ] );
00363 j_temp = make_vec3( j1[ index] );
00364 ij_temp.z = dot( ii_temp, j_temp );
00365
00366
00367 ij1[ index ] = make_vec4( ij_temp );
00368
00369 if( body1ID >= 0 ) {
00370 k = iMass[ body1ID ];
00371
00372
00373 ij2[ index ] = j2[ index ] * k;
00374
00375 ii_temp = make_vec3( ii0[ body1ID ] );
00376 j_temp = make_vec3( j3[ index] );
00377 ij_temp.x = dot( ii_temp, j_temp );
00378
00379 ii_temp = make_vec3( ii1[ body1ID ] );
00380 j_temp = make_vec3( j3[ index] );
00381 ij_temp.y = dot( ii_temp, j_temp );
00382
00383 ii_temp = make_vec3( ii2[ body1ID ] );
00384 j_temp = make_vec3( j3[ index] );
00385 ij_temp.z = dot( ii_temp, j_temp );
00386
00387
00388 ij3[ index ] = make_vec4( ij_temp );
00389
00390 } else {
00391
00392 ij2[ index ] = make_vec4( (T)0.0 );
00393 ij3[ index ] = make_vec4( (T)0.0 );
00394 }
00395 }
00396
00397 template<typename T>
00398 dxGlobal void
00399 cudaComputeAdcfmBT( int4 *bodyIDs,
00400 typename vec4<T>::Type *j0,
00401 typename vec4<T>::Type *j1,
00402 typename vec4<T>::Type *j2,
00403 typename vec4<T>::Type *j3,
00404 typename vec4<T>::Type *ij0,
00405 typename vec4<T>::Type *ij1,
00406 typename vec4<T>::Type *ij2,
00407 typename vec4<T>::Type *ij3,
00408 T *adcfm,
00409 T *rhs,
00410 T sorParam, int numConstraints)
00411 {
00412 typedef typename vec3<T>::Type Vec3T;
00413
00414 int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00415
00416 if (index >= numConstraints)
00417 return;
00418
00419 int body1ID = bodyIDs[ index ].y;
00420
00421 register T adcfm_i = 0.0;
00422 register T cfm_i = adcfm[ index ];
00423 register Vec3T j0_temp = make_vec3( j0[ index ] );
00424 register Vec3T j1_temp = make_vec3( j1[ index ] );
00425 register Vec3T ij0_temp = make_vec3( ij0[ index ] );
00426 register Vec3T ij1_temp = make_vec3( ij1[ index ] );
00427
00428 {
00429 adcfm_i += dot( j0_temp, ij0_temp );
00430 adcfm_i += dot( j1_temp, ij1_temp );
00431
00432 if(body1ID >= 0) {
00433 j0_temp = make_vec3( j2[ index ] );
00434 j1_temp = make_vec3( j3[ index ] );
00435 ij0_temp = make_vec3( ij2[ index ] );
00436 ij1_temp = make_vec3( ij3[ index ] );
00437
00438 adcfm_i += dot( j0_temp, ij0_temp );
00439 adcfm_i += dot( j1_temp, ij1_temp );
00440 }
00441 adcfm_i = sorParam / (adcfm_i + cfm_i);
00442 }
00443
00444 {
00445 j0[ index ] *= adcfm_i;
00446 j1[ index ] *= adcfm_i;
00447 j2[ index ] *= adcfm_i;
00448 j3[ index ] *= adcfm_i;
00449 rhs[ index ] *= adcfm_i;
00450 adcfm[ index ] = adcfm_i + cfm_i;
00451 }
00452
00453 }
00454
00455 template<typename T>
00456 dxGlobal void
00457 cudaIntegrateT( typename vec4<T>::Type* pos,
00458 typename vec4<T>::Type* lVel,
00459 typename vec4<T>::Type* aVel,
00460 float deltaTime,
00461 int numConstraints)
00462 {
00463 int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00464
00465 if (index >= numConstraints)
00466 return;
00467
00468
00469
00470 }
00471
00472 #endif