parallel_quickstep: parallel_kernels.h Source File

Go to the documentation of this file.
00001 #ifndef PARALLEL_KERNELS_H
00002 #define PARALLEL_KERNELS_H
00003 
00004 #define C_ID(index,i)     (index+(cStride*(i)))
00005 #define B_ID(index,i)     (index+(bStride*(i)))
00006 
00007 static uint2 s_blockIdx, s_blockDim, s_threadIdx;
00008 
00009 template <typename T> static inline dxDevice T       parallel_inf()         { return 0.0; }
00010 template <typename T> static inline dxDevice T       parallel_zero()         { return 0.0; }
00011 
00012 template <>                  inline dxDevice float   parallel_inf<float>()  { return dxParallelInfF; }
00013 template <>                  inline dxDevice float   parallel_zero<float>()  { return  0.0f ; }
00014 template <>                  inline dxDevice float4  parallel_zero<float4>()  { return make_float4( 0.0f ); }
00015 #ifdef CUDA_DOUBLESUPPORT
00016 template <>                  inline dxDevice double  parallel_inf<double>() { return dxParallelInfD; }
00017 template <>                  inline dxDevice double  parallel_zero<double>()  { return 0.0; }
00018 template <>                  inline dxDevice double4 parallel_zero<double4>()  { return make_double4( 0.0 ); }
00019 #else
00020 template <>                  inline dxDevice double  parallel_inf <double>()   { return dxParallelInfF; }
00021 template <>                  inline dxDevice double  parallel_zero<double>()   { return 0.0; }
00022 template <>                  inline dxDevice double4 parallel_zero<double4>()  { return make_fdouble4(0.0 ); }
00023 #endif
00024 
00025 template <typename T>
00026 dxGlobal void
00027 cudaZeroT( T *buffer, const int bufferSize )
00028 {
00029   int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00030 
00031   if( index < bufferSize )
00032     buffer[ index ] = parallel_zero<T>();
00033 }
00034 
00035 //template<typename T, bool bUseAtomics, bool bUseProjection>
00036 template<typename T>
00037 dxGlobal void
00038 cudaSORLCPT( typename vec4<T>::Type *fc0_reduction,
00039              typename vec4<T>::Type *fc1_reduction,
00040              T* lambda,
00041              const int4 *bodyIDs,
00042              const int *fIDs,
00043              const typename vec4<T>::Type *j,
00044              const typename vec4<T>::Type *ij,
00045              const typename vec4<T>::Type *fc0,
00046              const typename vec4<T>::Type *fc1,
00047              const T* adcfm,
00048              const T* rhs,
00049              const T* hilo,
00050              const int offset,
00051              const int numConstraints,
00052              const int bStride,
00053              const int cStride )
00054 {
00055   typedef typename vec4<T>::Type Vec4T;
00056 
00057   int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00058   //int index = dxGlobalIdxX();
00059 
00060   if( index >= numConstraints )
00061     return;
00062 
00063   index += offset;
00064 
00065   T old_lambda = lambda[ index ];
00066 
00067   int4 bodyID = bodyIDs[ index ];
00068 
00069   Vec4T fc00 = fc0[ bodyID.x ];
00070   Vec4T fc01 = fc1[ bodyID.x ];
00071   Vec4T fc10 = make_vec4( (T)0.0 );
00072   Vec4T fc11 = make_vec4( (T)0.0 );
00073 
00074   Vec4T j0_temp = j[      index    ];
00075   Vec4T j1_temp = j[ C_ID(index,1) ];
00076   Vec4T j2_temp = j[ C_ID(index,2) ];
00077   Vec4T j3_temp = j[ C_ID(index,3) ];
00078 
00079   T  delta = rhs[ index ] - old_lambda * adcfm[ index ];
00080 
00081   if( bodyID.y >= 0 )  {
00082     fc10 = fc0[ bodyID.y ];
00083     fc11 = fc1[ bodyID.y ];
00084   }
00085 
00086   {
00087     delta -= dot( fc00, j0_temp );
00088     delta -= dot( fc01, j1_temp );
00089     if (bodyID.y >= 0) {
00090       delta -= dot( fc10, j2_temp );
00091       delta -= dot( fc11, j3_temp );
00092     }
00093   }
00094 
00095 //  if( bUseProjection ) {
00096     T lo_act = hilo[ index ];
00097     T hi_act = hilo[ C_ID(index,1) ];
00098     int fID = fIDs[ index ];
00099 
00100     if (fID >= 0) {
00101       hi_act = abs( hi_act * lambda[ fID ]);
00102       lo_act = -hi_act;
00103     }
00104 
00105     T new_lambda = old_lambda + delta;
00106     T final_lambda = new_lambda;
00107 
00108     if (new_lambda < lo_act) {
00109       delta = lo_act-old_lambda;
00110       final_lambda = lo_act;
00111     }
00112     else if (new_lambda > hi_act) {
00113       delta = hi_act-old_lambda;
00114       final_lambda = hi_act;
00115     }
00116     lambda[ index ] = final_lambda;
00117 //  } else {
00118 //    lambda[ index ] = old_lambda + delta;
00119 //  }
00120 
00121   j0_temp = ij[      index    ];
00122   j1_temp = ij[ C_ID(index,1) ];
00123   j2_temp = ij[ C_ID(index,2) ];
00124   j3_temp = ij[ C_ID(index,3) ];
00125 
00126   {
00127     j0_temp *= delta;
00128     j1_temp *= delta;
00129 
00130     // STORE
00131     //if( bUseAtomics ) {
00132     //  myAtomicVecAdd<T>(fc0[ bodyID.x ], j0_temp);
00133     //  myAtomicVecAdd<T>(fc1[ bodyID.x ], j1_temp);
00134     //} else {
00135       fc0_reduction[ bodyID.z ] = j0_temp;
00136       fc1_reduction[ bodyID.z ] = j1_temp;
00137     //}
00138 
00139     if( bodyID.y >= 0 ) {
00140       j2_temp *= delta;
00141       j3_temp *= delta;
00142 
00143       // STORE
00144       //if( bUseAtomics ) {
00145       //  myAtomicVecAdd<T>(fc0[ bodyID.y ], j2_temp);
00146       //  myAtomicVecAdd<T>(fc1[ bodyID.y ], j3_temp);
00147       //} else {
00148         fc0_reduction[ bodyID.w ] = j2_temp;
00149         fc1_reduction[ bodyID.w ] = j3_temp;
00150       //}
00151     }
00152   }
00153 }
00154 
00155 template<typename T>
00156 dxGlobal void
00157 cudaReduceIterativeCompactT( typename vec4<T>::Type *fc0_reduction,
00158                              typename vec4<T>::Type *fc1_reduction,
00159                              const int treePower )
00160 {
00161   typedef typename vec4<T>::Type Vec4T;
00162 
00163   int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00164 
00165   // Should we do an atomiceExch here to zero out the old memory?
00166   Vec4T fc0 = fc0_reduction[ index ];
00167   Vec4T fc1 = fc1_reduction[ index ];
00168 
00169   // row2.w is repetition counter: will gather if >= 8,4,2,1. Do nothing if =0.
00170   if (fc1.w >= treePower)
00171   {
00172     fc1_reduction[ index ].w = fc1.w = 0; // mark as processed
00173     fc0_reduction[ index - treePower ] += fc0;
00174     fc1_reduction[ index - treePower ] += fc1;
00175   }
00176 }
00177 
00178 template<typename T>
00179 dxGlobal void
00180 cudaReduceLoopedCompactT( typename vec4<T>::Type *fc0_reduction,
00181                           typename vec4<T>::Type *fc1_reduction,
00182                           const int size,
00183                           const int step )
00184 {
00185   typedef typename vec4<T>::Type Vec4T;
00186 
00187   int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00188   const uint stride = dxBlockIdx.x * dxBlockDim.x;
00189 
00190   while(index < size) {
00191     // Should we do an atomiceExch here to zero out the old memory?
00192     Vec4T fc0 = fc0_reduction[ index ];
00193     Vec4T fc1 = fc1_reduction[ index ];
00194 
00195     const uint offset = 0x1 << (step - 1);
00196     const uint next = index - offset;
00197 
00198     if( static_cast<int>(fc1.w) & offset) {
00199       fc1_reduction[ index ].w = fc1.w = 0;
00200       fc0_reduction[ next ] += fc0;
00201       fc1_reduction[ next ] += fc1;
00202     }
00203     index += stride;
00204   }
00205 }
00206 
00207 template<typename T>
00208 dxGlobal void
00209 cudaReduceStridedT( typename vec4<T>::Type *fc0,
00210                     typename vec4<T>::Type *fc1,
00211                     const typename vec4<T>::Type *fc0_reduction,
00212                     const typename vec4<T>::Type *fc1_reduction,
00213                     const int reductionStride,
00214                     const int bodySize,
00215                     const int reductionSize )
00216 {
00217   typedef typename vec4<T>::Type Vec4T;
00218 
00219   const int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00220   if( index >= bodySize ) return;
00221 
00222   int nextIndex = index + reductionStride;
00223 
00224   Vec4T sum0 = fc0_reduction[ index ];
00225   Vec4T sum1 = fc1_reduction[ index ];
00226 
00227   while(nextIndex < reductionSize) {
00228     // Should we do an atomiceExch here to zero out the old memory?
00229     sum0 += fc0_reduction[ nextIndex ];
00230     sum1 += fc1_reduction[ nextIndex ];
00231     nextIndex += reductionStride;
00232   }
00233 
00234   fc0[ index ] += sum0;
00235   fc1[ index ] += sum1;
00236 }
00237 
00238 template <typename T, unsigned int blockSize>
00239 dxGlobal void
00240 cudaReduceSequentialT( typename vec4<T>::Type *fc0,
00241                        typename vec4<T>::Type *fc1,
00242                        typename vec4<T>::Type *fc0_reduction,
00243                        typename vec4<T>::Type *fc1_reduction,
00244                        int n )
00245 {
00246   typedef typename vec3<T>::Type Vec3T;
00247   typedef typename vec4<T>::Type Vec4T;
00248 
00249   dxShared Vec3T sdata0[ blockSize * 2];
00250   dxShared Vec3T sdata1[ blockSize * 2];
00251 
00252   unsigned int tid = dxThreadIdx.x;
00253   unsigned int bid = dxBlockIdx.x;
00254   unsigned int i = dxBlockIdx.x*blockSize + dxThreadIdx.x;
00255 
00256   if(i >= n ) return;
00257 
00258   // Should we do an atomiceExch here to zero out the old memory?
00259   Vec3T fc0Sum = make_vec3( fc0_reduction[ i ] );
00260   Vec3T fc1Sum = make_vec3( fc1_reduction[ i ] );
00261 
00262   // each thread puts its local sum into shared memory
00263   sdata0[ tid ] = fc0Sum;
00264   sdata1[ tid ] = fc1Sum;
00265 
00266   dxSyncthreads();
00267 
00268   // do reduction in shared mem
00269   if (blockSize >= 512) {
00270     if (tid < 256) {
00271       sdata0[tid] = fc0Sum = fc0Sum + sdata0[tid + 256];
00272       sdata1[tid] = fc1Sum = fc1Sum + sdata1[tid + 256];
00273     }
00274     dxSyncthreads();
00275   }
00276   if (blockSize >= 256) {
00277     if (tid < 128) {
00278       sdata0[tid] = fc0Sum = fc0Sum + sdata0[tid + 128];
00279       sdata1[tid] = fc1Sum = fc1Sum + sdata1[tid + 128];
00280     }
00281     dxSyncthreads();
00282   }
00283   if (blockSize >= 128) {
00284     if (tid < 64) {
00285       sdata0[tid] = fc0Sum = fc0Sum + sdata0[tid + 64];
00286       sdata1[tid] = fc1Sum = fc1Sum + sdata1[tid + 64];
00287     }
00288     dxSyncthreads();
00289   }
00290 
00291   if (tid < 32)
00292   {
00293     // Within a warp we require the volatile specifier for correct storage behavior
00294     volatile Vec3T* smem0 = sdata0;
00295     volatile Vec3T* smem1 = sdata1;
00296 
00297     if (blockSize >=  64) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+32]);
00298                             add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+32]); }
00299     if (blockSize >=  32) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+16]);
00300                             add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+16]); }
00301     if (blockSize >=  16) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 8]);
00302                             add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 8]); }
00303     if (blockSize >=   8) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 4]);
00304                             add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 4]); }
00305     if (blockSize >=   4) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 2]);
00306                             add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 2]); }
00307     if (blockSize >=   2) { add_assign_volatile(smem0[tid],fc0Sum,smem0[tid+ 1]);
00308                             add_assign_volatile(smem1[tid],fc1Sum,smem1[tid+ 1]); }
00309   }
00310 
00311   // write result for this block to global mem
00312   if(tid == 0)
00313   {
00314     fc0[ bid ] += make_vec4( sdata0[0] );
00315     fc1[ bid ] += make_vec4( sdata1[0] );
00316   }
00317 }
00318 
00319 template<typename T>
00320 dxGlobal void
00321 cudaComputeInvMJTT( int4 *bodyIDs,
00322                     typename vec4<T>::Type *j0,
00323                     typename vec4<T>::Type *j1,
00324                     typename vec4<T>::Type *j2,
00325                     typename vec4<T>::Type *j3,
00326                     typename vec4<T>::Type *ij0,
00327                     typename vec4<T>::Type *ij1,
00328                     typename vec4<T>::Type *ij2,
00329                     typename vec4<T>::Type *ij3,
00330                     T *iMass,
00331                     int numConstraints,
00332                     typename vec4<T>::Type *ii0,
00333                     typename vec4<T>::Type *ii1,
00334                     typename vec4<T>::Type *ii2)
00335 {
00336   typedef typename vec3<T>::Type Vec3T;
00337 
00338   int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00339 
00340   if (index >= numConstraints)
00341     return;
00342 
00343   Vec3T ij_temp, ii_temp, j_temp;
00344 
00345   int4 bodyID = bodyIDs[ index ];
00346   int body0ID = bodyID.x;
00347   int body1ID = bodyID.y;
00348 
00349   register T k = iMass[ body0ID ];
00350 
00351   // Store
00352   ij0[ index ] = j0[ index ] * k;
00353 
00354   ii_temp = make_vec3( ii0[ body0ID ] );
00355   j_temp = make_vec3( j1[ index] );
00356   ij_temp.x = dot( ii_temp, j_temp );
00357 
00358   ii_temp = make_vec3( ii1[ body0ID ] );
00359   j_temp = make_vec3( j1[ index] );
00360   ij_temp.y = dot( ii_temp, j_temp );
00361 
00362   ii_temp = make_vec3( ii2[ body0ID ] );
00363   j_temp = make_vec3( j1[ index] );
00364   ij_temp.z = dot( ii_temp, j_temp );
00365 
00366   // Store
00367   ij1[ index ] = make_vec4( ij_temp );
00368 
00369   if( body1ID >= 0 ) {
00370     k = iMass[ body1ID ];
00371 
00372     // Store
00373     ij2[ index ] = j2[ index ] * k;
00374 
00375     ii_temp = make_vec3( ii0[ body1ID ] );
00376     j_temp = make_vec3( j3[ index] );
00377     ij_temp.x = dot( ii_temp, j_temp );
00378 
00379     ii_temp = make_vec3( ii1[ body1ID ] );
00380     j_temp = make_vec3( j3[ index] );
00381     ij_temp.y = dot( ii_temp, j_temp );
00382 
00383     ii_temp = make_vec3( ii2[ body1ID ] );
00384     j_temp = make_vec3( j3[ index] );
00385     ij_temp.z = dot( ii_temp, j_temp );
00386 
00387     // Store
00388     ij3[ index ] = make_vec4( ij_temp );
00389 
00390   } else {
00391     // Store
00392     ij2[ index ] = make_vec4( (T)0.0 );
00393     ij3[ index ] = make_vec4( (T)0.0 );
00394   }
00395 }
00396 
00397 template<typename T>
00398 dxGlobal void
00399 cudaComputeAdcfmBT( int4 *bodyIDs,
00400                     typename vec4<T>::Type *j0,
00401                     typename vec4<T>::Type *j1,
00402                     typename vec4<T>::Type *j2,
00403                     typename vec4<T>::Type *j3,
00404                     typename vec4<T>::Type *ij0,
00405                     typename vec4<T>::Type *ij1,
00406                     typename vec4<T>::Type *ij2,
00407                     typename vec4<T>::Type *ij3,
00408                     T *adcfm,
00409                     T *rhs,
00410                     T sorParam, int numConstraints)
00411 {
00412   typedef typename vec3<T>::Type Vec3T;
00413 
00414   int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00415 
00416   if (index >= numConstraints)
00417     return;
00418 
00419   int body1ID = bodyIDs[ index ].y;
00420 
00421   register T adcfm_i = 0.0;
00422   register T cfm_i = adcfm[ index ];
00423   register Vec3T j0_temp = make_vec3( j0[ index ] );
00424   register Vec3T j1_temp = make_vec3( j1[ index ] );
00425   register Vec3T ij0_temp = make_vec3( ij0[ index ] );
00426   register Vec3T ij1_temp = make_vec3( ij1[ index ] );
00427 
00428   {
00429     adcfm_i += dot( j0_temp, ij0_temp );
00430     adcfm_i += dot( j1_temp, ij1_temp );
00431 
00432     if(body1ID >= 0) {
00433       j0_temp = make_vec3( j2[ index ] );
00434       j1_temp = make_vec3( j3[ index ] );
00435       ij0_temp = make_vec3( ij2[ index ] );
00436       ij1_temp = make_vec3( ij3[ index ] );
00437 
00438       adcfm_i += dot( j0_temp, ij0_temp );
00439       adcfm_i += dot( j1_temp, ij1_temp );
00440     }
00441     adcfm_i = sorParam / (adcfm_i + cfm_i);
00442   }
00443 
00444   {
00445     j0[ index ] *= adcfm_i;
00446     j1[ index ] *= adcfm_i;
00447     j2[ index ] *= adcfm_i;
00448     j3[ index ] *= adcfm_i;
00449     rhs[ index ] *= adcfm_i;
00450     adcfm[ index ] = adcfm_i + cfm_i;
00451   }
00452 
00453 }
00454 
00455 template<typename T>
00456 dxGlobal void
00457 cudaIntegrateT( typename vec4<T>::Type* pos,
00458                 typename vec4<T>::Type* lVel,
00459                 typename vec4<T>::Type* aVel,
00460                 float deltaTime,
00461                 int numConstraints)
00462 {
00463     int index = dxBlockIdx.x * dxBlockDim.x + dxThreadIdx.x;
00464 
00465     if (index >= numConstraints)
00466         return;
00467 
00468     //typename vec4<T>::Type position = pos[index];
00469     //typename vec3<T>::Type accel = computeBodyAccel<T, multithreadBodies>(position, oldPos, numBodies);
00470 }
00471 
00472 #endif