#include <parallel_solver.h>
#include <parallel_common.h>
#include <parallel_utils.h>
#include <parallel_stepper.h>

namespace parallel_ode
{

using ::parallel_utils::fillSequentialVector;
using ::parallel_utils::permuteVector;

// Explicit template instantiations
#if defined(USE_CUDA)
#ifdef CUDA_DOUBLESUPPORT
template class ParallelPGSSolver<dReal,dReal,ParallelTypes::PARALLEL_TYPE>;
#else
template class ParallelPGSSolver<float,dReal,ParallelTypes::PARALLEL_TYPE>;
#endif
#else
template class ParallelPGSSolver<dReal,dReal,ParallelTypes::PARALLEL_TYPE>;
#endif

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::initialize( )
{
  if(batchStrategy_) delete batchStrategy_;
  batchStrategy_ = BatchStrategyFactory::create( batchStrategyType_, numBatches_, alignEnabled(), getDefaultAlign() );

  if(reduceStrategy_) delete reduceStrategy_;
  reduceStrategy_ = ReduceStrategyFactory::create( reduceStrategyType_ );

  if( !bInit_ ) dxInitDevice();

  bInit_ = true;
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::worldSolve( SolverParams* params ) {

  checkInit();
  parallelParams_ = params;

  syncODEToDevice( );

  {
    IFTIMING( ParallelTimer timer( "solving LCP problem - parallel" ) );

    const int numIterations = parallelParams_->qs->num_iterations;
    const int numBatches = getNumBatches();

    IntVector batchOrder( numBatches );
    fillSequentialVector( batchOrder );

    for(int iteration = 0; iteration < numIterations; ++iteration) {

      if( randomizeEnabled( ) ) permuteVector( batchOrder );

      for(int batchCount = 0; batchCount < numBatches; ++batchCount) {

        const int batch = batchOrder[ batchCount ];
        const int offset = batchIndices_[ batch ];
        const int batchSize = batchSizes_[ batch ];

        if( batchSize == 0 ) continue;

        solveAndReduce( offset, batchSize );

      } // batch
    } // iteration
  }

  syncDeviceToODE( );

  parallelParams_ = NULL;
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::preProcessHost( )
{
  dSetZero (parallelParams_->lambda, parallelParams_->m);
  dSetZero (parallelParams_->fc, parallelParams_->nb*6);

  if( !preprocessEnabled( ) )
  {
    compute_invM_JT (parallelParams_->m,
                     parallelParams_->J,
                     parallelParams_->iMJ,
                     parallelParams_->jb,
                     parallelParams_->body,
                     parallelParams_->invI);
    compute_Adcfm_b (parallelParams_->m,
                     parallelParams_->qs->w,
                     parallelParams_->J,
                     parallelParams_->iMJ,
                     parallelParams_->jb,
                     parallelParams_->cfm,
                     parallelParams_->Adcfm,
                     parallelParams_->b);
  }
  else
  {
    parallelParams_->iMJ = NULL;

    //parallelParams_->Adcfm = const_cast<CudaTMutablePtr>(parallelParams_->cfm);
    for ( int i = 0; i < parallelParams_->m; i++)
      parallelParams_->Adcfm[i] = (CudaT)(parallelParams_->cfm[i]);
  }
}

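// Device-side preprocessing hook. In this build the body only scopes a timer;
// when preprocessing is enabled, syncODEToDevice() calls it with the SOR
// relaxation parameter (qs->w) and the integration step size, so a device
// implementation would presumably take over the work otherwise done on the
// host by compute_invM_JT / compute_Adcfm_b.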
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::preProcessDevice( const CudaT sorParam, const CudaT stepSize )
{
  IFTIMING( ParallelTimer timer(" -- preprocess parallel") );
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::syncODEToDevice( )
{
  preProcessHost( );

  // Defer array resizing until we've calculated necessary reduction sizes for the body force accumulator
  setBodySize( parallelParams_->nb, false );
  setConstraintSize( parallelParams_->m, false );
  loadBatches( parallelParams_->jb );
  // With batch sizes computed we can now properly allocate the host/device buffers
  setBodySize( parallelParams_->nb, true );
  setConstraintSize( parallelParams_->m, true );

  loadBodies( );
  loadConstraints( );
  loadConstants( );
  loadKernels( );

  if( preprocessEnabled( ) ) {
    preProcessDevice( parallelParams_->qs->w, parallelParams_->stepsize );
  }

  IFVERBOSE( printConfig( ) );
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::syncDeviceToODE( )
{
  {
    IFTIMING( ParallelTimer timer(" -- sync Device to Host ") );
    CopyType copyType = getCopyType( );
    bodyFAcc.syncToHost( copyType );
    bodyTAcc.syncToHost( copyType );
    lambda0.syncToHost( copyType );
  }

  {
    IFTIMING( ParallelTimer timer(" -- load solution ") );
    loadSolution( );
  }
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadBatches( int* jb )
{
  IFTIMING( ParallelTimer timer(" -- load batches ") );
  int maxRepetitionCount = batchStrategy_->batch(jb,
                                                 getNumConstraints(),
                                                 getNumBodies(),
                                                 constraintIndices_,
                                                 reductionOffsets0_,
                                                 reductionOffsets1_,
                                                 batchRepetitionCount_,
                                                 batchIndices_,
                                                 batchSizes_);

  reduceStrategy_->initialize( getNumBodies( ), min(maxRepetitionCount, ParallelOptions::MAXBODYREPETITION), batchRepetitionCount_ );
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadBodies( )
{
  IFTIMING( ParallelTimer timer(" -- load bodies ") );

  const CopyType copyType = getCopyType( );

  Vec4T *bodyFAccP = bodyFAcc.getHostBuffer();
  Vec4T *bodyTAccP = bodyTAcc.getHostBuffer();
  Vec4T zeroVec = make_vec4( (CudaT)0.0 );

  for(size_t bodyID = 0; bodyID < bodyFAcc.getSize(); ++bodyID )
  {
    bodyFAccP[ bodyID ] = zeroVec;
    bodyTAccP[ bodyID ] = zeroVec;
  }
  bodyFAcc.syncToDevice( copyType );
  bodyTAcc.syncToDevice( copyType );

  if( preprocessEnabled( ) )
  {
    CudaT *iMassP = iMass.getHostBuffer();
    const ParamsT *invIP = parallelParams_->invI;
    Vec4T *iP = i0.getHostBuffer();

    for(int bodyID = 0; bodyID < getNumBodies(); ++bodyID, invIP += 12)
    {
      iMassP[ bodyID ] = parallelParams_->body[ bodyID ]->invMass;
      iP[ bID(bodyID,0) ] = make_vec4( (CudaT)invIP[0], (CudaT)invIP[1], (CudaT)invIP[2]);
      iP[ bID(bodyID,1) ] = make_vec4( (CudaT)invIP[4], (CudaT)invIP[5], (CudaT)invIP[6]);
      iP[ bID(bodyID,2) ] = make_vec4( (CudaT)invIP[8], (CudaT)invIP[9], (CudaT)invIP[10]);
    }
    iMass.syncToDevice( copyType );
    i0.syncToDevice( copyType );
  }
}

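// Packs per-constraint data into the batched, device-friendly layout: each
// constraint (visited in constraintIndices_ order) contributes its body index
// pair plus reduction slots, friction row index, right-hand side, Adcfm, lo/hi
// bounds, and its 12 Jacobian entries (packed as four 4-vectors) at the
// batch-relative index dIndex (cID() applies the constraint stride), after
// which the host buffers are uploaded to the device.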
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadConstraints( )
{
  IFTIMING( ParallelTimer timer(" -- load constraints ") );
  const CopyType copyType = getCopyType( );

  CudaT *adcfmP = adcfm.getHostBuffer();
  CudaT *lambdaP = lambda0.getHostBuffer();
  CudaT *rhsP = rhs.getHostBuffer();
  CudaT *lohiP = lohiD.getHostBuffer();
  int4 *bodyIDsP = bodyIDs.getHostBuffer();
  int *fIDP = fIDs.getHostBuffer();
  Vec4T *jP = j0.getHostBuffer();

  for(size_t batchID = 0, hIndex = 0; batchID < batchSizes_.size(); ++batchID)
  {
    size_t dIndex = batchIndices_[ batchID ];
    for(int batchCount = 0; batchCount < batchSizes_[ batchID ]; ++batchCount,++hIndex,++dIndex)
    {
      const int scalarIndex = constraintIndices_[hIndex];
      const int vecIndex = scalarIndex * 12;

      const int body0ID = parallelParams_->jb[ scalarIndex*2 ];
      const int body1ID = parallelParams_->jb[ scalarIndex*2+1 ];

      const int body0ReductionID = reduceStrategy_->getFinalIndex( body0ID, (reductionOffsets0_[ hIndex ] % ParallelOptions::MAXBODYREPETITION) );
      const int body1ReductionID = body1ID < 0 ? -1 : reduceStrategy_->getFinalIndex( body1ID, (reductionOffsets1_[ hIndex ] % ParallelOptions::MAXBODYREPETITION) );

      bodyIDsP[dIndex] = make_int4( body0ID, body1ID, body0ReductionID, body1ReductionID );

      fIDP[dIndex] = parallelParams_->findex[ scalarIndex ];
      rhsP[dIndex] = parallelParams_->b[ scalarIndex ];
      adcfmP[dIndex] = parallelParams_->Adcfm[ scalarIndex ];

      lohiP[ dIndex ] = parallelParams_->lo[ scalarIndex ];
      lohiP[ cID(dIndex,1) ] = parallelParams_->hi[ scalarIndex ];

      jP[ dIndex ] = make_vec4( (CudaT)parallelParams_->J[vecIndex], (CudaT)parallelParams_->J[vecIndex+1], (CudaT)parallelParams_->J[vecIndex+2] );
      jP[ cID(dIndex,1) ] = make_vec4( (CudaT)parallelParams_->J[vecIndex+3], (CudaT)parallelParams_->J[vecIndex+4], (CudaT)parallelParams_->J[vecIndex+5] );
      jP[ cID(dIndex,2) ] = make_vec4( (CudaT)parallelParams_->J[vecIndex+6], (CudaT)parallelParams_->J[vecIndex+7], (CudaT)parallelParams_->J[vecIndex+8] );
      jP[ cID(dIndex,3) ] = make_vec4( (CudaT)parallelParams_->J[vecIndex+9], (CudaT)parallelParams_->J[vecIndex+10], (CudaT)parallelParams_->J[vecIndex+11] );

      lambdaP[dIndex] = 0.0;
    }
  }

  adcfm.syncToDevice( copyType );
  lambda0.syncToDevice( copyType );
  rhs.syncToDevice( copyType );
  lohiD.syncToDevice( copyType );
  fIDs.syncToDevice( copyType );
  j0.syncToDevice( copyType );
  bodyIDs.syncToDevice( copyType );

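  // When device preprocessing is disabled, the host has already computed
  // inv(M)*J^T (parallelParams_->iMJ) in preProcessHost(); copy those rows
  // into the same batched layout and upload them as well.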
  if( !preprocessEnabled( ) )
  {
    Vec4T* imjP = ij0.getHostBuffer();
    ParamsTPtr iMJ = parallelParams_->iMJ;
    for(size_t batchID = 0, hIndex = 0; batchID < batchSizes_.size(); ++batchID)
    {
      size_t dIndex = batchIndices_[ batchID ];
      for(int batchCount = 0; batchCount < batchSizes_[ batchID ]; ++batchCount,++hIndex,++dIndex)
      {
        const int vecIndex = constraintIndices_[hIndex] * 12;

        imjP[ dIndex ] = make_vec4( (CudaT)iMJ[vecIndex], (CudaT)iMJ[vecIndex+1], (CudaT)iMJ[vecIndex+2] );
        imjP[ cID(dIndex,1) ] = make_vec4( (CudaT)iMJ[vecIndex+3], (CudaT)iMJ[vecIndex+4], (CudaT)iMJ[vecIndex+5] );
        imjP[ cID(dIndex,2) ] = make_vec4( (CudaT)iMJ[vecIndex+6], (CudaT)iMJ[vecIndex+7], (CudaT)iMJ[vecIndex+8] );
        imjP[ cID(dIndex,3) ] = make_vec4( (CudaT)iMJ[vecIndex+9], (CudaT)iMJ[vecIndex+10], (CudaT)iMJ[vecIndex+11] );
      }
    }

    ij0.syncToDevice( copyType );
  }
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadConstants( )
{

}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadKernels( )
{

}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadSolution( )
{
  IFVERBOSE( lambda0.print("LAMBDA") );
  IFVERBOSE( bodyFAcc.print("BODYFACC") );
  IFVERBOSE( bodyTAcc.print("BODYTACC") );

  dxGlobalSync();

  CudaT *lambdaP = lambda0.getHostBuffer();
  Vec4T *fcP0 = bodyFAcc.getHostBuffer();
  Vec4T *fcP1 = bodyTAcc.getHostBuffer();

  for(size_t batchID = 0, hIndex = 0; batchID < batchSizes_.size(); ++batchID)
  {
    size_t dIndex = batchIndices_[ batchID ];
    for(int batchCount = 0; batchCount < batchSizes_[ batchID ]; ++batchCount,++hIndex,++dIndex)
    {
      parallelParams_->lambda[ constraintIndices_[ hIndex ] ] = lambdaP[ dIndex ];
    }
  }

  for(int bodyID = 0, fID = 0; bodyID < getNumBodies(); ++bodyID, fID+=6)
  {
    parallelParams_->fc[ fID ]     = fcP0[ bodyID ].x;
    parallelParams_->fc[ fID + 1 ] = fcP0[ bodyID ].y;
    parallelParams_->fc[ fID + 2 ] = fcP0[ bodyID ].z;
    parallelParams_->fc[ fID + 3 ] = fcP1[ bodyID ].x;
    parallelParams_->fc[ fID + 4 ] = fcP1[ bodyID ].y;
    parallelParams_->fc[ fID + 5 ] = fcP1[ bodyID ].z;
  }
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::setBodySize( int n, bool bResizeArrays )
{
  IFTIMING( ParallelTimer timer(" -- setting body size ") );
  n_ = n;

  if( bResizeArrays )
  {
    setBodyStride( alignDefaultSize( n_ ) );
    setReduceStride( alignDefaultSize( reduceStrategy_->getBodySizeWithReduction( ) ) );

    if( preprocessEnabled( ) )
    {
      iMass.setSize( n );
      i0.setSize( getBodyStride() * 3 );
    }

    bodyFAcc.setSize( getBodyStride() );
    bodyTAcc.setSize( getBodyStride() );

    if( reduceEnabled( ) ) {
      bodyFAccReduction.setSize( getReduceStride() );
      bodyTAccReduction.setSize( getReduceStride() );
    }
  }
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::setConstraintSize( int m, bool bResizeArrays )
{
  IFTIMING( ParallelTimer timer(" -- setting constraint size ") );
  m_ = m;

  constraintIndices_.resize(m);
  reductionOffsets0_.resize(m);
  reductionOffsets1_.resize(m);

  if( bResizeArrays )
  {
    setConstraintStride( parallel_utils::alignedSize( batchSizes_ ) );

    if( alignEnabled( ) ) {
      m = getConstraintStride( );
    }

    j0.setSize( m * 4 );
    ij0.setSize( m * 4 );

    if( compactScalarsEnabled( ) ) {
      adcfm.setSize( m * 4 );
    } else {
      adcfm.setSize( m );
      rhs.setSize( m );
      lohiD.setSize( m * 2 );
    }

    lambda0.setSize( m );
    bodyIDs.setSize( m );
    fIDs.setSize( m );
  }
}

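// Propagates the given memory-allocation flags to every host/device buffer
// owned by the solver so all buffers share one allocation policy (for
// example page-locked host memory); the exact meaning of MemFlags is defined
// by the underlying buffer implementation.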
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::setMemFlags( MemFlags flags )
{
  j0.setFlags( flags );
  ij0.setFlags( flags );
  lambda0.setFlags( flags );
  adcfm.setFlags( flags );
  rhs.setFlags( flags );
  lohiD.setFlags( flags );
  bodyIDs.setFlags( flags );
  fIDs.setFlags( flags );
  iMass.setFlags( flags );
  i0.setFlags( flags );
  bodyFAcc.setFlags( flags );
  bodyTAcc.setFlags( flags );
  bodyFAccReduction.setFlags( flags );
  bodyTAccReduction.setFlags( flags );
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::printConfig( )
{
  if( getNumBodies() <= 0 || getNumConstraints() <= 0 ) return;

  IFVERBOSE( lambda0.print("j0") );
  IFVERBOSE( bodyFAcc.print("BODYFACC") );
  IFVERBOSE( bodyTAcc.print("BODYTACC") );

  int maxRepetitionCount = 0;
  for( ArrayIndex i = 0; i < batchRepetitionCount_.size(); ++i ) {
    maxRepetitionCount = max(maxRepetitionCount, batchRepetitionCount_[ i ]);
  }

  printf(" -- (CSize,BSize,AvgCPerB,MaxBRepetition)=(%d,%d,%f,%d) --\n",
         getNumConstraints(),
         getNumBodies(),
         (CudaT)getNumConstraints() / (CudaT)getNumBodies(),
         maxRepetitionCount);
}

}