#include <parallel_solver.h>
#include <parallel_common.h>
#include <parallel_utils.h>
#include <parallel_stepper.h>

namespace parallel_ode
{

using ::parallel_utils::fillSequentialVector;
using ::parallel_utils::permuteVector;

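// Explicit instantiations. On CUDA builds without native double support the
// solver computes in float while still exchanging dReal data with ODE;
// otherwise both the device and parameter types are dReal.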
#if defined(USE_CUDA)
#ifdef CUDA_DOUBLESUPPORT
template class ParallelPGSSolver<dReal,dReal,ParallelTypes::PARALLEL_TYPE>;
#else
template class ParallelPGSSolver<float,dReal,ParallelTypes::PARALLEL_TYPE>;
#endif
#else
template class ParallelPGSSolver<dReal,dReal,ParallelTypes::PARALLEL_TYPE>;
#endif

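// Creates the batching and reduction strategies and initializes the device on
// first use. Safe to call repeatedly; any existing strategies are replaced.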
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::initialize( )
{
  if( batchStrategy_ ) delete batchStrategy_;
  batchStrategy_ = BatchStrategyFactory::create( batchStrategyType_, numBatches_, alignEnabled(), getDefaultAlign() );

  if( reduceStrategy_ ) delete reduceStrategy_;
  reduceStrategy_ = ReduceStrategyFactory::create( reduceStrategyType_ );

  if( !bInit_ ) dxInitDevice();

  bInit_ = true;
}

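// Top-level solve entry point: copies the ODE problem to the device, runs the
// projected Gauss-Seidel iterations batch by batch (optionally permuting the
// batch order each sweep), then copies the solution back to ODE.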
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::worldSolve( SolverParams* params )
{
  checkInit();
  parallelParams_ = params;

  syncODEToDevice( );

  {
    IFTIMING( ParallelTimer timer( "solving LCP problem - parallel" ) );

    const int numIterations = parallelParams_->qs->num_iterations;
    const int numBatches = getNumBatches();

    IntVector batchOrder( numBatches );
    fillSequentialVector( batchOrder );

    for( int iteration = 0; iteration < numIterations; ++iteration )
    {
      if( randomizeEnabled( ) ) permuteVector( batchOrder );

      for( int batchCount = 0; batchCount < numBatches; ++batchCount )
      {
        const int batch = batchOrder[ batchCount ];
        const int offset = batchIndices_[ batch ];
        const int batchSize = batchSizes_[ batch ];

        if( batchSize == 0 ) continue;

        solveAndReduce( offset, batchSize );
      }
    }
  }

  syncDeviceToODE( );

  parallelParams_ = NULL;
}

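// Host-side preprocessing. Zeroes the solution vectors, then either computes
// invM*J^T and the preconditioned right-hand side on the host, or, when
// device-side preprocessing is enabled, leaves that work to preProcessDevice
// and only seeds Adcfm with the raw CFM values.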
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::preProcessHost( )
{
  dSetZero( parallelParams_->lambda, parallelParams_->m );
  dSetZero( parallelParams_->fc, parallelParams_->nb * 6 );

  if( !preprocessEnabled( ) )
  {
    compute_invM_JT( parallelParams_->m,
                     parallelParams_->J,
                     parallelParams_->iMJ,
                     parallelParams_->jb,
                     parallelParams_->body,
                     parallelParams_->invI );
    compute_Adcfm_b( parallelParams_->m,
                     parallelParams_->qs->w,
                     parallelParams_->J,
                     parallelParams_->iMJ,
                     parallelParams_->jb,
                     parallelParams_->cfm,
                     parallelParams_->Adcfm,
                     parallelParams_->b );
  }
  else
  {
    parallelParams_->iMJ = NULL;

    for( int i = 0; i < parallelParams_->m; ++i )
      parallelParams_->Adcfm[i] = (CudaT)parallelParams_->cfm[i];
  }
}

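// Device-side preprocessing stub: currently it only instruments timing, and
// the sorParam and stepSize arguments are unused here.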
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::preProcessDevice( const CudaT sorParam, const CudaT stepSize )
{
  IFTIMING( ParallelTimer timer( " -- preprocess parallel" ) );
}

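// Pushes the full problem state to the device. The sizes are set twice on
// purpose: first without array allocation so batching can run against the raw
// counts, then with allocation once the batch strides are known.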
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::syncODEToDevice( )
{
  preProcessHost( );

  setBodySize( parallelParams_->nb, false );
  setConstraintSize( parallelParams_->m, false );
  loadBatches( parallelParams_->jb );

  setBodySize( parallelParams_->nb, true );
  setConstraintSize( parallelParams_->m, true );

  loadBodies( );
  loadConstraints( );
  loadConstants( );
  loadKernels( );

  if( preprocessEnabled( ) )
  {
    preProcessDevice( parallelParams_->qs->w, parallelParams_->stepsize );
  }

  IFVERBOSE( printConfig( ) );
}

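// Pulls the accumulated body forces/torques and constraint multipliers back
// from the device and scatters them into the ODE-side buffers.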
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::syncDeviceToODE( )
{
  {
    IFTIMING( ParallelTimer timer( " -- sync Device to Host " ) );
    CopyType copyType = getCopyType( );
    bodyFAcc.syncToHost( copyType );
    bodyTAcc.syncToHost( copyType );
    lambda0.syncToHost( copyType );
  }

  {
    IFTIMING( ParallelTimer timer( " -- load solution " ) );
    loadSolution( );
  }
}

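// Partitions the constraints into independent batches via the configured
// strategy, then sizes the reduction buffers from the worst-case number of
// times any body repeats within a batch (capped at MAXBODYREPETITION).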
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadBatches( int* jb )
{
  IFTIMING( ParallelTimer timer( " -- load batches " ) );
  int maxRepetitionCount = batchStrategy_->batch( jb,
                                                  getNumConstraints(),
                                                  getNumBodies(),
                                                  constraintIndices_,
                                                  reductionOffsets0_,
                                                  reductionOffsets1_,
                                                  batchRepetitionCount_,
                                                  batchIndices_,
                                                  batchSizes_ );

  reduceStrategy_->initialize( getNumBodies( ), min( maxRepetitionCount, ParallelOptions::MAXBODYREPETITION ), batchRepetitionCount_ );
}

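// Clears the per-body force/torque accumulators on the device and, when
// device preprocessing is enabled, uploads the inverse masses and inverse
// inertia tensors (three vec4 rows per body).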
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadBodies( )
{
  IFTIMING( ParallelTimer timer( " -- load bodies " ) );

  const CopyType copyType = getCopyType( );

  Vec4T *bodyFAccP = bodyFAcc.getHostBuffer();
  Vec4T *bodyTAccP = bodyTAcc.getHostBuffer();
  Vec4T zeroVec = make_vec4( (CudaT)0.0 );

  for( size_t bodyID = 0; bodyID < bodyFAcc.getSize(); ++bodyID )
  {
    bodyFAccP[ bodyID ] = zeroVec;
    bodyTAccP[ bodyID ] = zeroVec;
  }
  bodyFAcc.syncToDevice( copyType );
  bodyTAcc.syncToDevice( copyType );

  if( preprocessEnabled( ) )
  {
    CudaT *iMassP = iMass.getHostBuffer();
    const ParamsT *invIP = parallelParams_->invI;
    Vec4T *iP = i0.getHostBuffer();

    for( int bodyID = 0; bodyID < getNumBodies(); ++bodyID, invIP += 12 )
    {
      iMassP[ bodyID ] = parallelParams_->body[ bodyID ]->invMass;
      iP[ bID(bodyID,0) ] = make_vec4( (CudaT)invIP[0], (CudaT)invIP[1], (CudaT)invIP[2] );
      iP[ bID(bodyID,1) ] = make_vec4( (CudaT)invIP[4], (CudaT)invIP[5], (CudaT)invIP[6] );
      iP[ bID(bodyID,2) ] = make_vec4( (CudaT)invIP[8], (CudaT)invIP[9], (CudaT)invIP[10] );
    }
    iMass.syncToDevice( copyType );
    i0.syncToDevice( copyType );
  }
}

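// Gathers the constraint data (Jacobian rows, bounds, right-hand side,
// friction indices, body pairs) into the batch-ordered device layout. Each
// constraint also records the reduction slot its two bodies write to, wrapped
// at MAXBODYREPETITION; a negative second body denotes a one-body constraint.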
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadConstraints( )
{
  IFTIMING( ParallelTimer timer( " -- load constraints " ) );
  const CopyType copyType = getCopyType( );

  CudaT *adcfmP = adcfm.getHostBuffer();
  CudaT *lambdaP = lambda0.getHostBuffer();
  CudaT *rhsP = rhs.getHostBuffer();
  CudaT *lohiP = lohiD.getHostBuffer();
  int4 *bodyIDsP = bodyIDs.getHostBuffer();
  int *fIDP = fIDs.getHostBuffer();
  Vec4T *jP = j0.getHostBuffer();

  for( size_t batchID = 0, hIndex = 0; batchID < batchSizes_.size(); ++batchID )
  {
    size_t dIndex = batchIndices_[ batchID ];
    for( int batchCount = 0; batchCount < batchSizes_[ batchID ]; ++batchCount, ++hIndex, ++dIndex )
    {
      const int scalarIndex = constraintIndices_[ hIndex ];
      const int vecIndex = scalarIndex * 12;

      const int body0ID = parallelParams_->jb[ scalarIndex*2 ];
      const int body1ID = parallelParams_->jb[ scalarIndex*2+1 ];

      const int body0ReductionID = reduceStrategy_->getFinalIndex( body0ID, (reductionOffsets0_[ hIndex ] % ParallelOptions::MAXBODYREPETITION) );
      const int body1ReductionID = body1ID < 0 ? -1 : reduceStrategy_->getFinalIndex( body1ID, (reductionOffsets1_[ hIndex ] % ParallelOptions::MAXBODYREPETITION) );

      bodyIDsP[ dIndex ] = make_int4( body0ID, body1ID, body0ReductionID, body1ReductionID );

      fIDP[ dIndex ] = parallelParams_->findex[ scalarIndex ];
      rhsP[ dIndex ] = parallelParams_->b[ scalarIndex ];
      adcfmP[ dIndex ] = parallelParams_->Adcfm[ scalarIndex ];

      lohiP[ dIndex ] = parallelParams_->lo[ scalarIndex ];
      lohiP[ cID(dIndex,1) ] = parallelParams_->hi[ scalarIndex ];

      jP[ dIndex ] = make_vec4( (CudaT)parallelParams_->J[vecIndex], (CudaT)parallelParams_->J[vecIndex+1], (CudaT)parallelParams_->J[vecIndex+2] );
      jP[ cID(dIndex,1) ] = make_vec4( (CudaT)parallelParams_->J[vecIndex+3], (CudaT)parallelParams_->J[vecIndex+4], (CudaT)parallelParams_->J[vecIndex+5] );
      jP[ cID(dIndex,2) ] = make_vec4( (CudaT)parallelParams_->J[vecIndex+6], (CudaT)parallelParams_->J[vecIndex+7], (CudaT)parallelParams_->J[vecIndex+8] );
      jP[ cID(dIndex,3) ] = make_vec4( (CudaT)parallelParams_->J[vecIndex+9], (CudaT)parallelParams_->J[vecIndex+10], (CudaT)parallelParams_->J[vecIndex+11] );

      lambdaP[ dIndex ] = (CudaT)0.0;
    }
  }

  adcfm.syncToDevice( copyType );
  lambda0.syncToDevice( copyType );
  rhs.syncToDevice( copyType );
  lohiD.syncToDevice( copyType );
  j0.syncToDevice( copyType );
  bodyIDs.syncToDevice( copyType );
  fIDs.syncToDevice( copyType );

  // The precomputed invM*J^T rows exist only when host-side preprocessing
  // produced them, so they are uploaded only in that case.
  if( !preprocessEnabled( ) )
  {
    Vec4T* imjP = ij0.getHostBuffer();
    ParamsTPtr iMJ = parallelParams_->iMJ;
    for( size_t batchID = 0, hIndex = 0; batchID < batchSizes_.size(); ++batchID )
    {
      size_t dIndex = batchIndices_[ batchID ];
      for( int batchCount = 0; batchCount < batchSizes_[ batchID ]; ++batchCount, ++hIndex, ++dIndex )
      {
        const int vecIndex = constraintIndices_[ hIndex ] * 12;

        imjP[ dIndex ] = make_vec4( (CudaT)iMJ[vecIndex], (CudaT)iMJ[vecIndex+1], (CudaT)iMJ[vecIndex+2] );
        imjP[ cID(dIndex,1) ] = make_vec4( (CudaT)iMJ[vecIndex+3], (CudaT)iMJ[vecIndex+4], (CudaT)iMJ[vecIndex+5] );
        imjP[ cID(dIndex,2) ] = make_vec4( (CudaT)iMJ[vecIndex+6], (CudaT)iMJ[vecIndex+7], (CudaT)iMJ[vecIndex+8] );
        imjP[ cID(dIndex,3) ] = make_vec4( (CudaT)iMJ[vecIndex+9], (CudaT)iMJ[vecIndex+10], (CudaT)iMJ[vecIndex+11] );
      }
    }

    ij0.syncToDevice( copyType );
  }
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadConstants( )
{
  // No-op in this implementation.
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadKernels( )
{
  // No-op in this implementation.
}

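// Scatters the device solution back into ODE's arrays: lambda is un-permuted
// through constraintIndices_, and the force/torque accumulators are unpacked
// into the interleaved 6-per-body fc layout.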
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadSolution( )
{
  IFVERBOSE( lambda0.print("LAMBDA") );
  IFVERBOSE( bodyFAcc.print("BODYFACC") );
  IFVERBOSE( bodyTAcc.print("BODYTACC") );

  dxGlobalSync();

  CudaT *lambdaP = lambda0.getHostBuffer();
  Vec4T *fcP0 = bodyFAcc.getHostBuffer();
  Vec4T *fcP1 = bodyTAcc.getHostBuffer();

  for( size_t batchID = 0, hIndex = 0; batchID < batchSizes_.size(); ++batchID )
  {
    size_t dIndex = batchIndices_[ batchID ];
    for( int batchCount = 0; batchCount < batchSizes_[ batchID ]; ++batchCount, ++hIndex, ++dIndex )
    {
      parallelParams_->lambda[ constraintIndices_[ hIndex ] ] = lambdaP[ dIndex ];
    }
  }

  for( int bodyID = 0, fID = 0; bodyID < getNumBodies(); ++bodyID, fID += 6 )
  {
    parallelParams_->fc[ fID ]     = fcP0[ bodyID ].x;
    parallelParams_->fc[ fID + 1 ] = fcP0[ bodyID ].y;
    parallelParams_->fc[ fID + 2 ] = fcP0[ bodyID ].z;
    parallelParams_->fc[ fID + 3 ] = fcP1[ bodyID ].x;
    parallelParams_->fc[ fID + 4 ] = fcP1[ bodyID ].y;
    parallelParams_->fc[ fID + 5 ] = fcP1[ bodyID ].z;
  }
}

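// Records the body count and, when requested, (re)allocates the body-indexed
// device arrays using aligned strides.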
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::setBodySize( int n, bool bResizeArrays )
{
  IFTIMING( ParallelTimer timer( " -- setting body size " ) );
  n_ = n;

  if( bResizeArrays )
  {
    setBodyStride( alignDefaultSize( n_ ) );
    setReduceStride( alignDefaultSize( reduceStrategy_->getBodySizeWithReduction( ) ) );

    if( preprocessEnabled( ) )
    {
      iMass.setSize( n );
      i0.setSize( getBodyStride() * 3 );
    }

    bodyFAcc.setSize( getBodyStride() );
    bodyTAcc.setSize( getBodyStride() );

    if( reduceEnabled( ) )
    {
      bodyFAccReduction.setSize( getReduceStride() );
      bodyTAccReduction.setSize( getReduceStride() );
    }
  }
}

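// Records the constraint count and, when requested, (re)allocates the
// constraint-indexed device arrays; with alignment enabled they are padded
// out to the aligned batch stride.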
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::setConstraintSize( int m, bool bResizeArrays )
{
  IFTIMING( ParallelTimer timer( " -- setting constraint size " ) );
  m_ = m;

  constraintIndices_.resize( m );
  reductionOffsets0_.resize( m );
  reductionOffsets1_.resize( m );

  if( bResizeArrays )
  {
    setConstraintStride( parallel_utils::alignedSize( batchSizes_ ) );

    if( alignEnabled( ) )
    {
      m = getConstraintStride( );
    }

    j0.setSize( m * 4 );
    ij0.setSize( m * 4 );

    if( compactScalarsEnabled( ) )
    {
      adcfm.setSize( m * 4 );
    }
    else
    {
      adcfm.setSize( m );
      rhs.setSize( m );
      lohiD.setSize( m * 2 );
    }

    lambda0.setSize( m );
    bodyIDs.setSize( m );
    fIDs.setSize( m );
  }
}

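// Propagates the given memory flags to every device buffer owned by the
// solver in one place.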
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::setMemFlags( MemFlags flags )
{
  j0.setFlags( flags );
  ij0.setFlags( flags );
  lambda0.setFlags( flags );
  adcfm.setFlags( flags );
  rhs.setFlags( flags );
  lohiD.setFlags( flags );
  bodyIDs.setFlags( flags );
  fIDs.setFlags( flags );
  iMass.setFlags( flags );
  i0.setFlags( flags );
  bodyFAcc.setFlags( flags );
  bodyTAcc.setFlags( flags );
  bodyFAccReduction.setFlags( flags );
  bodyTAccReduction.setFlags( flags );
}

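// Debug dump of the problem dimensions: constraint and body counts, their
// ratio, and the worst-case body repetition across batches.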
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::printConfig( )
{
  if( getNumBodies() <= 0 || getNumConstraints() <= 0 ) return;

  IFVERBOSE( lambda0.print("LAMBDA0") );
  IFVERBOSE( bodyFAcc.print("BODYFACC") );
  IFVERBOSE( bodyTAcc.print("BODYTACC") );

  int maxRepetitionCount = 0;
  for( ArrayIndex i = 0; i < batchRepetitionCount_.size(); ++i )
  {
    maxRepetitionCount = max( maxRepetitionCount, batchRepetitionCount_[ i ] );
  }

  printf( " -- (CSize,BSize,AvgCPerB,MaxBRepetition)=(%d,%d,%f,%d) --\n",
          getNumConstraints(),
          getNumBodies(),
          (CudaT)getNumConstraints() / (CudaT)getNumBodies(),
          maxRepetitionCount );
}

}  // namespace parallel_ode