#include <parallel_solver.h>
#include <parallel_common.h>
#include <parallel_utils.h>
#include <parallel_stepper.h>

namespace parallel_ode
{

using ::parallel_utils::fillSequentialVector;
using ::parallel_utils::permuteVector;

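// Explicit instantiations. On CUDA builds without native double support the
// solver computes in float while still exchanging dReal data with ODE;
// otherwise both the device and parameter types are dReal.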
#if defined(USE_CUDA)
#ifdef CUDA_DOUBLESUPPORT
template class ParallelPGSSolver<dReal,dReal,ParallelTypes::PARALLEL_TYPE>;
#else
template class ParallelPGSSolver<float,dReal,ParallelTypes::PARALLEL_TYPE>;
#endif
#else
template class ParallelPGSSolver<dReal,dReal,ParallelTypes::PARALLEL_TYPE>;
#endif

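// Creates the batching and reduction strategies and initializes the device on
// first use. Safe to call repeatedly; any existing strategies are replaced.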
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::initialize( )
{
  if( batchStrategy_ ) delete batchStrategy_;
  batchStrategy_ = BatchStrategyFactory::create( batchStrategyType_, numBatches_, alignEnabled(), getDefaultAlign() );

  if( reduceStrategy_ ) delete reduceStrategy_;
  reduceStrategy_ = ReduceStrategyFactory::create( reduceStrategyType_ );

  if( !bInit_ ) dxInitDevice();

  bInit_ = true;
}

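// Top-level solve entry point: copies the ODE problem to the device, runs the
// projected Gauss-Seidel iterations batch by batch (optionally permuting the
// batch order each sweep), then copies the solution back to ODE.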
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::worldSolve( SolverParams* params )
{
  checkInit();
  parallelParams_ = params;

  syncODEToDevice( );

  {
    IFTIMING( ParallelTimer timer( "solving LCP problem - parallel" ) );

    const int numIterations = parallelParams_->qs->num_iterations;
    const int numBatches = getNumBatches();

    IntVector batchOrder( numBatches );
    fillSequentialVector( batchOrder );

    for( int iteration = 0; iteration < numIterations; ++iteration )
    {
      if( randomizeEnabled( ) ) permuteVector( batchOrder );

      for( int batchCount = 0; batchCount < numBatches; ++batchCount )
      {
        const int batch = batchOrder[ batchCount ];
        const int offset = batchIndices_[ batch ];
        const int batchSize = batchSizes_[ batch ];

        if( batchSize == 0 ) continue;

        solveAndReduce( offset, batchSize );
      }
    }
  }

  syncDeviceToODE( );

  parallelParams_ = NULL;
}

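// Host-side preprocessing. Zeroes the solution vectors, then either computes
// invM*J^T and the preconditioned right-hand side on the host, or, when
// device-side preprocessing is enabled, leaves that work to preProcessDevice
// and only seeds Adcfm with the raw CFM values.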
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::preProcessHost( )
{
  dSetZero( parallelParams_->lambda, parallelParams_->m );
  dSetZero( parallelParams_->fc, parallelParams_->nb * 6 );

  if( !preprocessEnabled( ) )
  {
    compute_invM_JT( parallelParams_->m,
                     parallelParams_->J,
                     parallelParams_->iMJ,
                     parallelParams_->jb,
                     parallelParams_->body,
                     parallelParams_->invI );
    compute_Adcfm_b( parallelParams_->m,
                     parallelParams_->qs->w,
                     parallelParams_->J,
                     parallelParams_->iMJ,
                     parallelParams_->jb,
                     parallelParams_->cfm,
                     parallelParams_->Adcfm,
                     parallelParams_->b );
  }
  else
  {
    parallelParams_->iMJ = NULL;

    for( int i = 0; i < parallelParams_->m; ++i )
      parallelParams_->Adcfm[i] = (CudaT)parallelParams_->cfm[i];
  }
}

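// Device-side preprocessing stub: currently it only instruments timing, and
// the sorParam and stepSize arguments are unused here.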
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::preProcessDevice( const CudaT sorParam, const CudaT stepSize )
{
  IFTIMING( ParallelTimer timer( " -- preprocess parallel" ) );
}

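// Pushes the full problem state to the device. The sizes are set twice on
// purpose: first without array allocation so batching can run against the raw
// counts, then with allocation once the batch strides are known.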
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::syncODEToDevice( )
{
  preProcessHost( );

  setBodySize( parallelParams_->nb, false );
  setConstraintSize( parallelParams_->m, false );
  loadBatches( parallelParams_->jb );

  setBodySize( parallelParams_->nb, true );
  setConstraintSize( parallelParams_->m, true );

  loadBodies( );
  loadConstraints( );
  loadConstants( );
  loadKernels( );

  if( preprocessEnabled( ) )
  {
    preProcessDevice( parallelParams_->qs->w, parallelParams_->stepsize );
  }

  IFVERBOSE( printConfig( ) );
}

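// Pulls the accumulated body forces/torques and constraint multipliers back
// from the device and scatters them into the ODE-side buffers.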
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::syncDeviceToODE( )
{
  {
    IFTIMING( ParallelTimer timer( " -- sync Device to Host " ) );
    CopyType copyType = getCopyType( );
    bodyFAcc.syncToHost( copyType );
    bodyTAcc.syncToHost( copyType );
    lambda0.syncToHost( copyType );
  }

  {
    IFTIMING( ParallelTimer timer( " -- load solution " ) );
    loadSolution( );
  }
}

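// Partitions the constraints into independent batches via the configured
// strategy, then sizes the reduction buffers from the worst-case number of
// times any body repeats within a batch (capped at MAXBODYREPETITION).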
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadBatches( int* jb )
{
  IFTIMING( ParallelTimer timer( " -- load batches " ) );
  int maxRepetitionCount = batchStrategy_->batch( jb,
                                                  getNumConstraints(),
                                                  getNumBodies(),
                                                  constraintIndices_,
                                                  reductionOffsets0_,
                                                  reductionOffsets1_,
                                                  batchRepetitionCount_,
                                                  batchIndices_,
                                                  batchSizes_ );

  reduceStrategy_->initialize( getNumBodies( ), min( maxRepetitionCount, ParallelOptions::MAXBODYREPETITION ), batchRepetitionCount_ );
}

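// Clears the per-body force/torque accumulators on the device and, when
// device preprocessing is enabled, uploads the inverse masses and inverse
// inertia tensors (three vec4 rows per body).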
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadBodies( )
{
  IFTIMING( ParallelTimer timer( " -- load bodies " ) );

  const CopyType copyType = getCopyType( );

  Vec4T *bodyFAccP = bodyFAcc.getHostBuffer();
  Vec4T *bodyTAccP = bodyTAcc.getHostBuffer();
  Vec4T zeroVec = make_vec4( (CudaT)0.0 );

  for( size_t bodyID = 0; bodyID < bodyFAcc.getSize(); ++bodyID )
  {
    bodyFAccP[ bodyID ] = zeroVec;
    bodyTAccP[ bodyID ] = zeroVec;
  }
  bodyFAcc.syncToDevice( copyType );
  bodyTAcc.syncToDevice( copyType );

  if( preprocessEnabled( ) )
  {
    CudaT *iMassP = iMass.getHostBuffer();
    const ParamsT *invIP = parallelParams_->invI;
    Vec4T *iP = i0.getHostBuffer();

    for( int bodyID = 0; bodyID < getNumBodies(); ++bodyID, invIP += 12 )
    {
      iMassP[ bodyID ] = parallelParams_->body[ bodyID ]->invMass;
      iP[ bID(bodyID,0) ] = make_vec4( (CudaT)invIP[0], (CudaT)invIP[1], (CudaT)invIP[2] );
      iP[ bID(bodyID,1) ] = make_vec4( (CudaT)invIP[4], (CudaT)invIP[5], (CudaT)invIP[6] );
      iP[ bID(bodyID,2) ] = make_vec4( (CudaT)invIP[8], (CudaT)invIP[9], (CudaT)invIP[10] );
    }
    iMass.syncToDevice( copyType );
    i0.syncToDevice( copyType );
  }
}

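// Gathers the constraint data (Jacobian rows, bounds, right-hand side,
// friction indices, body pairs) into the batch-ordered device layout. Each
// constraint also records the reduction slot its two bodies write to, wrapped
// at MAXBODYREPETITION; a negative second body denotes a one-body constraint.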
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadConstraints( )
{
  IFTIMING( ParallelTimer timer( " -- load constraints " ) );
  const CopyType copyType = getCopyType( );

  CudaT *adcfmP = adcfm.getHostBuffer();
  CudaT *lambdaP = lambda0.getHostBuffer();
  CudaT *rhsP = rhs.getHostBuffer();
  CudaT *lohiP = lohiD.getHostBuffer();
  int4 *bodyIDsP = bodyIDs.getHostBuffer();
  int *fIDP = fIDs.getHostBuffer();
  Vec4T *jP = j0.getHostBuffer();

  for( size_t batchID = 0, hIndex = 0; batchID < batchSizes_.size(); ++batchID )
  {
    size_t dIndex = batchIndices_[ batchID ];
    for( int batchCount = 0; batchCount < batchSizes_[ batchID ]; ++batchCount, ++hIndex, ++dIndex )
    {
      const int scalarIndex = constraintIndices_[ hIndex ];
      const int vecIndex = scalarIndex * 12;

      const int body0ID = parallelParams_->jb[ scalarIndex*2 ];
      const int body1ID = parallelParams_->jb[ scalarIndex*2+1 ];

      const int body0ReductionID = reduceStrategy_->getFinalIndex( body0ID, (reductionOffsets0_[ hIndex ] % ParallelOptions::MAXBODYREPETITION) );
      const int body1ReductionID = body1ID < 0 ? -1 : reduceStrategy_->getFinalIndex( body1ID, (reductionOffsets1_[ hIndex ] % ParallelOptions::MAXBODYREPETITION) );

      bodyIDsP[ dIndex ] = make_int4( body0ID, body1ID, body0ReductionID, body1ReductionID );

      fIDP[ dIndex ] = parallelParams_->findex[ scalarIndex ];
      rhsP[ dIndex ] = parallelParams_->b[ scalarIndex ];
      adcfmP[ dIndex ] = parallelParams_->Adcfm[ scalarIndex ];

      lohiP[ dIndex ] = parallelParams_->lo[ scalarIndex ];
      lohiP[ cID(dIndex,1) ] = parallelParams_->hi[ scalarIndex ];

      jP[ dIndex ] = make_vec4( (CudaT)parallelParams_->J[vecIndex], (CudaT)parallelParams_->J[vecIndex+1], (CudaT)parallelParams_->J[vecIndex+2] );
      jP[ cID(dIndex,1) ] = make_vec4( (CudaT)parallelParams_->J[vecIndex+3], (CudaT)parallelParams_->J[vecIndex+4], (CudaT)parallelParams_->J[vecIndex+5] );
      jP[ cID(dIndex,2) ] = make_vec4( (CudaT)parallelParams_->J[vecIndex+6], (CudaT)parallelParams_->J[vecIndex+7], (CudaT)parallelParams_->J[vecIndex+8] );
      jP[ cID(dIndex,3) ] = make_vec4( (CudaT)parallelParams_->J[vecIndex+9], (CudaT)parallelParams_->J[vecIndex+10], (CudaT)parallelParams_->J[vecIndex+11] );

      lambdaP[ dIndex ] = (CudaT)0.0;
    }
  }

  adcfm.syncToDevice( copyType );
  lambda0.syncToDevice( copyType );
  rhs.syncToDevice( copyType );
  lohiD.syncToDevice( copyType );
  j0.syncToDevice( copyType );
  bodyIDs.syncToDevice( copyType );
  fIDs.syncToDevice( copyType );

  // The precomputed invM*J^T rows exist only when host-side preprocessing
  // produced them, so they are uploaded only in that case.
  if( !preprocessEnabled( ) )
  {
    Vec4T* imjP = ij0.getHostBuffer();
    ParamsTPtr iMJ = parallelParams_->iMJ;
    for( size_t batchID = 0, hIndex = 0; batchID < batchSizes_.size(); ++batchID )
    {
      size_t dIndex = batchIndices_[ batchID ];
      for( int batchCount = 0; batchCount < batchSizes_[ batchID ]; ++batchCount, ++hIndex, ++dIndex )
      {
        const int vecIndex = constraintIndices_[ hIndex ] * 12;

        imjP[ dIndex ] = make_vec4( (CudaT)iMJ[vecIndex], (CudaT)iMJ[vecIndex+1], (CudaT)iMJ[vecIndex+2] );
        imjP[ cID(dIndex,1) ] = make_vec4( (CudaT)iMJ[vecIndex+3], (CudaT)iMJ[vecIndex+4], (CudaT)iMJ[vecIndex+5] );
        imjP[ cID(dIndex,2) ] = make_vec4( (CudaT)iMJ[vecIndex+6], (CudaT)iMJ[vecIndex+7], (CudaT)iMJ[vecIndex+8] );
        imjP[ cID(dIndex,3) ] = make_vec4( (CudaT)iMJ[vecIndex+9], (CudaT)iMJ[vecIndex+10], (CudaT)iMJ[vecIndex+11] );
      }
    }

    ij0.syncToDevice( copyType );
  }
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadConstants( )
{
  // No-op in this implementation.
}

template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadKernels( )
{
  // No-op in this implementation.
}

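// Scatters the device solution back into ODE's arrays: lambda is un-permuted
// through constraintIndices_, and the force/torque accumulators are unpacked
// into the interleaved 6-per-body fc layout.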
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::loadSolution( )
{
  IFVERBOSE( lambda0.print("LAMBDA") );
  IFVERBOSE( bodyFAcc.print("BODYFACC") );
  IFVERBOSE( bodyTAcc.print("BODYTACC") );

  dxGlobalSync();

  CudaT *lambdaP = lambda0.getHostBuffer();
  Vec4T *fcP0 = bodyFAcc.getHostBuffer();
  Vec4T *fcP1 = bodyTAcc.getHostBuffer();

  for( size_t batchID = 0, hIndex = 0; batchID < batchSizes_.size(); ++batchID )
  {
    size_t dIndex = batchIndices_[ batchID ];
    for( int batchCount = 0; batchCount < batchSizes_[ batchID ]; ++batchCount, ++hIndex, ++dIndex )
    {
      parallelParams_->lambda[ constraintIndices_[ hIndex ] ] = lambdaP[ dIndex ];
    }
  }

  for( int bodyID = 0, fID = 0; bodyID < getNumBodies(); ++bodyID, fID += 6 )
  {
    parallelParams_->fc[ fID ]     = fcP0[ bodyID ].x;
    parallelParams_->fc[ fID + 1 ] = fcP0[ bodyID ].y;
    parallelParams_->fc[ fID + 2 ] = fcP0[ bodyID ].z;
    parallelParams_->fc[ fID + 3 ] = fcP1[ bodyID ].x;
    parallelParams_->fc[ fID + 4 ] = fcP1[ bodyID ].y;
    parallelParams_->fc[ fID + 5 ] = fcP1[ bodyID ].z;
  }
}

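// Records the body count and, when requested, (re)allocates the body-indexed
// device arrays using aligned strides.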
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::setBodySize( int n, bool bResizeArrays )
{
  IFTIMING( ParallelTimer timer( " -- setting body size " ) );
  n_ = n;

  if( bResizeArrays )
  {
    setBodyStride( alignDefaultSize( n_ ) );
    setReduceStride( alignDefaultSize( reduceStrategy_->getBodySizeWithReduction( ) ) );

    if( preprocessEnabled( ) )
    {
      iMass.setSize( n );
      i0.setSize( getBodyStride() * 3 );
    }

    bodyFAcc.setSize( getBodyStride() );
    bodyTAcc.setSize( getBodyStride() );

    if( reduceEnabled( ) )
    {
      bodyFAccReduction.setSize( getReduceStride() );
      bodyTAccReduction.setSize( getReduceStride() );
    }
  }
}

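// Records the constraint count and, when requested, (re)allocates the
// constraint-indexed device arrays; with alignment enabled they are padded
// out to the aligned batch stride.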
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::setConstraintSize( int m, bool bResizeArrays )
{
  IFTIMING( ParallelTimer timer( " -- setting constraint size " ) );
  m_ = m;

  constraintIndices_.resize( m );
  reductionOffsets0_.resize( m );
  reductionOffsets1_.resize( m );

  if( bResizeArrays )
  {
    setConstraintStride( parallel_utils::alignedSize( batchSizes_ ) );

    if( alignEnabled( ) )
    {
      m = getConstraintStride( );
    }

    j0.setSize( m * 4 );
    ij0.setSize( m * 4 );

    if( compactScalarsEnabled( ) )
    {
      adcfm.setSize( m * 4 );
    }
    else
    {
      adcfm.setSize( m );
      rhs.setSize( m );
      lohiD.setSize( m * 2 );
    }

    lambda0.setSize( m );
    bodyIDs.setSize( m );
    fIDs.setSize( m );
  }
}

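// Propagates the given memory flags to every device buffer owned by the
// solver in one place.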
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::setMemFlags( MemFlags flags )
{
  j0.setFlags( flags );
  ij0.setFlags( flags );
  lambda0.setFlags( flags );
  adcfm.setFlags( flags );
  rhs.setFlags( flags );
  lohiD.setFlags( flags );
  bodyIDs.setFlags( flags );
  fIDs.setFlags( flags );
  iMass.setFlags( flags );
  i0.setFlags( flags );
  bodyFAcc.setFlags( flags );
  bodyTAcc.setFlags( flags );
  bodyFAccReduction.setFlags( flags );
  bodyTAccReduction.setFlags( flags );
}

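// Debug dump of the problem dimensions: constraint and body counts, their
// ratio, and the worst-case body repetition across batches.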
template<typename CudaT, typename ParamsT, ParallelType PType>
void ParallelPGSSolver<CudaT,ParamsT,PType>::printConfig( )
{
  if( getNumBodies() <= 0 || getNumConstraints() <= 0 ) return;

  IFVERBOSE( lambda0.print("LAMBDA0") );
  IFVERBOSE( bodyFAcc.print("BODYFACC") );
  IFVERBOSE( bodyTAcc.print("BODYTACC") );

  int maxRepetitionCount = 0;
  for( ArrayIndex i = 0; i < batchRepetitionCount_.size(); ++i )
  {
    maxRepetitionCount = max( maxRepetitionCount, batchRepetitionCount_[ i ] );
  }

  printf( " -- (CSize,BSize,AvgCPerB,MaxBRepetition)=(%d,%d,%f,%d) --\n",
          getNumConstraints(),
          getNumBodies(),
          (CudaT)getNumConstraints() / (CudaT)getNumBodies(),
          maxRepetitionCount );
}

}  // namespace parallel_ode