parallel_quickstep: parallel_quickstep: parallel

00001 #include <ode/objects.h>
00002 #include <ode/ode.h>
00003 #include <ode/odemath.h>
00004 #include <ode/rotation.h>
00005 #include <ode/timer.h>
00006 #include <ode/error.h>
00007 #include <ode/matrix.h>
00008 #include <ode/misc.h>
00009 #include "objects.h"
00010 #include "config.h"
00011 #include "joints/joint.h"
00012 #include "lcp.h"
00013 #include "util.h"
00014 
00015 #include <parallel_common.h>
00016 #include <parallel_utils.h>
00017 #include <parallel_stepper.h>
00018 
00019 #if defined(USE_CUDA)
00020 #include <cuda_solver.h>
00021 #ifdef CUDA_DOUBLESUPPORT
00022 typedef parallel_ode::CudaPGSSolver<dReal,dReal> SolverType;
00023 #else
00024 typedef parallel_ode::CudaPGSSolver<float,dReal> SolverType;
00025 #endif
00026 #elif defined(USE_OPENCL)
00027 #include <opencl_solver.h>
00028 typedef parallel_ode::OpenCLPGSSolver<dReal> SolverType;
00029 #elif defined(USE_OPENMP)
00030 #include <openmp_solver.h>
00031 typedef parallel_ode::OpenMPPGSSolver<dReal> SolverType;
00032 #endif
00033 
00034 #if PARALLEL_ENABLED
00035 static SolverType parallelSolver;
00036 #endif
00037 
00038 typedef const dReal *dRealPtr;
00039 typedef dReal *dRealMutablePtr;
00040 
00041 template
00042 void compute_invM_JT<dReal> (int m, dRealPtr J, dRealMutablePtr iMJ, int *jb,
00043                              dxBody * const *body, const dRealPtr invI);
00044 
00045 template
00046 void compute_Adcfm_b<dReal> (int m, dReal sor_w, dRealMutablePtr J, dRealPtr iMJ, int* jb, dRealPtr cfm,
00047                              dRealMutablePtr Adcfm, dRealMutablePtr b);
00048 
00049 #define RANDOMLY_REORDER_CONSTRAINTS 1
00050 
00051 #ifdef BENCHMARKING
00052 static unsigned long totalConstraints = 0;
00053 static unsigned long totalBodies = 0;
00054 static const unsigned long maxBenchmarkIters = 300;
00055 static const unsigned long minBenchmarkIters = 50;
00056 static unsigned long benchmarkRoundsRemaining = 5;
00057 static unsigned long benchmarkIters = 0;
00058 static int sorItersMult = 2;
00059 static dReal sorParamMult = .85;
00060 
00061 static void benchmarkIteration( dxWorld *world) {
00062    ++benchmarkIters;
00063 
00064   if (benchmarkIters >= maxBenchmarkIters) {
00065     printf("Iterations = %d, Avg Constraints  = %f, Avg Bodies = %f\n",
00066            dWorldGetQuickStepNumIterations( world ),
00067            (dReal)totalConstraints / (dReal)(maxBenchmarkIters - minBenchmarkIters),
00068            (dReal)totalBodies / (dReal)(maxBenchmarkIters - minBenchmarkIters) );
00069 
00070     dTimerReport (stdout,1);
00071 
00072     --benchmarkRoundsRemaining;
00073     if( benchmarkRoundsRemaining == 0 ) exit(0);
00074 
00075     totalConstraints=0;
00076     totalBodies=0;
00077     benchmarkIters=0;
00078     dWorldSetQuickStepNumIterations( world, (int)(dWorldGetQuickStepNumIterations( world ) * sorItersMult) );
00079     dWorldSetQuickStepW( world, (dReal)(dWorldGetQuickStepW( world ) * sorParamMult) );
00080   }
00081 }
00082 #endif
00083 
00084 struct IndexError {
00085   int index;            // row index
00086 };
00087 
00088 template<typename T>
00089 void compute_invM_JT (int m, const T* J, T* iMJ, int *jb,
00090                       dxBody * const *body, const T* invI)
00091 {
00092   IFTIMING( dTimerNow("compute invM/JT ") );
00093 
00094   // precompute iMJ = inv(M)*J'
00095   dRealMutablePtr iMJ_ptr = iMJ;
00096   dRealPtr J_ptr = J;
00097   for (int i=0; i<m; J_ptr += 12, iMJ_ptr += 12, i++) {
00098     int b1 = jb[i*2];
00099     int b2 = jb[i*2+1];
00100     dReal k1 = body[b1]->invMass;
00101     for (int j=0; j<3; j++) iMJ_ptr[j] = k1*J_ptr[j];
00102     const dReal *invIrow1 = invI + 12*b1;
00103     dMultiply0_331 (iMJ_ptr + 3, invIrow1, J_ptr + 3);
00104     if (b2 >= 0) {
00105       dReal k2 = body[b2]->invMass;
00106       for (int j=0; j<3; j++) iMJ_ptr[j+6] = k2*J_ptr[j+6];
00107       const dReal *invIrow2 = invI + 12*b2;
00108       dMultiply0_331 (iMJ_ptr + 9, invIrow2, J_ptr + 9);
00109     }
00110   }
00111 }
00112 
00113 template<typename T>
00114 void compute_Adcfm_b (int m, T sor_w, T* J, const T* iMJ, int* jb, const T* cfm,
00115                              T* Adcfm, T* b)
00116 {
00117   IFTIMING( dTimerNow("compute Adcfm/b") );
00118   {
00119     // precompute 1 / diagonals of A
00120     dRealPtr iMJ_ptr = iMJ;
00121     dRealPtr J_ptr = J;
00122     for (int i=0; i<m; J_ptr += 12, iMJ_ptr += 12, i++) {
00123       dReal sum = 0;
00124       for (int j=0; j<6; j++) sum += iMJ_ptr[j] * J_ptr[j];
00125       if (jb[i*2+1] >= 0) {
00126         for (int k=6; k<12; k++) sum += iMJ_ptr[k] * J_ptr[k];
00127       }
00128       Adcfm[i] = sor_w / (sum + cfm[i]);
00129     }
00130   }
00131 
00132   {
00133     // scale J and b by Ad
00134     dRealMutablePtr J_ptr = J;
00135     for (int i=0; i<m; J_ptr += 12, i++) {
00136       dReal Ad_i = Adcfm[i];
00137       for (int j=0; j<12; j++) {
00138         J_ptr[j] *= Ad_i;
00139       }
00140       b[i] *= Ad_i;
00141       // scale Ad by CFM. N.B. this should be done last since it is used above
00142       Adcfm[i] = Ad_i * cfm[i];
00143     }
00144   }
00145 }
00146 
00147 static void worldSolve(dxWorldProcessContext *context,
00148   const int m, const int nb, dRealMutablePtr J, int *jb, dxBody * const *body,
00149   dRealPtr invI, dRealMutablePtr lambda, dRealMutablePtr fc, dRealMutablePtr b,
00150   dRealPtr lo, dRealPtr hi, dRealPtr cfm, dRealMutablePtr iMJ, dRealMutablePtr Ad, const int *findex,
00151   const dxQuickStepParameters *qs)
00152 {
00153   IFVERBOSE( printf ("(# Bodies, #Constraints) = (%d, %d )\n",nb,m) );
00154 
00155   compute_invM_JT (m,J,iMJ,jb,body,invI);
00156   dSetZero (lambda,m);
00157   compute_Adcfm_b (m, qs->w, J, iMJ, jb, cfm, Ad, b);
00158   dSetZero (fc,nb*6);
00159 
00160   IFTIMING( dTimerNow ("solving LCP problem") );
00161 
00162   // order to solve constraint rows in
00163   IndexError *order = context->AllocateArray<IndexError> (m);
00164 
00165   {
00166     // make sure constraints with findex < 0 come first.
00167     IndexError *orderhead = order, *ordertail = order + (m - 1);
00168 
00169     // Fill the array from both ends
00170     for (int i=0; i<m; i++) {
00171       if (findex[i] < 0) {
00172         orderhead->index = i; // Place them at the front
00173         ++orderhead;
00174       } else {
00175         ordertail->index = i; // Place them at the end
00176         --ordertail;
00177       }
00178     }
00179     dIASSERT (orderhead-ordertail==1);
00180   }
00181 
00182 #if COMPUTE_ERROR
00183   dReal *delta_error = context->AllocateArray<dReal> (m);
00184 #endif
00185 
00186   const int num_iterations = qs->num_iterations;
00187   for (int iteration=0; iteration < num_iterations; iteration++) {
00188 
00189 #ifdef RANDOMLY_REORDER_CONSTRAINTS
00190     if ((iteration & 7) == 0) {
00191       for (int i=1; i<m; i++) {
00192         int swapi = dRandInt(i+1);
00193         IndexError tmp = order[i];
00194         order[i] = order[swapi];
00195         order[swapi] = tmp;
00196       }
00197     }
00198 #endif
00199 
00200 #if COMPUTE_ERROR
00201     dReal rms_error = 0;
00202 #endif
00203 
00204     for (int i=0; i<m; i++) {
00205       // @@@ potential optimization: we could pre-sort J and iMJ, thereby
00206       //     linearizing access to those arrays. hmmm, this does not seem
00207       //     like a win, but we should think carefully about our memory
00208       //     access pattern.
00209 
00210       int index = order[i].index;
00211 
00212       dRealMutablePtr fc_ptr1;
00213       dRealMutablePtr fc_ptr2;
00214       dReal delta;
00215 
00216       {
00217         int b1 = jb[index*2];
00218         int b2 = jb[index*2+1];
00219         fc_ptr1 = fc + 6*b1;
00220         fc_ptr2 = (b2 >= 0) ? fc + 6*b2 : NULL;
00221       }
00222 
00223       dReal old_lambda = lambda[index];
00224 
00225       {
00226         delta = b[index] - old_lambda*Ad[index];
00227 
00228         dRealPtr J_ptr = J + index*12;
00229         // @@@ potential optimization: SIMD-ize this and the b2 >= 0 case
00230         delta -=fc_ptr1[0] * J_ptr[0] + fc_ptr1[1] * J_ptr[1] +
00231           fc_ptr1[2] * J_ptr[2] + fc_ptr1[3] * J_ptr[3] +
00232           fc_ptr1[4] * J_ptr[4] + fc_ptr1[5] * J_ptr[5];
00233         // @@@ potential optimization: handle 1-body constraints in a separate
00234         //     loop to avoid the cost of test & jump?
00235         if (fc_ptr2) {
00236           delta -=fc_ptr2[0] * J_ptr[6] + fc_ptr2[1] * J_ptr[7] +
00237             fc_ptr2[2] * J_ptr[8] + fc_ptr2[3] * J_ptr[9] +
00238             fc_ptr2[4] * J_ptr[10] + fc_ptr2[5] * J_ptr[11];
00239         }
00240       }
00241 
00242 
00243       {
00244         dReal hi_act, lo_act;
00245 
00246         // set the limits for this constraint.
00247         // this is the place where the QuickStep method differs from the
00248         // direct LCP solving method, since that method only performs this
00249         // limit adjustment once per time step, whereas this method performs
00250         // once per iteration per constraint row.
00251         // the constraints are ordered so that all lambda[] values needed have
00252         // already been computed.
00253         if (findex[index] >= 0) {
00254           hi_act = dFabs (hi[index] * lambda[findex[index]]);
00255           lo_act = -hi_act;
00256         } else {
00257           hi_act = hi[index];
00258           lo_act = lo[index];
00259         }
00260 
00261         // compute lambda and clamp it to [lo,hi].
00262         // @@@ potential optimization: does SSE have clamping instructions
00263         //     to save test+jump penalties here?
00264         dReal new_lambda = old_lambda + delta;
00265         if (new_lambda < lo_act) {
00266           delta = lo_act-old_lambda;
00267           lambda[index] = lo_act;
00268         }
00269         else if (new_lambda > hi_act) {
00270           delta = hi_act-old_lambda;
00271           lambda[index] = hi_act;
00272         }
00273         else {
00274           lambda[index] = new_lambda;
00275         }
00276       }
00277 
00278 #if COMPUTE_ERROR
00279       rms_error += delta*delta;
00280       delta_error[index] = dFabs(delta);
00281 #endif
00282 
00283       //@@@ a trick that may or may not help
00284       //dReal ramp = (1-((dReal)(iteration+1)/(dReal)num_iterations));
00285       //delta *= ramp;
00286 
00287       {
00288         dRealPtr iMJ_ptr = iMJ + index*12;
00289         // update fc.
00290         // @@@ potential optimization: SIMD for this and the b2 >= 0 case
00291         fc_ptr1[0] += delta * iMJ_ptr[0];
00292         fc_ptr1[1] += delta * iMJ_ptr[1];
00293         fc_ptr1[2] += delta * iMJ_ptr[2];
00294         fc_ptr1[3] += delta * iMJ_ptr[3];
00295         fc_ptr1[4] += delta * iMJ_ptr[4];
00296         fc_ptr1[5] += delta * iMJ_ptr[5];
00297         // @@@ potential optimization: handle 1-body constraints in a separate
00298         //     loop to avoid the cost of test & jump?
00299         if (fc_ptr2) {
00300           fc_ptr2[0] += delta * iMJ_ptr[6];
00301           fc_ptr2[1] += delta * iMJ_ptr[7];
00302           fc_ptr2[2] += delta * iMJ_ptr[8];
00303           fc_ptr2[3] += delta * iMJ_ptr[9];
00304           fc_ptr2[4] += delta * iMJ_ptr[10];
00305           fc_ptr2[5] += delta * iMJ_ptr[11];
00306         }
00307       }
00308     }
00309 #if COMPUTE_ERROR
00310     if ( iteration == num_iterations - 1 ) {
00311       // do we need to compute norm across entire solution space (0,m)?
00312       // since local convergence might produce errors in other nodes?
00313 #ifdef RECOMPUTE_RMS
00314       // recompute rms_error to be sure swap is not corrupting arrays
00315       rms_error = 0;
00316 #ifdef USE_1NORM
00317       for (int i=0; i<m; i++)
00318       {
00319         rms_error = dFabs(delta_error[order[i].index]) > rms_error ? dFabs(delta_error[order[i].index]) : rms_error; // 1norm test
00320       }
00321 #else // use 2 norm
00322       for (int i=0; i<m; i++)  // use entire solution vector errors
00323         rms_error += delta_error[order[i].index]*delta_error[order[i].index]; 
00324       rms_error = sqrt(rms_error); 
00325 #endif
00326 #else
00327       rms_error = sqrt(rms_error); 
00328 #endif
00329       printf("(Iteration, Error) = (%d, %20.18f)\n",iteration,rms_error);
00330     }
00331 #endif
00332   }
00333 }
00334 
00335 void dxParallelProcessIslands (dxWorld *world, dReal stepsize, dstepper_fn_t stepper)
00336 {
00337   const int sizeelements = 2;
00338 
00339   dxStepWorkingMemory *wmem = world->wmem;
00340   dIASSERT(wmem != NULL);
00341 
00342   dxWorldProcessContext *context = wmem->GetWorldProcessingContext();
00343 
00344   size_t const *islandreqs = NULL;
00345   int islandcount;
00346   int const *islandsizes;
00347   dxBody *const *body;
00348   dxJoint *const *joint;
00349   context->RetrievePreallocations(islandcount, islandsizes, body, joint, islandreqs);
00350 
00351   dxBody *const *bodystart = body;
00352   dxJoint *const *jointstart = joint;
00353 
00354   int const *const sizesend = islandsizes + islandcount * sizeelements;
00355 
00356   int bcount = 0;
00357   int jcount = 0;
00358 
00359   for (int const *sizescurr = islandsizes; sizescurr != sizesend; sizescurr += sizeelements) {
00360     bcount += sizescurr[0];
00361     jcount += sizescurr[1];
00362   }
00363 
00364   BEGIN_STATE_SAVE(context, stepperstate) {
00365     stepper (context,world,bodystart,bcount,jointstart,jcount,stepsize);
00366   } END_STATE_SAVE(context, stepperstate);
00367 
00368   context->CleanupContext();
00369   dIASSERT(context->IsStructureValid());
00370 }
00371 
00372 //***************************************************************************
00373 // configuration
00374 
00375 #define RANDOMLY_REORDER_CONSTRAINTS 1
00376 
00377 //****************************************************************************
00378 // special matrix multipliers
00379 
00380 // multiply block of B matrix (q x 6) with 12 dReal per row with C vektor (q)
00381 static void Multiply1_12q1 (dReal *A, const dReal *B, const dReal *C, int q)
00382 {
00383   dIASSERT (q>0 && A && B && C);
00384 
00385   dReal a = 0;
00386   dReal b = 0;
00387   dReal c = 0;
00388   dReal d = 0;
00389   dReal e = 0;
00390   dReal f = 0;
00391   dReal s;
00392 
00393   for(int i=0, k = 0; i<q; k += 12, i++)
00394   {
00395     s = C[i]; //C[i] and B[n+k] cannot overlap because its value has been read into a temporary.
00396 
00397     //For the rest of the loop, the only memory dependency (array) is from B[]
00398     a += B[  k] * s;
00399     b += B[1+k] * s;
00400     c += B[2+k] * s;
00401     d += B[3+k] * s;
00402     e += B[4+k] * s;
00403     f += B[5+k] * s;
00404   }
00405 
00406   A[0] = a;
00407   A[1] = b;
00408   A[2] = c;
00409   A[3] = d;
00410   A[4] = e;
00411   A[5] = f;
00412 }
00413 
00414 //***************************************************************************
00415 // various common computations involving the matrix J
00416 
00417 // compute out = J*in.
00418 
00419 static void multiply_J (int m, dRealPtr J, int *jb,
00420   dRealPtr in, dRealMutablePtr out)
00421 {
00422   dRealPtr J_ptr = J;
00423   for (int i=0; i<m; i++) {
00424     int b1 = jb[i*2];
00425     int b2 = jb[i*2+1];
00426     dReal sum = 0;
00427     dRealPtr in_ptr = in + b1*6;
00428     for (int j=0; j<6; j++) sum += J_ptr[j] * in_ptr[j];
00429     J_ptr += 6;
00430     if (b2 >= 0) {
00431       in_ptr = in + b2*6;
00432       for (int j=0; j<6; j++) sum += J_ptr[j] * in_ptr[j];
00433     }
00434     J_ptr += 6;
00435     out[i] = sum;
00436   }
00437 }
00438 
00439 struct dJointWithInfo1
00440 {
00441   dxJoint *joint;
00442   dxJoint::Info1 info;
00443 };
00444 
00445 void dxParallelQuickStepper( dxWorldProcessContext *context,
00446                              dxWorld *world, dxBody * const *body, int nb,
00447                              dxJoint * const *_joint, int _nj, dReal stepsize)
00448 {
00449 
00450   IFTIMING(dTimerStart("preprocessing"));
00451   const dReal stepsize1 = dRecip(stepsize);
00452 
00453   {
00454     // number all bodies in the body list - set their tag values
00455     for (int i=0; i<nb; i++) body[i]->tag = i;
00456   }
00457 
00458   // for all bodies, compute the inertia tensor and its inverse in the global
00459   // frame, and compute the rotational force and add it to the torque
00460   // accumulator. I and invI are a vertical stack of 3x4 matrices, one per body.
00461   dReal *invI = context->AllocateArray<dReal> (3*4*nb);
00462 
00463   {
00464     dReal *invIrow = invI;
00465     dxBody *const *const bodyend = body + nb;
00466     for (dxBody *const *bodycurr = body; bodycurr != bodyend; invIrow += 12, bodycurr++) {
00467       dMatrix3 tmp;
00468       dxBody *b = *bodycurr;
00469 
00470       // compute inverse inertia tensor in global frame
00471       dMultiply2_333 (tmp,b->invI,b->posr.R);
00472       dMultiply0_333 (invIrow,b->posr.R,tmp);
00473 
00474       if (b->flags & dxBodyGyroscopic) {
00475         dMatrix3 I;
00476         // compute inertia tensor in global frame
00477         dMultiply2_333 (tmp,b->mass.I,b->posr.R);
00478         dMultiply0_333 (I,b->posr.R,tmp);
00479         // compute rotational force
00480         dMultiply0_331 (tmp,I,b->avel);
00481         dSubtractVectorCross3(b->tacc,b->avel,tmp);
00482       }
00483     }
00484   }
00485 
00486   // get the masses for every body
00487   dReal *invM = context->AllocateArray<dReal> (nb);
00488   {
00489     dReal *invMrow = invM;
00490     dxBody *const *const bodyend = body + nb;
00491     for (dxBody *const *bodycurr = body; bodycurr != bodyend; invMrow++, bodycurr++) {
00492       dxBody *b = *bodycurr;
00493       //*invMrow = b->mass.mass;
00494       *invMrow = b->invMass;
00495 
00496     }
00497   }
00498 
00499   {
00500     // add the gravity force to all bodies
00501     // since gravity does normally have only one component it's more efficient
00502     // to run three loops for each individual component
00503     dxBody *const *const bodyend = body + nb;
00504     dReal gravity_x = world->gravity[0];
00505     if (gravity_x) {
00506       for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
00507         dxBody *b = *bodycurr;
00508         if ((b->flags & dxBodyNoGravity)==0) {
00509           b->facc[0] += b->mass.mass * gravity_x;
00510         }
00511       }
00512     }
00513     dReal gravity_y = world->gravity[1];
00514     if (gravity_y) {
00515       for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
00516         dxBody *b = *bodycurr;
00517         if ((b->flags & dxBodyNoGravity)==0) {
00518           b->facc[1] += b->mass.mass * gravity_y;
00519         }
00520       }
00521     }
00522     dReal gravity_z = world->gravity[2];
00523     if (gravity_z) {
00524       for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
00525         dxBody *b = *bodycurr;
00526         if ((b->flags & dxBodyNoGravity)==0) {
00527           b->facc[2] += b->mass.mass * gravity_z;
00528         }
00529       }
00530     }
00531   }
00532 
00533   // get joint information (m = total constraint dimension, nub = number of unbounded variables).
00534   // joints with m=0 are inactive and are removed from the joints array
00535   // entirely, so that the code that follows does not consider them.
00536   dJointWithInfo1 *const jointiinfos = context->AllocateArray<dJointWithInfo1> (_nj);
00537   int nj;
00538 
00539   {
00540     dJointWithInfo1 *jicurr = jointiinfos;
00541     dxJoint *const *const _jend = _joint + _nj;
00542     for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; _jcurr++) {  // jicurr=dest, _jcurr=src
00543       dxJoint *j = *_jcurr;
00544       j->getInfo1 (&jicurr->info);
00545       dIASSERT (jicurr->info.m >= 0 && jicurr->info.m <= 6 && jicurr->info.nub >= 0 && jicurr->info.nub <= jicurr->info.m);
00546       if (jicurr->info.m > 0) {
00547         jicurr->joint = j;
00548         jicurr++;
00549       }
00550     }
00551     nj = jicurr - jointiinfos;
00552   }
00553 
00554   context->ShrinkArray<dJointWithInfo1>(jointiinfos, _nj, nj);
00555 
00556   int m;
00557   int mfb; // number of rows of Jacobian we will have to save for joint feedback
00558 
00559   {
00560     int mcurr = 0, mfbcurr = 0;
00561     const dJointWithInfo1 *jicurr = jointiinfos;
00562     const dJointWithInfo1 *const jiend = jicurr + nj;
00563     for (; jicurr != jiend; jicurr++) {
00564       int jm = jicurr->info.m;
00565       mcurr += jm;
00566       if (jicurr->joint->feedback)
00567         mfbcurr += jm;
00568     }
00569 
00570     m = mcurr;
00571     mfb = mfbcurr;
00572   }
00573 
00574   // if there are constraints, compute the constraint force
00575   dReal *J = NULL;
00576   int *jb = NULL;
00577 
00578   if (m > 0) {
00579     dReal *cfm, *lo, *hi, *rhs, *Jcopy;
00580     int *findex;
00581 
00582     {
00583       int mlocal = m;
00584 
00585       const unsigned jelements = mlocal*12;
00586       J = context->AllocateArray<dReal> (jelements);
00587       dSetZero (J,jelements);
00588 
00589       // create a constraint equation right hand side vector `c', a constraint
00590       // force mixing vector `cfm', and LCP low and high bound vectors, and an
00591       // 'findex' vector.
00592       cfm = context->AllocateArray<dReal> (mlocal);
00593       dSetValue (cfm,mlocal,world->global_cfm);
00594 
00595       lo = context->AllocateArray<dReal> (mlocal);
00596       dSetValue (lo,mlocal,-dInfinity);
00597 
00598       hi = context->AllocateArray<dReal> (mlocal);
00599       dSetValue (hi,mlocal,dInfinity);
00600 
00601       findex = context->AllocateArray<int> (mlocal);
00602       for (int i=0; i<mlocal; i++) findex[i] = -1;
00603 
00604       const unsigned jbelements = mlocal*2;
00605       jb = context->AllocateArray<int> (jbelements);
00606 
00607       rhs = context->AllocateArray<dReal> (mlocal);
00608 
00609       Jcopy = context->AllocateArray<dReal> (mfb*12);
00610 
00611     }
00612 
00613     BEGIN_STATE_SAVE(context, cstate) {
00614       dReal *c = context->AllocateArray<dReal> (m);
00615       dSetZero (c, m);
00616 
00617       {
00618         IFTIMING (dTimerNow ("create J"));
00619         // get jacobian data from constraints. an m*12 matrix will be created
00620         // to store the two jacobian blocks from each constraint. it has this
00621         // format:
00622         //
00623         //   l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 \    .
00624         //   l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2  )-- jacobian for joint 0, body 1 and body 2 (3 rows)
00625         //   l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 /
00626         //   l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 )--- jacobian for joint 1, body 1 and body 2 (3 rows)
00627         //   etc...
00628         //
00629         //   (lll) = linear jacobian data
00630         //   (aaa) = angular jacobian data
00631         //
00632         dxJoint::Info2 Jinfo;
00633         Jinfo.rowskip = 12;
00634         Jinfo.fps = stepsize1;
00635         Jinfo.erp = world->global_erp;
00636 
00637         dReal *Jcopyrow = Jcopy;
00638         unsigned ofsi = 0;
00639         const dJointWithInfo1 *jicurr = jointiinfos;
00640         const dJointWithInfo1 *const jiend = jicurr + nj;
00641         for (; jicurr != jiend; jicurr++) {
00642           dReal *const Jrow = J + ofsi * 12;
00643           Jinfo.J1l = Jrow;
00644           Jinfo.J1a = Jrow + 3;
00645           Jinfo.J2l = Jrow + 6;
00646           Jinfo.J2a = Jrow + 9;
00647           Jinfo.c = c + ofsi;
00648           Jinfo.cfm = cfm + ofsi;
00649           Jinfo.lo = lo + ofsi;
00650           Jinfo.hi = hi + ofsi;
00651           Jinfo.findex = findex + ofsi;
00652 
00653 
00654           // now write all information into J
00655           dxJoint *joint = jicurr->joint;
00656           joint->getInfo2 (&Jinfo);
00657           const int infom = jicurr->info.m;
00658 
00659           // we need a copy of Jacobian for joint feedbacks
00660           // because it gets destroyed by SOR solver
00661           // instead of saving all Jacobian, we can save just rows
00662           // for joints, that requested feedback (which is normally much less)
00663           if (joint->feedback) {
00664             const int rowels = infom * 12;
00665             memcpy(Jcopyrow, Jrow, rowels * sizeof(dReal));
00666             Jcopyrow += rowels;
00667           }
00668 
00669           // adjust returned findex values for global index numbering
00670           int *findex_ofsi = findex + ofsi;
00671           for (int j=0; j<infom; j++) {
00672             int fival = findex_ofsi[j];
00673             if (fival >= 0)
00674               findex_ofsi[j] = fival + ofsi;
00675           }
00676 
00677           ofsi += infom;
00678         }
00679       }
00680 
00681       {
00682         // create an array of body numbers for each joint row
00683         int *jb_ptr = jb;
00684         const dJointWithInfo1 *jicurr = jointiinfos;
00685         const dJointWithInfo1 *const jiend = jicurr + nj;
00686         for (; jicurr != jiend; jicurr++) {
00687           dxJoint *joint = jicurr->joint;
00688           const int infom = jicurr->info.m;
00689 
00690           int b1 = (joint->node[0].body) ? (joint->node[0].body->tag) : -1;
00691           int b2 = (joint->node[1].body) ? (joint->node[1].body->tag) : -1;
00692           for (int j=0; j<infom; j++) {
00693             jb_ptr[0] = b1;
00694             jb_ptr[1] = b2;
00695             jb_ptr += 2;
00696           }
00697         }
00698         dIASSERT (jb_ptr == jb+2*m);
00699       }
00700 
00701       BEGIN_STATE_SAVE(context, tmp1state) {
00702         IFTIMING (dTimerNow ("compute rhs"));
00703         // compute the right hand side `rhs'
00704         dReal *tmp1 = context->AllocateArray<dReal> (nb*6);
00705         // put v/h + invM*fe into tmp1
00706         dReal *tmp1curr = tmp1;
00707         const dReal *invIrow = invI;
00708         dxBody *const *const bodyend = body + nb;
00709         for (dxBody *const *bodycurr = body; bodycurr != bodyend; tmp1curr+=6, invIrow+=12, bodycurr++) {
00710           dxBody *b = *bodycurr;
00711           dReal body_invMass = b->invMass;
00712           for (int j=0; j<3; j++) tmp1curr[j] = b->facc[j] * body_invMass + b->lvel[j] * stepsize1;
00713           dMultiply0_331 (tmp1curr + 3,invIrow,b->tacc);
00714           for (int k=0; k<3; k++) tmp1curr[3+k] += b->avel[k] * stepsize1;
00715         }
00716 
00717         // put J*tmp1 into rhs
00718         multiply_J (m,J,jb,tmp1,rhs);
00719       } END_STATE_SAVE(context, tmp1state);
00720 
00721       // complete rhs
00722       for (int i=0; i<m; i++) rhs[i] = c[i]*stepsize1 - rhs[i];
00723 
00724       // scale CFM
00725       for (int j=0; j<m; j++) cfm[j] *= stepsize1;
00726 
00727     } END_STATE_SAVE(context, cstate);
00728 
00729     /*
00730     for( int i=0; i<m; i++ ) {
00731       lo[i] = CUDA_MIN > lo[i] ? CUDA_MIN : lo[i];//std::min(lo[i], (dReal)CUDA_MIN);//
00732       hi[i] = hi[i] > CUDA_MAX ? CUDA_MAX : hi[i];//std::min(hi[i], (dReal)CUDA_MAX);//
00733     }
00734     */
00735 
00736     // load lambda from the value saved on the previous iteration
00737     dReal *lambda = context->AllocateArray<dReal> (m);
00738 
00739     dReal *cforce = context->AllocateArray<dReal> (nb*6);
00740 
00741     dReal *iMJ = context->AllocateArray<dReal> (m*12);
00742     dReal *Ad = context->AllocateArray<dReal> (m);
00743 
00744     BEGIN_STATE_SAVE(context, lcpstate) {
00745       // solve the LCP problem and get lambda and invM*constraint_force
00746       IFBENCHMARKING( if( benchmarkIters > minBenchmarkIters ) totalConstraints+=m; totalBodies+=nb; dTimerStart(" LCP Solve - Parallel "););
00747 
00748 #if PARALLEL_ENABLED
00749         // FIXMED: need to first cast params into float, right now, SolverType is float, but params contains dRal pointers
00750         SolverType::SolverParams params(context,&world->qs,m,nb,J,jb,body,invI,lambda,cforce,rhs,lo,hi,cfm,iMJ,Ad,findex,stepsize);
00751         parallelSolver.worldSolve( &params );
00752 #else
00753         worldSolve(context,m,nb,J,jb,body,invI,lambda,cforce,rhs,lo,hi,cfm,iMJ,Ad,findex,&world->qs);
00754 #endif
00755 
00756       IFBENCHMARKING (dTimerEnd());
00757 
00758     } END_STATE_SAVE(context, lcpstate);
00759 
00760     // note that the SOR method overwrites rhs and J at this point, so
00761     // they should not be used again.
00762     {
00763       IFTIMING (dTimerNow ("velocity update due to constraint forces"));
00764       // note that cforce is really not a force but an acceleration, hence there is
00765       // no premultiplying of invM here (compare to update due to external force 'facc' below)
00766       //
00767       // add stepsize * cforce to the body velocity
00768       const dReal *cforcecurr = cforce;
00769       dxBody *const *const bodyend = body + nb;
00770       for (dxBody *const *bodycurr = body; bodycurr != bodyend; cforcecurr+=6, bodycurr++) {
00771         dxBody *b = *bodycurr;
00772         for (int j=0; j<3; j++) {
00773           b->lvel[j] += stepsize * cforcecurr[j];
00774           b->avel[j] += stepsize * cforcecurr[3+j];
00775         }
00776       }
00777     }
00778 
00779     if (mfb > 0) {
00780       // straightforward computation of joint constraint forces:
00781       // multiply related lambdas with respective J' block for joints
00782       // where feedback was requested
00783       dReal data[6];
00784       const dReal *lambdacurr = lambda;
00785       const dReal *Jcopyrow = Jcopy;
00786       const dJointWithInfo1 *jicurr = jointiinfos;
00787       const dJointWithInfo1 *const jiend = jicurr + nj;
00788       for (; jicurr != jiend; jicurr++) {
00789         dxJoint *joint = jicurr->joint;
00790         const int infom = jicurr->info.m;
00791 
00792         if (joint->feedback) {
00793           dJointFeedback *fb = joint->feedback;
00794           Multiply1_12q1 (data, Jcopyrow, lambdacurr, infom);
00795           fb->f1[0] = data[0];
00796           fb->f1[1] = data[1];
00797           fb->f1[2] = data[2];
00798           fb->t1[0] = data[3];
00799           fb->t1[1] = data[4];
00800           fb->t1[2] = data[5];
00801 
00802           if (joint->node[1].body)
00803           {
00804             Multiply1_12q1 (data, Jcopyrow+6, lambdacurr, infom);
00805             fb->f2[0] = data[0];
00806             fb->f2[1] = data[1];
00807             fb->f2[2] = data[2];
00808             fb->t2[0] = data[3];
00809             fb->t2[1] = data[4];
00810             fb->t2[2] = data[5];
00811           }
00812 
00813           Jcopyrow += infom * 12;
00814         }
00815 
00816         lambdacurr += infom;
00817       }
00818     }
00819   }
00820 
00821   {
00822     IFTIMING (dTimerNow ("compute velocity update"));
00823     // compute the velocity update:
00824     // add stepsize * invM * fe to the body velocity
00825     const dReal *invIrow = invI;
00826     dxBody *const *const bodyend = body + nb;
00827     for (dxBody *const *bodycurr = body; bodycurr != bodyend; invIrow += 12, bodycurr++) {
00828       dxBody *b = *bodycurr;
00829       dReal body_invMass_mul_stepsize = stepsize * b->invMass;
00830       for (int j=0; j<3; j++) {
00831         b->lvel[j] += body_invMass_mul_stepsize * b->facc[j];
00832         b->tacc[j] *= stepsize;
00833       }
00834       dMultiplyAdd0_331 (b->avel, invIrow, b->tacc);
00835     }
00836   }
00837 
00838   {
00839     // update the position and orientation from the new linear/angular velocity
00840     // (over the given timestep)
00841     IFTIMING (dTimerNow ("update position"));
00842     dxBody *const *const bodyend = body + nb;
00843     for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
00844       dxBody *b = *bodycurr;
00845       dxStepBody (b,stepsize);
00846     }
00847   }
00848 
00849   {
00850     IFTIMING (dTimerNow ("tidy up"));
00851     // zero all force accumulators
00852     dxBody *const *const bodyend = body + nb;
00853     for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
00854       dxBody *b = *bodycurr;
00855       dSetZero (b->facc,3);
00856       dSetZero (b->tacc,3);
00857     }
00858   }
00859 
00860   IFTIMING (dTimerEnd());
00861   IFTIMING (if (m > 0) dTimerReport (stdout,1));
00862   IFBENCHMARKING ( benchmarkIteration( world ) );
00863 }
00864 
00865 static size_t EstimateParallelSOR_LCPMemoryRequirements(int m, int n)
00866 {
00867   size_t res = dEFFICIENT_SIZE(sizeof(dReal) * 4 * m) * 4; // for ij
00868   res += dEFFICIENT_SIZE(sizeof(dReal) * 4 * m) * 4; // for j
00869   res += dEFFICIENT_SIZE(sizeof(dReal) * m); // for adcfm
00870   res += dEFFICIENT_SIZE(sizeof(dReal) * m); // for rhs
00871   res += dEFFICIENT_SIZE(sizeof(dReal) * m) * 2; // for lohi
00872   res += dEFFICIENT_SIZE(sizeof(int) * m * 4); // for bodyIDs
00873   res += dEFFICIENT_SIZE(sizeof(int) * m); // for fIDs
00874   res += dEFFICIENT_SIZE(sizeof(dReal) * m); // for lambda
00875 
00876   res += dEFFICIENT_SIZE(sizeof(dReal) * m); // for iMass
00877   res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4) * 3; // for i0
00878   res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4); // for bodyFAcc
00879   res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4); // for bodyTAcc
00880   res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4) * (size_t)ParallelOptions::MAXBODYREPETITION; // for bodyFAccReduction
00881   res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4) * (size_t)ParallelOptions::MAXBODYREPETITION; // for bodyTAccReduction
00882 
00883   return res;
00884 }
00885 
00886 size_t dxEstimateParallelStepMemoryRequirements (
00887   dxBody * const *body, int nb, dxJoint * const *_joint, int _nj)
00888 {
00889   int nj, m, mfb;
00890 
00891   {
00892     int njcurr = 0, mcurr = 0, mfbcurr = 0;
00893     dxJoint::SureMaxInfo info;
00894     dxJoint *const *const _jend = _joint + _nj;
00895     for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; _jcurr++) {
00896       dxJoint *j = *_jcurr;
00897       j->getSureMaxInfo (&info);
00898 
00899       int jm = info.max_m;
00900       if (jm > 0) {
00901         njcurr++;
00902 
00903         mcurr += jm;
00904         if (j->feedback)
00905           mfbcurr += jm;
00906       }
00907     }
00908     nj = njcurr; m = mcurr; mfb = mfbcurr;
00909   }
00910 
00911   size_t res = 0;
00912 
00913   res += dEFFICIENT_SIZE(sizeof(dReal) * 3 * 4 * nb); // for invI
00914   res += dEFFICIENT_SIZE(sizeof(dReal) * nb); // for invM
00915 
00916   {
00917     size_t sub1_res1 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * _nj); // for initial jointiinfos
00918     size_t sub1_res2 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * nj); // for shrunk jointiinfos
00919     if (m > 0) {
00920       sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 12 * m); // for J
00921       sub1_res2 += 4 * dEFFICIENT_SIZE(sizeof(dReal) * m); // for cfm, lo, hi, rhs
00922       sub1_res2 += dEFFICIENT_SIZE(sizeof(int) * 12 * m); // for jb            FIXME: shoulbe be 2 not 12?
00923       sub1_res2 += dEFFICIENT_SIZE(sizeof(int) * m); // for findex
00924       sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 12 * mfb); // for Jcopy
00925 
00926       {
00927         size_t sub2_res1 = dEFFICIENT_SIZE(sizeof(dReal) * m); // for c
00928         {
00929           size_t sub3_res1 = dEFFICIENT_SIZE(sizeof(dReal) * 6 * nb); // for tmp1
00930 
00931           size_t sub3_res2 = 0;
00932 
00933           sub2_res1 += (sub3_res1 >= sub3_res2) ? sub3_res1 : sub3_res2;
00934         }
00935 
00936         size_t sub2_res2 = dEFFICIENT_SIZE(sizeof(dReal) * m); // for lambda
00937         sub2_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 6 * nb); // for cforce
00938         {
00939           size_t sub3_res1 = EstimateParallelSOR_LCPMemoryRequirements(m,nj); // for SOR_LCP
00940           size_t sub3_res2 = 0;
00941           sub2_res2 += (sub3_res1 >= sub3_res2) ? sub3_res1 : sub3_res2;
00942         }
00943 
00944         sub1_res2 += (sub2_res1 >= sub2_res2) ? sub2_res1 : sub2_res2;
00945       }
00946     }
00947 
00948     res += 3 * ((sub1_res1 >= sub1_res2) ? sub1_res1 : sub1_res2);
00949   }
00950 
00951   return res;
00952 }
00953 
00954 static size_t BuildIslandAndEstimateStepperMemoryRequirements(dxWorldProcessContext *context,
00955   dxWorld *world, dReal stepsize, dmemestimate_fn_t stepperestimate)
00956 {
00957   const int sizeelements = 2;
00958   size_t maxreq = 0;
00959 
00960   // handle auto-disabling of bodies
00961   dInternalHandleAutoDisabling (world,stepsize);
00962 
00963   int nb = world->nb, nj = world->nj;
00964   // Make array for island body/joint counts
00965   int *islandsizes = context->AllocateArray<int>(sizeelements);
00966 
00967   // make arrays for body and joint lists (for a single island) to go into
00968   dxBody **body = context->AllocateArray<dxBody *>(nb);
00969   dxJoint **joint = context->AllocateArray<dxJoint *>(nj);
00970 
00971   BEGIN_STATE_SAVE(context, stackstate) {
00972     // allocate a stack of unvisited bodies in the island. the maximum size of
00973     // the stack can be the lesser of the number of bodies or joints, because
00974     // new bodies are only ever added to the stack by going through untagged
00975     // joints. all the bodies in the stack must be tagged!
00976 
00977     int stackalloc = (nj < nb) ? nj : nb;
00978     dxBody **stack = context->AllocateArray<dxBody *>(stackalloc);
00979 
00980     {
00981       // set all body/joint tags to 0
00982       for (dxBody *b=world->firstbody; b; b=(dxBody*)b->next) b->tag = 0;
00983       for (dxJoint *j=world->firstjoint; j; j=(dxJoint*)j->next) j->tag = 0;
00984     }
00985 
00986     int bcount = 0, jcount =0;
00987 
00988     dxBody **bodystart = body;
00989     dxJoint **jointstart = joint;
00990     for (dxBody *bb=world->firstbody; bb; bb=(dxBody*)bb->next) {
00991       // get bb = the next enabled, untagged body, and tag it
00992       if (!bb->tag) {
00993         if (!(bb->flags & dxBodyDisabled)) {
00994           bb->tag = 1;
00995 
00996           dxBody **bodycurr = bodystart;
00997           dxJoint **jointcurr = jointstart;
00998 
00999           // tag all bodies and joints starting from bb.
01000           *bodycurr++ = bb;
01001 
01002           int stacksize = 0;
01003           dxBody *b = bb;
01004 
01005           while (true) {
01006             // traverse and tag all body's joints, add untagged connected bodies
01007             // to stack
01008             for (dxJointNode *n=b->firstjoint; n; n=n->next) {
01009               dxJoint *njoint = n->joint;
01010               if (!njoint->tag) {
01011                 if (njoint->isEnabled()) {
01012                   njoint->tag = 1;
01013                   *jointcurr++ = njoint;
01014 
01015                   dxBody *nbody = n->body;
01016                   // Body disabled flag is not checked here. This is how auto-enable works.
01017                   if (nbody && nbody->tag <= 0) {
01018                     nbody->tag = 1;
01019                     // Make sure all bodies are in the enabled state.
01020                     nbody->flags &= ~dxBodyDisabled;
01021                     stack[stacksize++] = nbody;
01022                   }
01023                 } else {
01024                   njoint->tag = -1; // Used in Step to prevent search over disabled joints (not needed for QuickStep so far)
01025                 }
01026               }
01027             }
01028             dIASSERT(stacksize <= world->nb);
01029             dIASSERT(stacksize <= world->nj);
01030 
01031             if (stacksize == 0) {
01032               break;
01033             }
01034 
01035             b = stack[--stacksize];     // pop body off stack
01036             *bodycurr++ = b;    // put body on body list
01037           }
01038 
01039           bcount += (bodycurr - bodystart);
01040           jcount += (jointcurr - jointstart);
01041 
01042           bodystart = bodycurr;
01043           jointstart = jointcurr;
01044         } else {
01045           bb->tag = -1; // Not used so far (assigned to retain consistency with joints)
01046         }
01047       }
01048     }
01049 
01050     islandsizes[0] = bcount;
01051     islandsizes[1] = jcount;
01052 
01053     size_t islandreq = stepperestimate(body, bcount, joint, jcount);
01054     maxreq = islandreq;
01055 
01056   } END_STATE_SAVE(context, stackstate);
01057 
01058 # ifndef dNODEBUG
01059   // if debugging, check that all objects (except for disabled bodies,
01060   // unconnected joints, and joints that are connected to disabled bodies)
01061   // were tagged.
01062   {
01063     for (dxBody *b=world->firstbody; b; b=(dxBody*)b->next) {
01064       if (b->flags & dxBodyDisabled) {
01065         if (b->tag > 0) dDebug (0,"disabled body tagged");
01066       }
01067       else {
01068         if (b->tag <= 0) dDebug (0,"enabled body not tagged");
01069       }
01070     }
01071     for (dxJoint *j=world->firstjoint; j; j=(dxJoint*)j->next) {
01072       if ( (( j->node[0].body && (j->node[0].body->flags & dxBodyDisabled)==0 ) ||
01073         (j->node[1].body && (j->node[1].body->flags & dxBodyDisabled)==0) )
01074         &&
01075         j->isEnabled() ) {
01076           if (j->tag <= 0) dDebug (0,"attached enabled joint not tagged");
01077       }
01078       else {
01079         if (j->tag > 0) dDebug (0,"unattached or disabled joint tagged");
01080       }
01081     }
01082   }
01083 # endif
01084 
01085   size_t const *islandreqs = NULL;
01086   int islandcount = 1;
01087   context->SavePreallocations(islandcount, islandsizes, body, joint, islandreqs);
01088 
01089   return maxreq;
01090 }
01091 
01092 static size_t EstimateIslandsProcessingMemoryRequirements(dxWorld *world, size_t &sesize)
01093 {
01094   size_t res = 0;
01095 
01096   size_t islandcounts = dEFFICIENT_SIZE(world->nb * 2 * sizeof(int));
01097   res += islandcounts;
01098 
01099   size_t bodiessize = dEFFICIENT_SIZE(world->nb * sizeof(dxBody*));
01100   size_t jointssize = dEFFICIENT_SIZE(world->nj * sizeof(dxJoint*));
01101   res += bodiessize + jointssize;
01102 
01103   sesize = (bodiessize < jointssize) ? bodiessize : jointssize;
01104   return res;
01105 }
01106 
01107 static size_t AdjustArenaSizeForReserveRequirements(size_t arenareq, float rsrvfactor, unsigned rsrvminimum)
01108 {
01109   float scaledarena = arenareq * rsrvfactor;
01110   size_t adjustedarena = (scaledarena < SIZE_MAX) ? (size_t)scaledarena : SIZE_MAX;
01111   size_t boundedarena = (adjustedarena > rsrvminimum) ? adjustedarena : (size_t)rsrvminimum;
01112   return dEFFICIENT_SIZE(boundedarena);
01113 }
01114 
01115 static dxWorldProcessContext *InternalReallocateWorldProcessContext (
01116   dxWorldProcessContext *oldcontext, size_t memreq,
01117   const dxWorldProcessMemoryManager *memmgr, float rsrvfactor, unsigned rsrvminimum)
01118 {
01119   dxWorldProcessContext *context = oldcontext;
01120   bool allocsuccess = false;
01121 
01122   size_t oldarenasize;
01123   void *pOldArena;
01124 
01125   do {
01126     size_t oldmemsize = oldcontext ? oldcontext->GetMemorySize() : 0;
01127     if (!oldcontext || oldmemsize < memreq) {
01128       oldarenasize = oldcontext ? dxWorldProcessContext::MakeArenaSize(oldmemsize) : 0;
01129       pOldArena = oldcontext ? oldcontext->m_pArenaBegin : NULL;
01130 
01131       if (!dxWorldProcessContext::IsArenaPossible(memreq)) {
01132         break;
01133       }
01134 
01135       size_t arenareq = dxWorldProcessContext::MakeArenaSize(memreq);
01136       size_t arenareq_with_reserve = AdjustArenaSizeForReserveRequirements(arenareq, rsrvfactor, rsrvminimum);
01137       size_t memreq_with_reserve = memreq + (arenareq_with_reserve - arenareq);
01138 
01139       if (oldcontext) {
01140 
01141         if (oldcontext->m_pAllocCurrent != oldcontext->m_pAllocBegin) {
01142 
01143           // Save old efficient offset and meaningful data size for the case if
01144           // reallocation throws the block at different efficient offset
01145           size_t oldcontextofs = (size_t)oldcontext - (size_t)pOldArena;
01146           size_t datasize = (size_t)oldcontext->m_pAllocCurrent - (size_t)oldcontext;
01147 
01148           // Extra EFFICIENT_ALIGNMENT bytes might be needed after re-allocation with different alignment
01149           size_t shrunkarenasize = dEFFICIENT_SIZE(datasize + oldcontextofs) + EFFICIENT_ALIGNMENT;
01150           if (shrunkarenasize < oldarenasize) {
01151 
01152             void *pShrunkOldArena = oldcontext->m_pArenaMemMgr->m_fnShrink(pOldArena, oldarenasize, shrunkarenasize);
01153             if (!pShrunkOldArena) {
01154               break;
01155             }
01156 
01157             // In case if shrinking is not supported and memory manager had to allocate-copy-free
01158             if (pShrunkOldArena != pOldArena) {
01159               dxWorldProcessContext *shrunkcontext = (dxWorldProcessContext *)dEFFICIENT_PTR(pShrunkOldArena);
01160 
01161               // Preform data shift in case if efficient alignment of new block
01162               // does not match that of old block
01163               size_t shrunkcontextofs = (size_t)shrunkcontext - (size_t)pShrunkOldArena;
01164               size_t offsetdiff = oldcontextofs - shrunkcontextofs;
01165               if (offsetdiff != 0) {
01166                 memmove(shrunkcontext, (void *)((size_t)shrunkcontext + offsetdiff), datasize);
01167               }
01168 
01169               // Make sure allocation pointers are valid - that is necessary to
01170               // be able to calculate size and free old arena later
01171               size_t shrunkdatasize = dxWorldProcessContext::MakeBufferSize(shrunkarenasize);
01172               void *blockbegin = dEFFICIENT_PTR(shrunkcontext + 1);
01173               void *blockend = dOFFSET_EFFICIENTLY(blockbegin, shrunkdatasize);
01174               shrunkcontext->m_pAllocBegin = blockbegin;
01175               shrunkcontext->m_pAllocEnd = blockend;
01176               shrunkcontext->m_pAllocCurrent = blockend; // -- set to end to prevent possibility of further allocation
01177               shrunkcontext->m_pArenaBegin = pShrunkOldArena;
01178 
01179               size_t stOffset = ((size_t)pShrunkOldArena - (size_t)pOldArena) - offsetdiff;
01180               shrunkcontext->OffsetPreallocations(stOffset);
01181 
01182               oldcontext = shrunkcontext;
01183 
01184               // Reassign to old arena variables for potential freeing at exit
01185               pOldArena = pShrunkOldArena;
01186             }
01187 
01188             // Reassign to old arena variables for potential freeing at exit
01189             oldarenasize = shrunkarenasize;
01190           }
01191 
01192         } else {
01193           oldcontext->m_pArenaMemMgr->m_fnFree(pOldArena, oldarenasize);
01194           oldcontext = NULL;
01195 
01196           // Zero variables to avoid another freeing on exit
01197           pOldArena = NULL;
01198           oldarenasize = 0;
01199         }
01200       }
01201 
01202       // Allocate new arena
01203       void *pNewArena = memmgr->m_fnAlloc(arenareq_with_reserve);
01204       if (!pNewArena) {
01205         break;
01206       }
01207 
01208       context = (dxWorldProcessContext *)dEFFICIENT_PTR(pNewArena);
01209 
01210       void *blockbegin = dEFFICIENT_PTR(context + 1);
01211       void *blockend = dOFFSET_EFFICIENTLY(blockbegin, memreq_with_reserve);
01212 
01213       context->m_pAllocBegin = blockbegin;
01214       context->m_pAllocEnd = blockend;
01215       context->m_pArenaBegin = pNewArena;
01216       context->m_pAllocCurrent = blockbegin;
01217 
01218       if (oldcontext) {
01219         context->CopyPreallocations(oldcontext);
01220       } else {
01221         context->ClearPreallocations();
01222       }
01223 
01224       context->m_pArenaMemMgr = memmgr;
01225       context->m_pPreallocationcContext = oldcontext;
01226     }
01227 
01228     allocsuccess = true;
01229   } while (false);
01230 
01231   if (!allocsuccess) {
01232     if (pOldArena) {
01233       dIASSERT(oldcontext);
01234       oldcontext->m_pArenaMemMgr->m_fnFree(pOldArena, oldarenasize);
01235     }
01236     context = NULL;
01237   }
01238 
01239   return context;
01240 }
01241 
01242 bool dxReallocateParallelWorldProcessContext (dxWorld *world, dReal stepsize, dmemestimate_fn_t stepperestimate)
01243 {
01244   dxStepWorkingMemory *wmem = AllocateOnDemand(world->wmem);
01245   if (!wmem) return false;
01246 
01247   dxWorldProcessContext *oldcontext = wmem->GetWorldProcessingContext();
01248   dIASSERT (!oldcontext || oldcontext->IsStructureValid());
01249 
01250   const dxWorldProcessMemoryReserveInfo *reserveinfo = wmem->SureGetMemoryReserveInfo();
01251   const dxWorldProcessMemoryManager *memmgr = wmem->SureGetMemoryManager();
01252 
01253   dxWorldProcessContext *context = oldcontext;
01254 
01255   size_t sesize;
01256   size_t islandsreq = EstimateIslandsProcessingMemoryRequirements(world, sesize);
01257   dIASSERT(islandsreq == dEFFICIENT_SIZE(islandsreq));
01258   dIASSERT(sesize == dEFFICIENT_SIZE(sesize));
01259 
01260   size_t stepperestimatereq = islandsreq + sesize;
01261   context = InternalReallocateWorldProcessContext(context, stepperestimatereq, memmgr, 1.0f, reserveinfo->m_uiReserveMinimum);
01262 
01263   if (context)
01264   {
01265     size_t stepperreq = BuildIslandAndEstimateStepperMemoryRequirements(context, world, stepsize, stepperestimate);
01266     dIASSERT(stepperreq == dEFFICIENT_SIZE(stepperreq));
01267 
01268     size_t memreq = stepperreq + islandsreq;
01269     context = InternalReallocateWorldProcessContext(context, memreq, memmgr, reserveinfo->m_fReserveFactor, reserveinfo->m_uiReserveMinimum);
01270   }
01271 
01272   wmem->SetWorldProcessingContext(context);
01273   return context != NULL;
01274 }
01275 
01276
parallel_stepper.cpp