00001 #include <ode/objects.h>
00002 #include <ode/ode.h>
00003 #include <ode/odemath.h>
00004 #include <ode/rotation.h>
00005 #include <ode/timer.h>
00006 #include <ode/error.h>
00007 #include <ode/matrix.h>
00008 #include <ode/misc.h>
00009 #include "objects.h"
00010 #include "config.h"
00011 #include "joints/joint.h"
00012 #include "lcp.h"
00013 #include "util.h"
00014
00015 #include <parallel_common.h>
00016 #include <parallel_utils.h>
00017 #include <parallel_stepper.h>
00018
00019 #if defined(USE_CUDA)
00020 #include <cuda_solver.h>
00021 #ifdef CUDA_DOUBLESUPPORT
00022 typedef parallel_ode::CudaPGSSolver<dReal,dReal> SolverType;
00023 #else
00024 typedef parallel_ode::CudaPGSSolver<float,dReal> SolverType;
00025 #endif
00026 #elif defined(USE_OPENCL)
00027 #include <opencl_solver.h>
00028 typedef parallel_ode::OpenCLPGSSolver<dReal> SolverType;
00029 #elif defined(USE_OPENMP)
00030 #include <openmp_solver.h>
00031 typedef parallel_ode::OpenMPPGSSolver<dReal> SolverType;
00032 #endif
00033
00034 #if PARALLEL_ENABLED
00035 static SolverType parallelSolver;
00036 #endif
00037
00038 typedef const dReal *dRealPtr;
00039 typedef dReal *dRealMutablePtr;
00040
00041 template
00042 void compute_invM_JT<dReal> (int m, dRealPtr J, dRealMutablePtr iMJ, int *jb,
00043 dxBody * const *body, const dRealPtr invI);
00044
00045 template
00046 void compute_Adcfm_b<dReal> (int m, dReal sor_w, dRealMutablePtr J, dRealPtr iMJ, int* jb, dRealPtr cfm,
00047 dRealMutablePtr Adcfm, dRealMutablePtr b);
00048
00049 #define RANDOMLY_REORDER_CONSTRAINTS 1
00050
00051 #ifdef BENCHMARKING
00052 static unsigned long totalConstraints = 0;
00053 static unsigned long totalBodies = 0;
00054 static const unsigned long maxBenchmarkIters = 300;
00055 static const unsigned long minBenchmarkIters = 50;
00056 static unsigned long benchmarkRoundsRemaining = 5;
00057 static unsigned long benchmarkIters = 0;
00058 static int sorItersMult = 2;
00059 static dReal sorParamMult = .85;
00060
00061 static void benchmarkIteration( dxWorld *world) {
00062 ++benchmarkIters;
00063
00064 if (benchmarkIters >= maxBenchmarkIters) {
00065 printf("Iterations = %d, Avg Constraints = %f, Avg Bodies = %f\n",
00066 dWorldGetQuickStepNumIterations( world ),
00067 (dReal)totalConstraints / (dReal)(maxBenchmarkIters - minBenchmarkIters),
00068 (dReal)totalBodies / (dReal)(maxBenchmarkIters - minBenchmarkIters) );
00069
00070 dTimerReport (stdout,1);
00071
00072 --benchmarkRoundsRemaining;
00073 if( benchmarkRoundsRemaining == 0 ) exit(0);
00074
00075 totalConstraints=0;
00076 totalBodies=0;
00077 benchmarkIters=0;
00078 dWorldSetQuickStepNumIterations( world, (int)(dWorldGetQuickStepNumIterations( world ) * sorItersMult) );
00079 dWorldSetQuickStepW( world, (dReal)(dWorldGetQuickStepW( world ) * sorParamMult) );
00080 }
00081 }
00082 #endif
00083
00084 struct IndexError {
00085 int index;
00086 };
00087
00088 template<typename T>
00089 void compute_invM_JT (int m, const T* J, T* iMJ, int *jb,
00090 dxBody * const *body, const T* invI)
00091 {
00092 IFTIMING( dTimerNow("compute invM/JT ") );
00093
00094
00095 dRealMutablePtr iMJ_ptr = iMJ;
00096 dRealPtr J_ptr = J;
00097 for (int i=0; i<m; J_ptr += 12, iMJ_ptr += 12, i++) {
00098 int b1 = jb[i*2];
00099 int b2 = jb[i*2+1];
00100 dReal k1 = body[b1]->invMass;
00101 for (int j=0; j<3; j++) iMJ_ptr[j] = k1*J_ptr[j];
00102 const dReal *invIrow1 = invI + 12*b1;
00103 dMultiply0_331 (iMJ_ptr + 3, invIrow1, J_ptr + 3);
00104 if (b2 >= 0) {
00105 dReal k2 = body[b2]->invMass;
00106 for (int j=0; j<3; j++) iMJ_ptr[j+6] = k2*J_ptr[j+6];
00107 const dReal *invIrow2 = invI + 12*b2;
00108 dMultiply0_331 (iMJ_ptr + 9, invIrow2, J_ptr + 9);
00109 }
00110 }
00111 }
00112
00113 template<typename T>
00114 void compute_Adcfm_b (int m, T sor_w, T* J, const T* iMJ, int* jb, const T* cfm,
00115 T* Adcfm, T* b)
00116 {
00117 IFTIMING( dTimerNow("compute Adcfm/b") );
00118 {
00119
00120 dRealPtr iMJ_ptr = iMJ;
00121 dRealPtr J_ptr = J;
00122 for (int i=0; i<m; J_ptr += 12, iMJ_ptr += 12, i++) {
00123 dReal sum = 0;
00124 for (int j=0; j<6; j++) sum += iMJ_ptr[j] * J_ptr[j];
00125 if (jb[i*2+1] >= 0) {
00126 for (int k=6; k<12; k++) sum += iMJ_ptr[k] * J_ptr[k];
00127 }
00128 Adcfm[i] = sor_w / (sum + cfm[i]);
00129 }
00130 }
00131
00132 {
00133
00134 dRealMutablePtr J_ptr = J;
00135 for (int i=0; i<m; J_ptr += 12, i++) {
00136 dReal Ad_i = Adcfm[i];
00137 for (int j=0; j<12; j++) {
00138 J_ptr[j] *= Ad_i;
00139 }
00140 b[i] *= Ad_i;
00141
00142 Adcfm[i] = Ad_i * cfm[i];
00143 }
00144 }
00145 }
00146
00147 static void worldSolve(dxWorldProcessContext *context,
00148 const int m, const int nb, dRealMutablePtr J, int *jb, dxBody * const *body,
00149 dRealPtr invI, dRealMutablePtr lambda, dRealMutablePtr fc, dRealMutablePtr b,
00150 dRealPtr lo, dRealPtr hi, dRealPtr cfm, dRealMutablePtr iMJ, dRealMutablePtr Ad, const int *findex,
00151 const dxQuickStepParameters *qs)
00152 {
00153 IFVERBOSE( printf ("(# Bodies, #Constraints) = (%d, %d )\n",nb,m) );
00154
00155 compute_invM_JT (m,J,iMJ,jb,body,invI);
00156 dSetZero (lambda,m);
00157 compute_Adcfm_b (m, qs->w, J, iMJ, jb, cfm, Ad, b);
00158 dSetZero (fc,nb*6);
00159
00160 IFTIMING( dTimerNow ("solving LCP problem") );
00161
00162
00163 IndexError *order = context->AllocateArray<IndexError> (m);
00164
00165 {
00166
00167 IndexError *orderhead = order, *ordertail = order + (m - 1);
00168
00169
00170 for (int i=0; i<m; i++) {
00171 if (findex[i] < 0) {
00172 orderhead->index = i;
00173 ++orderhead;
00174 } else {
00175 ordertail->index = i;
00176 --ordertail;
00177 }
00178 }
00179 dIASSERT (orderhead-ordertail==1);
00180 }
00181
00182 #if COMPUTE_ERROR
00183 dReal *delta_error = context->AllocateArray<dReal> (m);
00184 #endif
00185
00186 const int num_iterations = qs->num_iterations;
00187 for (int iteration=0; iteration < num_iterations; iteration++) {
00188
00189 #ifdef RANDOMLY_REORDER_CONSTRAINTS
00190 if ((iteration & 7) == 0) {
00191 for (int i=1; i<m; i++) {
00192 int swapi = dRandInt(i+1);
00193 IndexError tmp = order[i];
00194 order[i] = order[swapi];
00195 order[swapi] = tmp;
00196 }
00197 }
00198 #endif
00199
00200 #if COMPUTE_ERROR
00201 dReal rms_error = 0;
00202 #endif
00203
00204 for (int i=0; i<m; i++) {
00205
00206
00207
00208
00209
00210 int index = order[i].index;
00211
00212 dRealMutablePtr fc_ptr1;
00213 dRealMutablePtr fc_ptr2;
00214 dReal delta;
00215
00216 {
00217 int b1 = jb[index*2];
00218 int b2 = jb[index*2+1];
00219 fc_ptr1 = fc + 6*b1;
00220 fc_ptr2 = (b2 >= 0) ? fc + 6*b2 : NULL;
00221 }
00222
00223 dReal old_lambda = lambda[index];
00224
00225 {
00226 delta = b[index] - old_lambda*Ad[index];
00227
00228 dRealPtr J_ptr = J + index*12;
00229
00230 delta -=fc_ptr1[0] * J_ptr[0] + fc_ptr1[1] * J_ptr[1] +
00231 fc_ptr1[2] * J_ptr[2] + fc_ptr1[3] * J_ptr[3] +
00232 fc_ptr1[4] * J_ptr[4] + fc_ptr1[5] * J_ptr[5];
00233
00234
00235 if (fc_ptr2) {
00236 delta -=fc_ptr2[0] * J_ptr[6] + fc_ptr2[1] * J_ptr[7] +
00237 fc_ptr2[2] * J_ptr[8] + fc_ptr2[3] * J_ptr[9] +
00238 fc_ptr2[4] * J_ptr[10] + fc_ptr2[5] * J_ptr[11];
00239 }
00240 }
00241
00242
00243 {
00244 dReal hi_act, lo_act;
00245
00246
00247
00248
00249
00250
00251
00252
00253 if (findex[index] >= 0) {
00254 hi_act = dFabs (hi[index] * lambda[findex[index]]);
00255 lo_act = -hi_act;
00256 } else {
00257 hi_act = hi[index];
00258 lo_act = lo[index];
00259 }
00260
00261
00262
00263
00264 dReal new_lambda = old_lambda + delta;
00265 if (new_lambda < lo_act) {
00266 delta = lo_act-old_lambda;
00267 lambda[index] = lo_act;
00268 }
00269 else if (new_lambda > hi_act) {
00270 delta = hi_act-old_lambda;
00271 lambda[index] = hi_act;
00272 }
00273 else {
00274 lambda[index] = new_lambda;
00275 }
00276 }
00277
00278 #if COMPUTE_ERROR
00279 rms_error += delta*delta;
00280 delta_error[index] = dFabs(delta);
00281 #endif
00282
00283
00284
00285
00286
00287 {
00288 dRealPtr iMJ_ptr = iMJ + index*12;
00289
00290
00291 fc_ptr1[0] += delta * iMJ_ptr[0];
00292 fc_ptr1[1] += delta * iMJ_ptr[1];
00293 fc_ptr1[2] += delta * iMJ_ptr[2];
00294 fc_ptr1[3] += delta * iMJ_ptr[3];
00295 fc_ptr1[4] += delta * iMJ_ptr[4];
00296 fc_ptr1[5] += delta * iMJ_ptr[5];
00297
00298
00299 if (fc_ptr2) {
00300 fc_ptr2[0] += delta * iMJ_ptr[6];
00301 fc_ptr2[1] += delta * iMJ_ptr[7];
00302 fc_ptr2[2] += delta * iMJ_ptr[8];
00303 fc_ptr2[3] += delta * iMJ_ptr[9];
00304 fc_ptr2[4] += delta * iMJ_ptr[10];
00305 fc_ptr2[5] += delta * iMJ_ptr[11];
00306 }
00307 }
00308 }
00309 #if COMPUTE_ERROR
00310 if ( iteration == num_iterations - 1 ) {
00311
00312
00313 #ifdef RECOMPUTE_RMS
00314
00315 rms_error = 0;
00316 #ifdef USE_1NORM
00317 for (int i=0; i<m; i++)
00318 {
00319 rms_error = dFabs(delta_error[order[i].index]) > rms_error ? dFabs(delta_error[order[i].index]) : rms_error;
00320 }
00321 #else // use 2 norm
00322 for (int i=0; i<m; i++)
00323 rms_error += delta_error[order[i].index]*delta_error[order[i].index];
00324 rms_error = sqrt(rms_error);
00325 #endif
00326 #else
00327 rms_error = sqrt(rms_error);
00328 #endif
00329 printf("(Iteration, Error) = (%d, %20.18f)\n",iteration,rms_error);
00330 }
00331 #endif
00332 }
00333 }
00334
00335 void dxParallelProcessIslands (dxWorld *world, dReal stepsize, dstepper_fn_t stepper)
00336 {
00337 const int sizeelements = 2;
00338
00339 dxStepWorkingMemory *wmem = world->wmem;
00340 dIASSERT(wmem != NULL);
00341
00342 dxWorldProcessContext *context = wmem->GetWorldProcessingContext();
00343
00344 size_t const *islandreqs = NULL;
00345 int islandcount;
00346 int const *islandsizes;
00347 dxBody *const *body;
00348 dxJoint *const *joint;
00349 context->RetrievePreallocations(islandcount, islandsizes, body, joint, islandreqs);
00350
00351 dxBody *const *bodystart = body;
00352 dxJoint *const *jointstart = joint;
00353
00354 int const *const sizesend = islandsizes + islandcount * sizeelements;
00355
00356 int bcount = 0;
00357 int jcount = 0;
00358
00359 for (int const *sizescurr = islandsizes; sizescurr != sizesend; sizescurr += sizeelements) {
00360 bcount += sizescurr[0];
00361 jcount += sizescurr[1];
00362 }
00363
00364 BEGIN_STATE_SAVE(context, stepperstate) {
00365 stepper (context,world,bodystart,bcount,jointstart,jcount,stepsize);
00366 } END_STATE_SAVE(context, stepperstate);
00367
00368 context->CleanupContext();
00369 dIASSERT(context->IsStructureValid());
00370 }
00371
00372
00373
00374
00375 #define RANDOMLY_REORDER_CONSTRAINTS 1
00376
00377
00378
00379
00380
00381 static void Multiply1_12q1 (dReal *A, const dReal *B, const dReal *C, int q)
00382 {
00383 dIASSERT (q>0 && A && B && C);
00384
00385 dReal a = 0;
00386 dReal b = 0;
00387 dReal c = 0;
00388 dReal d = 0;
00389 dReal e = 0;
00390 dReal f = 0;
00391 dReal s;
00392
00393 for(int i=0, k = 0; i<q; k += 12, i++)
00394 {
00395 s = C[i];
00396
00397
00398 a += B[ k] * s;
00399 b += B[1+k] * s;
00400 c += B[2+k] * s;
00401 d += B[3+k] * s;
00402 e += B[4+k] * s;
00403 f += B[5+k] * s;
00404 }
00405
00406 A[0] = a;
00407 A[1] = b;
00408 A[2] = c;
00409 A[3] = d;
00410 A[4] = e;
00411 A[5] = f;
00412 }
00413
00414
00415
00416
00417
00418
00419 static void multiply_J (int m, dRealPtr J, int *jb,
00420 dRealPtr in, dRealMutablePtr out)
00421 {
00422 dRealPtr J_ptr = J;
00423 for (int i=0; i<m; i++) {
00424 int b1 = jb[i*2];
00425 int b2 = jb[i*2+1];
00426 dReal sum = 0;
00427 dRealPtr in_ptr = in + b1*6;
00428 for (int j=0; j<6; j++) sum += J_ptr[j] * in_ptr[j];
00429 J_ptr += 6;
00430 if (b2 >= 0) {
00431 in_ptr = in + b2*6;
00432 for (int j=0; j<6; j++) sum += J_ptr[j] * in_ptr[j];
00433 }
00434 J_ptr += 6;
00435 out[i] = sum;
00436 }
00437 }
00438
00439 struct dJointWithInfo1
00440 {
00441 dxJoint *joint;
00442 dxJoint::Info1 info;
00443 };
00444
00445 void dxParallelQuickStepper( dxWorldProcessContext *context,
00446 dxWorld *world, dxBody * const *body, int nb,
00447 dxJoint * const *_joint, int _nj, dReal stepsize)
00448 {
00449
00450 IFTIMING(dTimerStart("preprocessing"));
00451 const dReal stepsize1 = dRecip(stepsize);
00452
00453 {
00454
00455 for (int i=0; i<nb; i++) body[i]->tag = i;
00456 }
00457
00458
00459
00460
00461 dReal *invI = context->AllocateArray<dReal> (3*4*nb);
00462
00463 {
00464 dReal *invIrow = invI;
00465 dxBody *const *const bodyend = body + nb;
00466 for (dxBody *const *bodycurr = body; bodycurr != bodyend; invIrow += 12, bodycurr++) {
00467 dMatrix3 tmp;
00468 dxBody *b = *bodycurr;
00469
00470
00471 dMultiply2_333 (tmp,b->invI,b->posr.R);
00472 dMultiply0_333 (invIrow,b->posr.R,tmp);
00473
00474 if (b->flags & dxBodyGyroscopic) {
00475 dMatrix3 I;
00476
00477 dMultiply2_333 (tmp,b->mass.I,b->posr.R);
00478 dMultiply0_333 (I,b->posr.R,tmp);
00479
00480 dMultiply0_331 (tmp,I,b->avel);
00481 dSubtractVectorCross3(b->tacc,b->avel,tmp);
00482 }
00483 }
00484 }
00485
00486
00487 dReal *invM = context->AllocateArray<dReal> (nb);
00488 {
00489 dReal *invMrow = invM;
00490 dxBody *const *const bodyend = body + nb;
00491 for (dxBody *const *bodycurr = body; bodycurr != bodyend; invMrow++, bodycurr++) {
00492 dxBody *b = *bodycurr;
00493
00494 *invMrow = b->invMass;
00495
00496 }
00497 }
00498
00499 {
00500
00501
00502
00503 dxBody *const *const bodyend = body + nb;
00504 dReal gravity_x = world->gravity[0];
00505 if (gravity_x) {
00506 for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
00507 dxBody *b = *bodycurr;
00508 if ((b->flags & dxBodyNoGravity)==0) {
00509 b->facc[0] += b->mass.mass * gravity_x;
00510 }
00511 }
00512 }
00513 dReal gravity_y = world->gravity[1];
00514 if (gravity_y) {
00515 for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
00516 dxBody *b = *bodycurr;
00517 if ((b->flags & dxBodyNoGravity)==0) {
00518 b->facc[1] += b->mass.mass * gravity_y;
00519 }
00520 }
00521 }
00522 dReal gravity_z = world->gravity[2];
00523 if (gravity_z) {
00524 for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
00525 dxBody *b = *bodycurr;
00526 if ((b->flags & dxBodyNoGravity)==0) {
00527 b->facc[2] += b->mass.mass * gravity_z;
00528 }
00529 }
00530 }
00531 }
00532
00533
00534
00535
00536 dJointWithInfo1 *const jointiinfos = context->AllocateArray<dJointWithInfo1> (_nj);
00537 int nj;
00538
00539 {
00540 dJointWithInfo1 *jicurr = jointiinfos;
00541 dxJoint *const *const _jend = _joint + _nj;
00542 for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; _jcurr++) {
00543 dxJoint *j = *_jcurr;
00544 j->getInfo1 (&jicurr->info);
00545 dIASSERT (jicurr->info.m >= 0 && jicurr->info.m <= 6 && jicurr->info.nub >= 0 && jicurr->info.nub <= jicurr->info.m);
00546 if (jicurr->info.m > 0) {
00547 jicurr->joint = j;
00548 jicurr++;
00549 }
00550 }
00551 nj = jicurr - jointiinfos;
00552 }
00553
00554 context->ShrinkArray<dJointWithInfo1>(jointiinfos, _nj, nj);
00555
00556 int m;
00557 int mfb;
00558
00559 {
00560 int mcurr = 0, mfbcurr = 0;
00561 const dJointWithInfo1 *jicurr = jointiinfos;
00562 const dJointWithInfo1 *const jiend = jicurr + nj;
00563 for (; jicurr != jiend; jicurr++) {
00564 int jm = jicurr->info.m;
00565 mcurr += jm;
00566 if (jicurr->joint->feedback)
00567 mfbcurr += jm;
00568 }
00569
00570 m = mcurr;
00571 mfb = mfbcurr;
00572 }
00573
00574
00575 dReal *J = NULL;
00576 int *jb = NULL;
00577
00578 if (m > 0) {
00579 dReal *cfm, *lo, *hi, *rhs, *Jcopy;
00580 int *findex;
00581
00582 {
00583 int mlocal = m;
00584
00585 const unsigned jelements = mlocal*12;
00586 J = context->AllocateArray<dReal> (jelements);
00587 dSetZero (J,jelements);
00588
00589
00590
00591
00592 cfm = context->AllocateArray<dReal> (mlocal);
00593 dSetValue (cfm,mlocal,world->global_cfm);
00594
00595 lo = context->AllocateArray<dReal> (mlocal);
00596 dSetValue (lo,mlocal,-dInfinity);
00597
00598 hi = context->AllocateArray<dReal> (mlocal);
00599 dSetValue (hi,mlocal,dInfinity);
00600
00601 findex = context->AllocateArray<int> (mlocal);
00602 for (int i=0; i<mlocal; i++) findex[i] = -1;
00603
00604 const unsigned jbelements = mlocal*2;
00605 jb = context->AllocateArray<int> (jbelements);
00606
00607 rhs = context->AllocateArray<dReal> (mlocal);
00608
00609 Jcopy = context->AllocateArray<dReal> (mfb*12);
00610
00611 }
00612
00613 BEGIN_STATE_SAVE(context, cstate) {
00614 dReal *c = context->AllocateArray<dReal> (m);
00615 dSetZero (c, m);
00616
00617 {
00618 IFTIMING (dTimerNow ("create J"));
00619
00620
00621
00622
00623
00624
00625
00626
00627
00628
00629
00630
00631
00632 dxJoint::Info2 Jinfo;
00633 Jinfo.rowskip = 12;
00634 Jinfo.fps = stepsize1;
00635 Jinfo.erp = world->global_erp;
00636
00637 dReal *Jcopyrow = Jcopy;
00638 unsigned ofsi = 0;
00639 const dJointWithInfo1 *jicurr = jointiinfos;
00640 const dJointWithInfo1 *const jiend = jicurr + nj;
00641 for (; jicurr != jiend; jicurr++) {
00642 dReal *const Jrow = J + ofsi * 12;
00643 Jinfo.J1l = Jrow;
00644 Jinfo.J1a = Jrow + 3;
00645 Jinfo.J2l = Jrow + 6;
00646 Jinfo.J2a = Jrow + 9;
00647 Jinfo.c = c + ofsi;
00648 Jinfo.cfm = cfm + ofsi;
00649 Jinfo.lo = lo + ofsi;
00650 Jinfo.hi = hi + ofsi;
00651 Jinfo.findex = findex + ofsi;
00652
00653
00654
00655 dxJoint *joint = jicurr->joint;
00656 joint->getInfo2 (&Jinfo);
00657 const int infom = jicurr->info.m;
00658
00659
00660
00661
00662
00663 if (joint->feedback) {
00664 const int rowels = infom * 12;
00665 memcpy(Jcopyrow, Jrow, rowels * sizeof(dReal));
00666 Jcopyrow += rowels;
00667 }
00668
00669
00670 int *findex_ofsi = findex + ofsi;
00671 for (int j=0; j<infom; j++) {
00672 int fival = findex_ofsi[j];
00673 if (fival >= 0)
00674 findex_ofsi[j] = fival + ofsi;
00675 }
00676
00677 ofsi += infom;
00678 }
00679 }
00680
00681 {
00682
00683 int *jb_ptr = jb;
00684 const dJointWithInfo1 *jicurr = jointiinfos;
00685 const dJointWithInfo1 *const jiend = jicurr + nj;
00686 for (; jicurr != jiend; jicurr++) {
00687 dxJoint *joint = jicurr->joint;
00688 const int infom = jicurr->info.m;
00689
00690 int b1 = (joint->node[0].body) ? (joint->node[0].body->tag) : -1;
00691 int b2 = (joint->node[1].body) ? (joint->node[1].body->tag) : -1;
00692 for (int j=0; j<infom; j++) {
00693 jb_ptr[0] = b1;
00694 jb_ptr[1] = b2;
00695 jb_ptr += 2;
00696 }
00697 }
00698 dIASSERT (jb_ptr == jb+2*m);
00699 }
00700
00701 BEGIN_STATE_SAVE(context, tmp1state) {
00702 IFTIMING (dTimerNow ("compute rhs"));
00703
00704 dReal *tmp1 = context->AllocateArray<dReal> (nb*6);
00705
00706 dReal *tmp1curr = tmp1;
00707 const dReal *invIrow = invI;
00708 dxBody *const *const bodyend = body + nb;
00709 for (dxBody *const *bodycurr = body; bodycurr != bodyend; tmp1curr+=6, invIrow+=12, bodycurr++) {
00710 dxBody *b = *bodycurr;
00711 dReal body_invMass = b->invMass;
00712 for (int j=0; j<3; j++) tmp1curr[j] = b->facc[j] * body_invMass + b->lvel[j] * stepsize1;
00713 dMultiply0_331 (tmp1curr + 3,invIrow,b->tacc);
00714 for (int k=0; k<3; k++) tmp1curr[3+k] += b->avel[k] * stepsize1;
00715 }
00716
00717
00718 multiply_J (m,J,jb,tmp1,rhs);
00719 } END_STATE_SAVE(context, tmp1state);
00720
00721
00722 for (int i=0; i<m; i++) rhs[i] = c[i]*stepsize1 - rhs[i];
00723
00724
00725 for (int j=0; j<m; j++) cfm[j] *= stepsize1;
00726
00727 } END_STATE_SAVE(context, cstate);
00728
00729
00730
00731
00732
00733
00734
00735
00736
00737 dReal *lambda = context->AllocateArray<dReal> (m);
00738
00739 dReal *cforce = context->AllocateArray<dReal> (nb*6);
00740
00741 dReal *iMJ = context->AllocateArray<dReal> (m*12);
00742 dReal *Ad = context->AllocateArray<dReal> (m);
00743
00744 BEGIN_STATE_SAVE(context, lcpstate) {
00745
00746 IFBENCHMARKING( if( benchmarkIters > minBenchmarkIters ) totalConstraints+=m; totalBodies+=nb; dTimerStart(" LCP Solve - Parallel "););
00747
00748 #if PARALLEL_ENABLED
00749
00750 SolverType::SolverParams params(context,&world->qs,m,nb,J,jb,body,invI,lambda,cforce,rhs,lo,hi,cfm,iMJ,Ad,findex,stepsize);
00751 parallelSolver.worldSolve( ¶ms );
00752 #else
00753 worldSolve(context,m,nb,J,jb,body,invI,lambda,cforce,rhs,lo,hi,cfm,iMJ,Ad,findex,&world->qs);
00754 #endif
00755
00756 IFBENCHMARKING (dTimerEnd());
00757
00758 } END_STATE_SAVE(context, lcpstate);
00759
00760
00761
00762 {
00763 IFTIMING (dTimerNow ("velocity update due to constraint forces"));
00764
00765
00766
00767
00768 const dReal *cforcecurr = cforce;
00769 dxBody *const *const bodyend = body + nb;
00770 for (dxBody *const *bodycurr = body; bodycurr != bodyend; cforcecurr+=6, bodycurr++) {
00771 dxBody *b = *bodycurr;
00772 for (int j=0; j<3; j++) {
00773 b->lvel[j] += stepsize * cforcecurr[j];
00774 b->avel[j] += stepsize * cforcecurr[3+j];
00775 }
00776 }
00777 }
00778
00779 if (mfb > 0) {
00780
00781
00782
00783 dReal data[6];
00784 const dReal *lambdacurr = lambda;
00785 const dReal *Jcopyrow = Jcopy;
00786 const dJointWithInfo1 *jicurr = jointiinfos;
00787 const dJointWithInfo1 *const jiend = jicurr + nj;
00788 for (; jicurr != jiend; jicurr++) {
00789 dxJoint *joint = jicurr->joint;
00790 const int infom = jicurr->info.m;
00791
00792 if (joint->feedback) {
00793 dJointFeedback *fb = joint->feedback;
00794 Multiply1_12q1 (data, Jcopyrow, lambdacurr, infom);
00795 fb->f1[0] = data[0];
00796 fb->f1[1] = data[1];
00797 fb->f1[2] = data[2];
00798 fb->t1[0] = data[3];
00799 fb->t1[1] = data[4];
00800 fb->t1[2] = data[5];
00801
00802 if (joint->node[1].body)
00803 {
00804 Multiply1_12q1 (data, Jcopyrow+6, lambdacurr, infom);
00805 fb->f2[0] = data[0];
00806 fb->f2[1] = data[1];
00807 fb->f2[2] = data[2];
00808 fb->t2[0] = data[3];
00809 fb->t2[1] = data[4];
00810 fb->t2[2] = data[5];
00811 }
00812
00813 Jcopyrow += infom * 12;
00814 }
00815
00816 lambdacurr += infom;
00817 }
00818 }
00819 }
00820
00821 {
00822 IFTIMING (dTimerNow ("compute velocity update"));
00823
00824
00825 const dReal *invIrow = invI;
00826 dxBody *const *const bodyend = body + nb;
00827 for (dxBody *const *bodycurr = body; bodycurr != bodyend; invIrow += 12, bodycurr++) {
00828 dxBody *b = *bodycurr;
00829 dReal body_invMass_mul_stepsize = stepsize * b->invMass;
00830 for (int j=0; j<3; j++) {
00831 b->lvel[j] += body_invMass_mul_stepsize * b->facc[j];
00832 b->tacc[j] *= stepsize;
00833 }
00834 dMultiplyAdd0_331 (b->avel, invIrow, b->tacc);
00835 }
00836 }
00837
00838 {
00839
00840
00841 IFTIMING (dTimerNow ("update position"));
00842 dxBody *const *const bodyend = body + nb;
00843 for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
00844 dxBody *b = *bodycurr;
00845 dxStepBody (b,stepsize);
00846 }
00847 }
00848
00849 {
00850 IFTIMING (dTimerNow ("tidy up"));
00851
00852 dxBody *const *const bodyend = body + nb;
00853 for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) {
00854 dxBody *b = *bodycurr;
00855 dSetZero (b->facc,3);
00856 dSetZero (b->tacc,3);
00857 }
00858 }
00859
00860 IFTIMING (dTimerEnd());
00861 IFTIMING (if (m > 0) dTimerReport (stdout,1));
00862 IFBENCHMARKING ( benchmarkIteration( world ) );
00863 }
00864
00865 static size_t EstimateParallelSOR_LCPMemoryRequirements(int m, int n)
00866 {
00867 size_t res = dEFFICIENT_SIZE(sizeof(dReal) * 4 * m) * 4;
00868 res += dEFFICIENT_SIZE(sizeof(dReal) * 4 * m) * 4;
00869 res += dEFFICIENT_SIZE(sizeof(dReal) * m);
00870 res += dEFFICIENT_SIZE(sizeof(dReal) * m);
00871 res += dEFFICIENT_SIZE(sizeof(dReal) * m) * 2;
00872 res += dEFFICIENT_SIZE(sizeof(int) * m * 4);
00873 res += dEFFICIENT_SIZE(sizeof(int) * m);
00874 res += dEFFICIENT_SIZE(sizeof(dReal) * m);
00875
00876 res += dEFFICIENT_SIZE(sizeof(dReal) * m);
00877 res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4) * 3;
00878 res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4);
00879 res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4);
00880 res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4) * (size_t)ParallelOptions::MAXBODYREPETITION;
00881 res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4) * (size_t)ParallelOptions::MAXBODYREPETITION;
00882
00883 return res;
00884 }
00885
00886 size_t dxEstimateParallelStepMemoryRequirements (
00887 dxBody * const *body, int nb, dxJoint * const *_joint, int _nj)
00888 {
00889 int nj, m, mfb;
00890
00891 {
00892 int njcurr = 0, mcurr = 0, mfbcurr = 0;
00893 dxJoint::SureMaxInfo info;
00894 dxJoint *const *const _jend = _joint + _nj;
00895 for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; _jcurr++) {
00896 dxJoint *j = *_jcurr;
00897 j->getSureMaxInfo (&info);
00898
00899 int jm = info.max_m;
00900 if (jm > 0) {
00901 njcurr++;
00902
00903 mcurr += jm;
00904 if (j->feedback)
00905 mfbcurr += jm;
00906 }
00907 }
00908 nj = njcurr; m = mcurr; mfb = mfbcurr;
00909 }
00910
00911 size_t res = 0;
00912
00913 res += dEFFICIENT_SIZE(sizeof(dReal) * 3 * 4 * nb);
00914 res += dEFFICIENT_SIZE(sizeof(dReal) * nb);
00915
00916 {
00917 size_t sub1_res1 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * _nj);
00918 size_t sub1_res2 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * nj);
00919 if (m > 0) {
00920 sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 12 * m);
00921 sub1_res2 += 4 * dEFFICIENT_SIZE(sizeof(dReal) * m);
00922 sub1_res2 += dEFFICIENT_SIZE(sizeof(int) * 12 * m);
00923 sub1_res2 += dEFFICIENT_SIZE(sizeof(int) * m);
00924 sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 12 * mfb);
00925
00926 {
00927 size_t sub2_res1 = dEFFICIENT_SIZE(sizeof(dReal) * m);
00928 {
00929 size_t sub3_res1 = dEFFICIENT_SIZE(sizeof(dReal) * 6 * nb);
00930
00931 size_t sub3_res2 = 0;
00932
00933 sub2_res1 += (sub3_res1 >= sub3_res2) ? sub3_res1 : sub3_res2;
00934 }
00935
00936 size_t sub2_res2 = dEFFICIENT_SIZE(sizeof(dReal) * m);
00937 sub2_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 6 * nb);
00938 {
00939 size_t sub3_res1 = EstimateParallelSOR_LCPMemoryRequirements(m,nj);
00940 size_t sub3_res2 = 0;
00941 sub2_res2 += (sub3_res1 >= sub3_res2) ? sub3_res1 : sub3_res2;
00942 }
00943
00944 sub1_res2 += (sub2_res1 >= sub2_res2) ? sub2_res1 : sub2_res2;
00945 }
00946 }
00947
00948 res += 3 * ((sub1_res1 >= sub1_res2) ? sub1_res1 : sub1_res2);
00949 }
00950
00951 return res;
00952 }
00953
00954 static size_t BuildIslandAndEstimateStepperMemoryRequirements(dxWorldProcessContext *context,
00955 dxWorld *world, dReal stepsize, dmemestimate_fn_t stepperestimate)
00956 {
00957 const int sizeelements = 2;
00958 size_t maxreq = 0;
00959
00960
00961 dInternalHandleAutoDisabling (world,stepsize);
00962
00963 int nb = world->nb, nj = world->nj;
00964
00965 int *islandsizes = context->AllocateArray<int>(sizeelements);
00966
00967
00968 dxBody **body = context->AllocateArray<dxBody *>(nb);
00969 dxJoint **joint = context->AllocateArray<dxJoint *>(nj);
00970
00971 BEGIN_STATE_SAVE(context, stackstate) {
00972
00973
00974
00975
00976
00977 int stackalloc = (nj < nb) ? nj : nb;
00978 dxBody **stack = context->AllocateArray<dxBody *>(stackalloc);
00979
00980 {
00981
00982 for (dxBody *b=world->firstbody; b; b=(dxBody*)b->next) b->tag = 0;
00983 for (dxJoint *j=world->firstjoint; j; j=(dxJoint*)j->next) j->tag = 0;
00984 }
00985
00986 int bcount = 0, jcount =0;
00987
00988 dxBody **bodystart = body;
00989 dxJoint **jointstart = joint;
00990 for (dxBody *bb=world->firstbody; bb; bb=(dxBody*)bb->next) {
00991
00992 if (!bb->tag) {
00993 if (!(bb->flags & dxBodyDisabled)) {
00994 bb->tag = 1;
00995
00996 dxBody **bodycurr = bodystart;
00997 dxJoint **jointcurr = jointstart;
00998
00999
01000 *bodycurr++ = bb;
01001
01002 int stacksize = 0;
01003 dxBody *b = bb;
01004
01005 while (true) {
01006
01007
01008 for (dxJointNode *n=b->firstjoint; n; n=n->next) {
01009 dxJoint *njoint = n->joint;
01010 if (!njoint->tag) {
01011 if (njoint->isEnabled()) {
01012 njoint->tag = 1;
01013 *jointcurr++ = njoint;
01014
01015 dxBody *nbody = n->body;
01016
01017 if (nbody && nbody->tag <= 0) {
01018 nbody->tag = 1;
01019
01020 nbody->flags &= ~dxBodyDisabled;
01021 stack[stacksize++] = nbody;
01022 }
01023 } else {
01024 njoint->tag = -1;
01025 }
01026 }
01027 }
01028 dIASSERT(stacksize <= world->nb);
01029 dIASSERT(stacksize <= world->nj);
01030
01031 if (stacksize == 0) {
01032 break;
01033 }
01034
01035 b = stack[--stacksize];
01036 *bodycurr++ = b;
01037 }
01038
01039 bcount += (bodycurr - bodystart);
01040 jcount += (jointcurr - jointstart);
01041
01042 bodystart = bodycurr;
01043 jointstart = jointcurr;
01044 } else {
01045 bb->tag = -1;
01046 }
01047 }
01048 }
01049
01050 islandsizes[0] = bcount;
01051 islandsizes[1] = jcount;
01052
01053 size_t islandreq = stepperestimate(body, bcount, joint, jcount);
01054 maxreq = islandreq;
01055
01056 } END_STATE_SAVE(context, stackstate);
01057
01058 # ifndef dNODEBUG
01059
01060
01061
01062 {
01063 for (dxBody *b=world->firstbody; b; b=(dxBody*)b->next) {
01064 if (b->flags & dxBodyDisabled) {
01065 if (b->tag > 0) dDebug (0,"disabled body tagged");
01066 }
01067 else {
01068 if (b->tag <= 0) dDebug (0,"enabled body not tagged");
01069 }
01070 }
01071 for (dxJoint *j=world->firstjoint; j; j=(dxJoint*)j->next) {
01072 if ( (( j->node[0].body && (j->node[0].body->flags & dxBodyDisabled)==0 ) ||
01073 (j->node[1].body && (j->node[1].body->flags & dxBodyDisabled)==0) )
01074 &&
01075 j->isEnabled() ) {
01076 if (j->tag <= 0) dDebug (0,"attached enabled joint not tagged");
01077 }
01078 else {
01079 if (j->tag > 0) dDebug (0,"unattached or disabled joint tagged");
01080 }
01081 }
01082 }
01083 # endif
01084
01085 size_t const *islandreqs = NULL;
01086 int islandcount = 1;
01087 context->SavePreallocations(islandcount, islandsizes, body, joint, islandreqs);
01088
01089 return maxreq;
01090 }
01091
01092 static size_t EstimateIslandsProcessingMemoryRequirements(dxWorld *world, size_t &sesize)
01093 {
01094 size_t res = 0;
01095
01096 size_t islandcounts = dEFFICIENT_SIZE(world->nb * 2 * sizeof(int));
01097 res += islandcounts;
01098
01099 size_t bodiessize = dEFFICIENT_SIZE(world->nb * sizeof(dxBody*));
01100 size_t jointssize = dEFFICIENT_SIZE(world->nj * sizeof(dxJoint*));
01101 res += bodiessize + jointssize;
01102
01103 sesize = (bodiessize < jointssize) ? bodiessize : jointssize;
01104 return res;
01105 }
01106
01107 static size_t AdjustArenaSizeForReserveRequirements(size_t arenareq, float rsrvfactor, unsigned rsrvminimum)
01108 {
01109 float scaledarena = arenareq * rsrvfactor;
01110 size_t adjustedarena = (scaledarena < SIZE_MAX) ? (size_t)scaledarena : SIZE_MAX;
01111 size_t boundedarena = (adjustedarena > rsrvminimum) ? adjustedarena : (size_t)rsrvminimum;
01112 return dEFFICIENT_SIZE(boundedarena);
01113 }
01114
01115 static dxWorldProcessContext *InternalReallocateWorldProcessContext (
01116 dxWorldProcessContext *oldcontext, size_t memreq,
01117 const dxWorldProcessMemoryManager *memmgr, float rsrvfactor, unsigned rsrvminimum)
01118 {
01119 dxWorldProcessContext *context = oldcontext;
01120 bool allocsuccess = false;
01121
01122 size_t oldarenasize;
01123 void *pOldArena;
01124
01125 do {
01126 size_t oldmemsize = oldcontext ? oldcontext->GetMemorySize() : 0;
01127 if (!oldcontext || oldmemsize < memreq) {
01128 oldarenasize = oldcontext ? dxWorldProcessContext::MakeArenaSize(oldmemsize) : 0;
01129 pOldArena = oldcontext ? oldcontext->m_pArenaBegin : NULL;
01130
01131 if (!dxWorldProcessContext::IsArenaPossible(memreq)) {
01132 break;
01133 }
01134
01135 size_t arenareq = dxWorldProcessContext::MakeArenaSize(memreq);
01136 size_t arenareq_with_reserve = AdjustArenaSizeForReserveRequirements(arenareq, rsrvfactor, rsrvminimum);
01137 size_t memreq_with_reserve = memreq + (arenareq_with_reserve - arenareq);
01138
01139 if (oldcontext) {
01140
01141 if (oldcontext->m_pAllocCurrent != oldcontext->m_pAllocBegin) {
01142
01143
01144
01145 size_t oldcontextofs = (size_t)oldcontext - (size_t)pOldArena;
01146 size_t datasize = (size_t)oldcontext->m_pAllocCurrent - (size_t)oldcontext;
01147
01148
01149 size_t shrunkarenasize = dEFFICIENT_SIZE(datasize + oldcontextofs) + EFFICIENT_ALIGNMENT;
01150 if (shrunkarenasize < oldarenasize) {
01151
01152 void *pShrunkOldArena = oldcontext->m_pArenaMemMgr->m_fnShrink(pOldArena, oldarenasize, shrunkarenasize);
01153 if (!pShrunkOldArena) {
01154 break;
01155 }
01156
01157
01158 if (pShrunkOldArena != pOldArena) {
01159 dxWorldProcessContext *shrunkcontext = (dxWorldProcessContext *)dEFFICIENT_PTR(pShrunkOldArena);
01160
01161
01162
01163 size_t shrunkcontextofs = (size_t)shrunkcontext - (size_t)pShrunkOldArena;
01164 size_t offsetdiff = oldcontextofs - shrunkcontextofs;
01165 if (offsetdiff != 0) {
01166 memmove(shrunkcontext, (void *)((size_t)shrunkcontext + offsetdiff), datasize);
01167 }
01168
01169
01170
01171 size_t shrunkdatasize = dxWorldProcessContext::MakeBufferSize(shrunkarenasize);
01172 void *blockbegin = dEFFICIENT_PTR(shrunkcontext + 1);
01173 void *blockend = dOFFSET_EFFICIENTLY(blockbegin, shrunkdatasize);
01174 shrunkcontext->m_pAllocBegin = blockbegin;
01175 shrunkcontext->m_pAllocEnd = blockend;
01176 shrunkcontext->m_pAllocCurrent = blockend;
01177 shrunkcontext->m_pArenaBegin = pShrunkOldArena;
01178
01179 size_t stOffset = ((size_t)pShrunkOldArena - (size_t)pOldArena) - offsetdiff;
01180 shrunkcontext->OffsetPreallocations(stOffset);
01181
01182 oldcontext = shrunkcontext;
01183
01184
01185 pOldArena = pShrunkOldArena;
01186 }
01187
01188
01189 oldarenasize = shrunkarenasize;
01190 }
01191
01192 } else {
01193 oldcontext->m_pArenaMemMgr->m_fnFree(pOldArena, oldarenasize);
01194 oldcontext = NULL;
01195
01196
01197 pOldArena = NULL;
01198 oldarenasize = 0;
01199 }
01200 }
01201
01202
01203 void *pNewArena = memmgr->m_fnAlloc(arenareq_with_reserve);
01204 if (!pNewArena) {
01205 break;
01206 }
01207
01208 context = (dxWorldProcessContext *)dEFFICIENT_PTR(pNewArena);
01209
01210 void *blockbegin = dEFFICIENT_PTR(context + 1);
01211 void *blockend = dOFFSET_EFFICIENTLY(blockbegin, memreq_with_reserve);
01212
01213 context->m_pAllocBegin = blockbegin;
01214 context->m_pAllocEnd = blockend;
01215 context->m_pArenaBegin = pNewArena;
01216 context->m_pAllocCurrent = blockbegin;
01217
01218 if (oldcontext) {
01219 context->CopyPreallocations(oldcontext);
01220 } else {
01221 context->ClearPreallocations();
01222 }
01223
01224 context->m_pArenaMemMgr = memmgr;
01225 context->m_pPreallocationcContext = oldcontext;
01226 }
01227
01228 allocsuccess = true;
01229 } while (false);
01230
01231 if (!allocsuccess) {
01232 if (pOldArena) {
01233 dIASSERT(oldcontext);
01234 oldcontext->m_pArenaMemMgr->m_fnFree(pOldArena, oldarenasize);
01235 }
01236 context = NULL;
01237 }
01238
01239 return context;
01240 }
01241
01242 bool dxReallocateParallelWorldProcessContext (dxWorld *world, dReal stepsize, dmemestimate_fn_t stepperestimate)
01243 {
01244 dxStepWorkingMemory *wmem = AllocateOnDemand(world->wmem);
01245 if (!wmem) return false;
01246
01247 dxWorldProcessContext *oldcontext = wmem->GetWorldProcessingContext();
01248 dIASSERT (!oldcontext || oldcontext->IsStructureValid());
01249
01250 const dxWorldProcessMemoryReserveInfo *reserveinfo = wmem->SureGetMemoryReserveInfo();
01251 const dxWorldProcessMemoryManager *memmgr = wmem->SureGetMemoryManager();
01252
01253 dxWorldProcessContext *context = oldcontext;
01254
01255 size_t sesize;
01256 size_t islandsreq = EstimateIslandsProcessingMemoryRequirements(world, sesize);
01257 dIASSERT(islandsreq == dEFFICIENT_SIZE(islandsreq));
01258 dIASSERT(sesize == dEFFICIENT_SIZE(sesize));
01259
01260 size_t stepperestimatereq = islandsreq + sesize;
01261 context = InternalReallocateWorldProcessContext(context, stepperestimatereq, memmgr, 1.0f, reserveinfo->m_uiReserveMinimum);
01262
01263 if (context)
01264 {
01265 size_t stepperreq = BuildIslandAndEstimateStepperMemoryRequirements(context, world, stepsize, stepperestimate);
01266 dIASSERT(stepperreq == dEFFICIENT_SIZE(stepperreq));
01267
01268 size_t memreq = stepperreq + islandsreq;
01269 context = InternalReallocateWorldProcessContext(context, memreq, memmgr, reserveinfo->m_fReserveFactor, reserveinfo->m_uiReserveMinimum);
01270 }
01271
01272 wmem->SetWorldProcessingContext(context);
01273 return context != NULL;
01274 }
01275
01276