$search
00001 #include <ode/objects.h> 00002 #include <ode/ode.h> 00003 #include <ode/odemath.h> 00004 #include <ode/rotation.h> 00005 #include <ode/timer.h> 00006 #include <ode/error.h> 00007 #include <ode/matrix.h> 00008 #include <ode/misc.h> 00009 #include "objects.h" 00010 #include "config.h" 00011 #include "joints/joint.h" 00012 #include "lcp.h" 00013 #include "util.h" 00014 00015 #include <parallel_common.h> 00016 #include <parallel_utils.h> 00017 #include <parallel_stepper.h> 00018 00019 #if defined(USE_CUDA) 00020 #include <cuda_solver.h> 00021 #ifdef CUDA_DOUBLESUPPORT 00022 typedef parallel_ode::CudaPGSSolver<dReal,dReal> SolverType; 00023 #else 00024 typedef parallel_ode::CudaPGSSolver<float,dReal> SolverType; 00025 #endif 00026 #elif defined(USE_OPENCL) 00027 #include <opencl_solver.h> 00028 typedef parallel_ode::OpenCLPGSSolver<dReal> SolverType; 00029 #elif defined(USE_OPENMP) 00030 #include <openmp_solver.h> 00031 typedef parallel_ode::OpenMPPGSSolver<dReal> SolverType; 00032 #endif 00033 00034 #if PARALLEL_ENABLED 00035 static SolverType parallelSolver; 00036 #endif 00037 00038 typedef const dReal *dRealPtr; 00039 typedef dReal *dRealMutablePtr; 00040 00041 template 00042 void compute_invM_JT<dReal> (int m, dRealPtr J, dRealMutablePtr iMJ, int *jb, 00043 dxBody * const *body, const dRealPtr invI); 00044 00045 template 00046 void compute_Adcfm_b<dReal> (int m, dReal sor_w, dRealMutablePtr J, dRealPtr iMJ, int* jb, dRealPtr cfm, 00047 dRealMutablePtr Adcfm, dRealMutablePtr b); 00048 00049 #define RANDOMLY_REORDER_CONSTRAINTS 1 00050 00051 #ifdef BENCHMARKING 00052 static unsigned long totalConstraints = 0; 00053 static unsigned long totalBodies = 0; 00054 static const unsigned long maxBenchmarkIters = 300; 00055 static const unsigned long minBenchmarkIters = 50; 00056 static unsigned long benchmarkRoundsRemaining = 5; 00057 static unsigned long benchmarkIters = 0; 00058 static int sorItersMult = 2; 00059 static dReal sorParamMult = .85; 00060 00061 static void benchmarkIteration( dxWorld *world) { 00062 ++benchmarkIters; 00063 00064 if (benchmarkIters >= maxBenchmarkIters) { 00065 printf("Iterations = %d, Avg Constraints = %f, Avg Bodies = %f\n", 00066 dWorldGetQuickStepNumIterations( world ), 00067 (dReal)totalConstraints / (dReal)(maxBenchmarkIters - minBenchmarkIters), 00068 (dReal)totalBodies / (dReal)(maxBenchmarkIters - minBenchmarkIters) ); 00069 00070 dTimerReport (stdout,1); 00071 00072 --benchmarkRoundsRemaining; 00073 if( benchmarkRoundsRemaining == 0 ) exit(0); 00074 00075 totalConstraints=0; 00076 totalBodies=0; 00077 benchmarkIters=0; 00078 dWorldSetQuickStepNumIterations( world, (int)(dWorldGetQuickStepNumIterations( world ) * sorItersMult) ); 00079 dWorldSetQuickStepW( world, (dReal)(dWorldGetQuickStepW( world ) * sorParamMult) ); 00080 } 00081 } 00082 #endif 00083 00084 struct IndexError { 00085 int index; // row index 00086 }; 00087 00088 template<typename T> 00089 void compute_invM_JT (int m, const T* J, T* iMJ, int *jb, 00090 dxBody * const *body, const T* invI) 00091 { 00092 IFTIMING( dTimerNow("compute invM/JT ") ); 00093 00094 // precompute iMJ = inv(M)*J' 00095 dRealMutablePtr iMJ_ptr = iMJ; 00096 dRealPtr J_ptr = J; 00097 for (int i=0; i<m; J_ptr += 12, iMJ_ptr += 12, i++) { 00098 int b1 = jb[i*2]; 00099 int b2 = jb[i*2+1]; 00100 dReal k1 = body[b1]->invMass; 00101 for (int j=0; j<3; j++) iMJ_ptr[j] = k1*J_ptr[j]; 00102 const dReal *invIrow1 = invI + 12*b1; 00103 dMultiply0_331 (iMJ_ptr + 3, invIrow1, J_ptr + 3); 00104 if (b2 >= 0) { 00105 dReal k2 = body[b2]->invMass; 00106 for (int j=0; j<3; j++) iMJ_ptr[j+6] = k2*J_ptr[j+6]; 00107 const dReal *invIrow2 = invI + 12*b2; 00108 dMultiply0_331 (iMJ_ptr + 9, invIrow2, J_ptr + 9); 00109 } 00110 } 00111 } 00112 00113 template<typename T> 00114 void compute_Adcfm_b (int m, T sor_w, T* J, const T* iMJ, int* jb, const T* cfm, 00115 T* Adcfm, T* b) 00116 { 00117 IFTIMING( dTimerNow("compute Adcfm/b") ); 00118 { 00119 // precompute 1 / diagonals of A 00120 dRealPtr iMJ_ptr = iMJ; 00121 dRealPtr J_ptr = J; 00122 for (int i=0; i<m; J_ptr += 12, iMJ_ptr += 12, i++) { 00123 dReal sum = 0; 00124 for (int j=0; j<6; j++) sum += iMJ_ptr[j] * J_ptr[j]; 00125 if (jb[i*2+1] >= 0) { 00126 for (int k=6; k<12; k++) sum += iMJ_ptr[k] * J_ptr[k]; 00127 } 00128 Adcfm[i] = sor_w / (sum + cfm[i]); 00129 } 00130 } 00131 00132 { 00133 // scale J and b by Ad 00134 dRealMutablePtr J_ptr = J; 00135 for (int i=0; i<m; J_ptr += 12, i++) { 00136 dReal Ad_i = Adcfm[i]; 00137 for (int j=0; j<12; j++) { 00138 J_ptr[j] *= Ad_i; 00139 } 00140 b[i] *= Ad_i; 00141 // scale Ad by CFM. N.B. this should be done last since it is used above 00142 Adcfm[i] = Ad_i * cfm[i]; 00143 } 00144 } 00145 } 00146 00147 static void worldSolve(dxWorldProcessContext *context, 00148 const int m, const int nb, dRealMutablePtr J, int *jb, dxBody * const *body, 00149 dRealPtr invI, dRealMutablePtr lambda, dRealMutablePtr fc, dRealMutablePtr b, 00150 dRealPtr lo, dRealPtr hi, dRealPtr cfm, dRealMutablePtr iMJ, dRealMutablePtr Ad, const int *findex, 00151 const dxQuickStepParameters *qs) 00152 { 00153 IFVERBOSE( printf ("(# Bodies, #Constraints) = (%d, %d )\n",nb,m) ); 00154 00155 compute_invM_JT (m,J,iMJ,jb,body,invI); 00156 dSetZero (lambda,m); 00157 compute_Adcfm_b (m, qs->w, J, iMJ, jb, cfm, Ad, b); 00158 dSetZero (fc,nb*6); 00159 00160 IFTIMING( dTimerNow ("solving LCP problem") ); 00161 00162 // order to solve constraint rows in 00163 IndexError *order = context->AllocateArray<IndexError> (m); 00164 00165 { 00166 // make sure constraints with findex < 0 come first. 00167 IndexError *orderhead = order, *ordertail = order + (m - 1); 00168 00169 // Fill the array from both ends 00170 for (int i=0; i<m; i++) { 00171 if (findex[i] < 0) { 00172 orderhead->index = i; // Place them at the front 00173 ++orderhead; 00174 } else { 00175 ordertail->index = i; // Place them at the end 00176 --ordertail; 00177 } 00178 } 00179 dIASSERT (orderhead-ordertail==1); 00180 } 00181 00182 #if COMPUTE_ERROR 00183 dReal *delta_error = context->AllocateArray<dReal> (m); 00184 #endif 00185 00186 const int num_iterations = qs->num_iterations; 00187 for (int iteration=0; iteration < num_iterations; iteration++) { 00188 00189 #ifdef RANDOMLY_REORDER_CONSTRAINTS 00190 if ((iteration & 7) == 0) { 00191 for (int i=1; i<m; i++) { 00192 int swapi = dRandInt(i+1); 00193 IndexError tmp = order[i]; 00194 order[i] = order[swapi]; 00195 order[swapi] = tmp; 00196 } 00197 } 00198 #endif 00199 00200 #if COMPUTE_ERROR 00201 dReal rms_error = 0; 00202 #endif 00203 00204 for (int i=0; i<m; i++) { 00205 // @@@ potential optimization: we could pre-sort J and iMJ, thereby 00206 // linearizing access to those arrays. hmmm, this does not seem 00207 // like a win, but we should think carefully about our memory 00208 // access pattern. 00209 00210 int index = order[i].index; 00211 00212 dRealMutablePtr fc_ptr1; 00213 dRealMutablePtr fc_ptr2; 00214 dReal delta; 00215 00216 { 00217 int b1 = jb[index*2]; 00218 int b2 = jb[index*2+1]; 00219 fc_ptr1 = fc + 6*b1; 00220 fc_ptr2 = (b2 >= 0) ? fc + 6*b2 : NULL; 00221 } 00222 00223 dReal old_lambda = lambda[index]; 00224 00225 { 00226 delta = b[index] - old_lambda*Ad[index]; 00227 00228 dRealPtr J_ptr = J + index*12; 00229 // @@@ potential optimization: SIMD-ize this and the b2 >= 0 case 00230 delta -=fc_ptr1[0] * J_ptr[0] + fc_ptr1[1] * J_ptr[1] + 00231 fc_ptr1[2] * J_ptr[2] + fc_ptr1[3] * J_ptr[3] + 00232 fc_ptr1[4] * J_ptr[4] + fc_ptr1[5] * J_ptr[5]; 00233 // @@@ potential optimization: handle 1-body constraints in a separate 00234 // loop to avoid the cost of test & jump? 00235 if (fc_ptr2) { 00236 delta -=fc_ptr2[0] * J_ptr[6] + fc_ptr2[1] * J_ptr[7] + 00237 fc_ptr2[2] * J_ptr[8] + fc_ptr2[3] * J_ptr[9] + 00238 fc_ptr2[4] * J_ptr[10] + fc_ptr2[5] * J_ptr[11]; 00239 } 00240 } 00241 00242 00243 { 00244 dReal hi_act, lo_act; 00245 00246 // set the limits for this constraint. 00247 // this is the place where the QuickStep method differs from the 00248 // direct LCP solving method, since that method only performs this 00249 // limit adjustment once per time step, whereas this method performs 00250 // once per iteration per constraint row. 00251 // the constraints are ordered so that all lambda[] values needed have 00252 // already been computed. 00253 if (findex[index] >= 0) { 00254 hi_act = dFabs (hi[index] * lambda[findex[index]]); 00255 lo_act = -hi_act; 00256 } else { 00257 hi_act = hi[index]; 00258 lo_act = lo[index]; 00259 } 00260 00261 // compute lambda and clamp it to [lo,hi]. 00262 // @@@ potential optimization: does SSE have clamping instructions 00263 // to save test+jump penalties here? 00264 dReal new_lambda = old_lambda + delta; 00265 if (new_lambda < lo_act) { 00266 delta = lo_act-old_lambda; 00267 lambda[index] = lo_act; 00268 } 00269 else if (new_lambda > hi_act) { 00270 delta = hi_act-old_lambda; 00271 lambda[index] = hi_act; 00272 } 00273 else { 00274 lambda[index] = new_lambda; 00275 } 00276 } 00277 00278 #if COMPUTE_ERROR 00279 rms_error += delta*delta; 00280 delta_error[index] = dFabs(delta); 00281 #endif 00282 00283 //@@@ a trick that may or may not help 00284 //dReal ramp = (1-((dReal)(iteration+1)/(dReal)num_iterations)); 00285 //delta *= ramp; 00286 00287 { 00288 dRealPtr iMJ_ptr = iMJ + index*12; 00289 // update fc. 00290 // @@@ potential optimization: SIMD for this and the b2 >= 0 case 00291 fc_ptr1[0] += delta * iMJ_ptr[0]; 00292 fc_ptr1[1] += delta * iMJ_ptr[1]; 00293 fc_ptr1[2] += delta * iMJ_ptr[2]; 00294 fc_ptr1[3] += delta * iMJ_ptr[3]; 00295 fc_ptr1[4] += delta * iMJ_ptr[4]; 00296 fc_ptr1[5] += delta * iMJ_ptr[5]; 00297 // @@@ potential optimization: handle 1-body constraints in a separate 00298 // loop to avoid the cost of test & jump? 00299 if (fc_ptr2) { 00300 fc_ptr2[0] += delta * iMJ_ptr[6]; 00301 fc_ptr2[1] += delta * iMJ_ptr[7]; 00302 fc_ptr2[2] += delta * iMJ_ptr[8]; 00303 fc_ptr2[3] += delta * iMJ_ptr[9]; 00304 fc_ptr2[4] += delta * iMJ_ptr[10]; 00305 fc_ptr2[5] += delta * iMJ_ptr[11]; 00306 } 00307 } 00308 } 00309 #if COMPUTE_ERROR 00310 if ( iteration == num_iterations - 1 ) { 00311 // do we need to compute norm across entire solution space (0,m)? 00312 // since local convergence might produce errors in other nodes? 00313 #ifdef RECOMPUTE_RMS 00314 // recompute rms_error to be sure swap is not corrupting arrays 00315 rms_error = 0; 00316 #ifdef USE_1NORM 00317 for (int i=0; i<m; i++) 00318 { 00319 rms_error = dFabs(delta_error[order[i].index]) > rms_error ? dFabs(delta_error[order[i].index]) : rms_error; // 1norm test 00320 } 00321 #else // use 2 norm 00322 for (int i=0; i<m; i++) // use entire solution vector errors 00323 rms_error += delta_error[order[i].index]*delta_error[order[i].index]; 00324 rms_error = sqrt(rms_error); 00325 #endif 00326 #else 00327 rms_error = sqrt(rms_error); 00328 #endif 00329 printf("(Iteration, Error) = (%d, %20.18f)\n",iteration,rms_error); 00330 } 00331 #endif 00332 } 00333 } 00334 00335 void dxParallelProcessIslands (dxWorld *world, dReal stepsize, dstepper_fn_t stepper) 00336 { 00337 const int sizeelements = 2; 00338 00339 dxStepWorkingMemory *wmem = world->wmem; 00340 dIASSERT(wmem != NULL); 00341 00342 dxWorldProcessContext *context = wmem->GetWorldProcessingContext(); 00343 00344 size_t const *islandreqs = NULL; 00345 int islandcount; 00346 int const *islandsizes; 00347 dxBody *const *body; 00348 dxJoint *const *joint; 00349 context->RetrievePreallocations(islandcount, islandsizes, body, joint, islandreqs); 00350 00351 dxBody *const *bodystart = body; 00352 dxJoint *const *jointstart = joint; 00353 00354 int const *const sizesend = islandsizes + islandcount * sizeelements; 00355 00356 int bcount = 0; 00357 int jcount = 0; 00358 00359 for (int const *sizescurr = islandsizes; sizescurr != sizesend; sizescurr += sizeelements) { 00360 bcount += sizescurr[0]; 00361 jcount += sizescurr[1]; 00362 } 00363 00364 BEGIN_STATE_SAVE(context, stepperstate) { 00365 stepper (context,world,bodystart,bcount,jointstart,jcount,stepsize); 00366 } END_STATE_SAVE(context, stepperstate); 00367 00368 context->CleanupContext(); 00369 dIASSERT(context->IsStructureValid()); 00370 } 00371 00372 //*************************************************************************** 00373 // configuration 00374 00375 #define RANDOMLY_REORDER_CONSTRAINTS 1 00376 00377 //**************************************************************************** 00378 // special matrix multipliers 00379 00380 // multiply block of B matrix (q x 6) with 12 dReal per row with C vektor (q) 00381 static void Multiply1_12q1 (dReal *A, const dReal *B, const dReal *C, int q) 00382 { 00383 dIASSERT (q>0 && A && B && C); 00384 00385 dReal a = 0; 00386 dReal b = 0; 00387 dReal c = 0; 00388 dReal d = 0; 00389 dReal e = 0; 00390 dReal f = 0; 00391 dReal s; 00392 00393 for(int i=0, k = 0; i<q; k += 12, i++) 00394 { 00395 s = C[i]; //C[i] and B[n+k] cannot overlap because its value has been read into a temporary. 00396 00397 //For the rest of the loop, the only memory dependency (array) is from B[] 00398 a += B[ k] * s; 00399 b += B[1+k] * s; 00400 c += B[2+k] * s; 00401 d += B[3+k] * s; 00402 e += B[4+k] * s; 00403 f += B[5+k] * s; 00404 } 00405 00406 A[0] = a; 00407 A[1] = b; 00408 A[2] = c; 00409 A[3] = d; 00410 A[4] = e; 00411 A[5] = f; 00412 } 00413 00414 //*************************************************************************** 00415 // various common computations involving the matrix J 00416 00417 // compute out = J*in. 00418 00419 static void multiply_J (int m, dRealPtr J, int *jb, 00420 dRealPtr in, dRealMutablePtr out) 00421 { 00422 dRealPtr J_ptr = J; 00423 for (int i=0; i<m; i++) { 00424 int b1 = jb[i*2]; 00425 int b2 = jb[i*2+1]; 00426 dReal sum = 0; 00427 dRealPtr in_ptr = in + b1*6; 00428 for (int j=0; j<6; j++) sum += J_ptr[j] * in_ptr[j]; 00429 J_ptr += 6; 00430 if (b2 >= 0) { 00431 in_ptr = in + b2*6; 00432 for (int j=0; j<6; j++) sum += J_ptr[j] * in_ptr[j]; 00433 } 00434 J_ptr += 6; 00435 out[i] = sum; 00436 } 00437 } 00438 00439 struct dJointWithInfo1 00440 { 00441 dxJoint *joint; 00442 dxJoint::Info1 info; 00443 }; 00444 00445 void dxParallelQuickStepper( dxWorldProcessContext *context, 00446 dxWorld *world, dxBody * const *body, int nb, 00447 dxJoint * const *_joint, int _nj, dReal stepsize) 00448 { 00449 00450 IFTIMING(dTimerStart("preprocessing")); 00451 const dReal stepsize1 = dRecip(stepsize); 00452 00453 { 00454 // number all bodies in the body list - set their tag values 00455 for (int i=0; i<nb; i++) body[i]->tag = i; 00456 } 00457 00458 // for all bodies, compute the inertia tensor and its inverse in the global 00459 // frame, and compute the rotational force and add it to the torque 00460 // accumulator. I and invI are a vertical stack of 3x4 matrices, one per body. 00461 dReal *invI = context->AllocateArray<dReal> (3*4*nb); 00462 00463 { 00464 dReal *invIrow = invI; 00465 dxBody *const *const bodyend = body + nb; 00466 for (dxBody *const *bodycurr = body; bodycurr != bodyend; invIrow += 12, bodycurr++) { 00467 dMatrix3 tmp; 00468 dxBody *b = *bodycurr; 00469 00470 // compute inverse inertia tensor in global frame 00471 dMultiply2_333 (tmp,b->invI,b->posr.R); 00472 dMultiply0_333 (invIrow,b->posr.R,tmp); 00473 00474 if (b->flags & dxBodyGyroscopic) { 00475 dMatrix3 I; 00476 // compute inertia tensor in global frame 00477 dMultiply2_333 (tmp,b->mass.I,b->posr.R); 00478 dMultiply0_333 (I,b->posr.R,tmp); 00479 // compute rotational force 00480 dMultiply0_331 (tmp,I,b->avel); 00481 dSubtractVectorCross3(b->tacc,b->avel,tmp); 00482 } 00483 } 00484 } 00485 00486 // get the masses for every body 00487 dReal *invM = context->AllocateArray<dReal> (nb); 00488 { 00489 dReal *invMrow = invM; 00490 dxBody *const *const bodyend = body + nb; 00491 for (dxBody *const *bodycurr = body; bodycurr != bodyend; invMrow++, bodycurr++) { 00492 dxBody *b = *bodycurr; 00493 //*invMrow = b->mass.mass; 00494 *invMrow = b->invMass; 00495 00496 } 00497 } 00498 00499 { 00500 // add the gravity force to all bodies 00501 // since gravity does normally have only one component it's more efficient 00502 // to run three loops for each individual component 00503 dxBody *const *const bodyend = body + nb; 00504 dReal gravity_x = world->gravity[0]; 00505 if (gravity_x) { 00506 for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) { 00507 dxBody *b = *bodycurr; 00508 if ((b->flags & dxBodyNoGravity)==0) { 00509 b->facc[0] += b->mass.mass * gravity_x; 00510 } 00511 } 00512 } 00513 dReal gravity_y = world->gravity[1]; 00514 if (gravity_y) { 00515 for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) { 00516 dxBody *b = *bodycurr; 00517 if ((b->flags & dxBodyNoGravity)==0) { 00518 b->facc[1] += b->mass.mass * gravity_y; 00519 } 00520 } 00521 } 00522 dReal gravity_z = world->gravity[2]; 00523 if (gravity_z) { 00524 for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) { 00525 dxBody *b = *bodycurr; 00526 if ((b->flags & dxBodyNoGravity)==0) { 00527 b->facc[2] += b->mass.mass * gravity_z; 00528 } 00529 } 00530 } 00531 } 00532 00533 // get joint information (m = total constraint dimension, nub = number of unbounded variables). 00534 // joints with m=0 are inactive and are removed from the joints array 00535 // entirely, so that the code that follows does not consider them. 00536 dJointWithInfo1 *const jointiinfos = context->AllocateArray<dJointWithInfo1> (_nj); 00537 int nj; 00538 00539 { 00540 dJointWithInfo1 *jicurr = jointiinfos; 00541 dxJoint *const *const _jend = _joint + _nj; 00542 for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; _jcurr++) { // jicurr=dest, _jcurr=src 00543 dxJoint *j = *_jcurr; 00544 j->getInfo1 (&jicurr->info); 00545 dIASSERT (jicurr->info.m >= 0 && jicurr->info.m <= 6 && jicurr->info.nub >= 0 && jicurr->info.nub <= jicurr->info.m); 00546 if (jicurr->info.m > 0) { 00547 jicurr->joint = j; 00548 jicurr++; 00549 } 00550 } 00551 nj = jicurr - jointiinfos; 00552 } 00553 00554 context->ShrinkArray<dJointWithInfo1>(jointiinfos, _nj, nj); 00555 00556 int m; 00557 int mfb; // number of rows of Jacobian we will have to save for joint feedback 00558 00559 { 00560 int mcurr = 0, mfbcurr = 0; 00561 const dJointWithInfo1 *jicurr = jointiinfos; 00562 const dJointWithInfo1 *const jiend = jicurr + nj; 00563 for (; jicurr != jiend; jicurr++) { 00564 int jm = jicurr->info.m; 00565 mcurr += jm; 00566 if (jicurr->joint->feedback) 00567 mfbcurr += jm; 00568 } 00569 00570 m = mcurr; 00571 mfb = mfbcurr; 00572 } 00573 00574 // if there are constraints, compute the constraint force 00575 dReal *J = NULL; 00576 int *jb = NULL; 00577 00578 if (m > 0) { 00579 dReal *cfm, *lo, *hi, *rhs, *Jcopy; 00580 int *findex; 00581 00582 { 00583 int mlocal = m; 00584 00585 const unsigned jelements = mlocal*12; 00586 J = context->AllocateArray<dReal> (jelements); 00587 dSetZero (J,jelements); 00588 00589 // create a constraint equation right hand side vector `c', a constraint 00590 // force mixing vector `cfm', and LCP low and high bound vectors, and an 00591 // 'findex' vector. 00592 cfm = context->AllocateArray<dReal> (mlocal); 00593 dSetValue (cfm,mlocal,world->global_cfm); 00594 00595 lo = context->AllocateArray<dReal> (mlocal); 00596 dSetValue (lo,mlocal,-dInfinity); 00597 00598 hi = context->AllocateArray<dReal> (mlocal); 00599 dSetValue (hi,mlocal,dInfinity); 00600 00601 findex = context->AllocateArray<int> (mlocal); 00602 for (int i=0; i<mlocal; i++) findex[i] = -1; 00603 00604 const unsigned jbelements = mlocal*2; 00605 jb = context->AllocateArray<int> (jbelements); 00606 00607 rhs = context->AllocateArray<dReal> (mlocal); 00608 00609 Jcopy = context->AllocateArray<dReal> (mfb*12); 00610 00611 } 00612 00613 BEGIN_STATE_SAVE(context, cstate) { 00614 dReal *c = context->AllocateArray<dReal> (m); 00615 dSetZero (c, m); 00616 00617 { 00618 IFTIMING (dTimerNow ("create J")); 00619 // get jacobian data from constraints. an m*12 matrix will be created 00620 // to store the two jacobian blocks from each constraint. it has this 00621 // format: 00622 // 00623 // l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 \ . 00624 // l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 )-- jacobian for joint 0, body 1 and body 2 (3 rows) 00625 // l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 / 00626 // l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 )--- jacobian for joint 1, body 1 and body 2 (3 rows) 00627 // etc... 00628 // 00629 // (lll) = linear jacobian data 00630 // (aaa) = angular jacobian data 00631 // 00632 dxJoint::Info2 Jinfo; 00633 Jinfo.rowskip = 12; 00634 Jinfo.fps = stepsize1; 00635 Jinfo.erp = world->global_erp; 00636 00637 dReal *Jcopyrow = Jcopy; 00638 unsigned ofsi = 0; 00639 const dJointWithInfo1 *jicurr = jointiinfos; 00640 const dJointWithInfo1 *const jiend = jicurr + nj; 00641 for (; jicurr != jiend; jicurr++) { 00642 dReal *const Jrow = J + ofsi * 12; 00643 Jinfo.J1l = Jrow; 00644 Jinfo.J1a = Jrow + 3; 00645 Jinfo.J2l = Jrow + 6; 00646 Jinfo.J2a = Jrow + 9; 00647 Jinfo.c = c + ofsi; 00648 Jinfo.cfm = cfm + ofsi; 00649 Jinfo.lo = lo + ofsi; 00650 Jinfo.hi = hi + ofsi; 00651 Jinfo.findex = findex + ofsi; 00652 00653 00654 // now write all information into J 00655 dxJoint *joint = jicurr->joint; 00656 joint->getInfo2 (&Jinfo); 00657 const int infom = jicurr->info.m; 00658 00659 // we need a copy of Jacobian for joint feedbacks 00660 // because it gets destroyed by SOR solver 00661 // instead of saving all Jacobian, we can save just rows 00662 // for joints, that requested feedback (which is normally much less) 00663 if (joint->feedback) { 00664 const int rowels = infom * 12; 00665 memcpy(Jcopyrow, Jrow, rowels * sizeof(dReal)); 00666 Jcopyrow += rowels; 00667 } 00668 00669 // adjust returned findex values for global index numbering 00670 int *findex_ofsi = findex + ofsi; 00671 for (int j=0; j<infom; j++) { 00672 int fival = findex_ofsi[j]; 00673 if (fival >= 0) 00674 findex_ofsi[j] = fival + ofsi; 00675 } 00676 00677 ofsi += infom; 00678 } 00679 } 00680 00681 { 00682 // create an array of body numbers for each joint row 00683 int *jb_ptr = jb; 00684 const dJointWithInfo1 *jicurr = jointiinfos; 00685 const dJointWithInfo1 *const jiend = jicurr + nj; 00686 for (; jicurr != jiend; jicurr++) { 00687 dxJoint *joint = jicurr->joint; 00688 const int infom = jicurr->info.m; 00689 00690 int b1 = (joint->node[0].body) ? (joint->node[0].body->tag) : -1; 00691 int b2 = (joint->node[1].body) ? (joint->node[1].body->tag) : -1; 00692 for (int j=0; j<infom; j++) { 00693 jb_ptr[0] = b1; 00694 jb_ptr[1] = b2; 00695 jb_ptr += 2; 00696 } 00697 } 00698 dIASSERT (jb_ptr == jb+2*m); 00699 } 00700 00701 BEGIN_STATE_SAVE(context, tmp1state) { 00702 IFTIMING (dTimerNow ("compute rhs")); 00703 // compute the right hand side `rhs' 00704 dReal *tmp1 = context->AllocateArray<dReal> (nb*6); 00705 // put v/h + invM*fe into tmp1 00706 dReal *tmp1curr = tmp1; 00707 const dReal *invIrow = invI; 00708 dxBody *const *const bodyend = body + nb; 00709 for (dxBody *const *bodycurr = body; bodycurr != bodyend; tmp1curr+=6, invIrow+=12, bodycurr++) { 00710 dxBody *b = *bodycurr; 00711 dReal body_invMass = b->invMass; 00712 for (int j=0; j<3; j++) tmp1curr[j] = b->facc[j] * body_invMass + b->lvel[j] * stepsize1; 00713 dMultiply0_331 (tmp1curr + 3,invIrow,b->tacc); 00714 for (int k=0; k<3; k++) tmp1curr[3+k] += b->avel[k] * stepsize1; 00715 } 00716 00717 // put J*tmp1 into rhs 00718 multiply_J (m,J,jb,tmp1,rhs); 00719 } END_STATE_SAVE(context, tmp1state); 00720 00721 // complete rhs 00722 for (int i=0; i<m; i++) rhs[i] = c[i]*stepsize1 - rhs[i]; 00723 00724 // scale CFM 00725 for (int j=0; j<m; j++) cfm[j] *= stepsize1; 00726 00727 } END_STATE_SAVE(context, cstate); 00728 00729 /* 00730 for( int i=0; i<m; i++ ) { 00731 lo[i] = CUDA_MIN > lo[i] ? CUDA_MIN : lo[i];//std::min(lo[i], (dReal)CUDA_MIN);// 00732 hi[i] = hi[i] > CUDA_MAX ? CUDA_MAX : hi[i];//std::min(hi[i], (dReal)CUDA_MAX);// 00733 } 00734 */ 00735 00736 // load lambda from the value saved on the previous iteration 00737 dReal *lambda = context->AllocateArray<dReal> (m); 00738 00739 dReal *cforce = context->AllocateArray<dReal> (nb*6); 00740 00741 dReal *iMJ = context->AllocateArray<dReal> (m*12); 00742 dReal *Ad = context->AllocateArray<dReal> (m); 00743 00744 BEGIN_STATE_SAVE(context, lcpstate) { 00745 // solve the LCP problem and get lambda and invM*constraint_force 00746 IFBENCHMARKING( if( benchmarkIters > minBenchmarkIters ) totalConstraints+=m; totalBodies+=nb; dTimerStart(" LCP Solve - Parallel ");); 00747 00748 #if PARALLEL_ENABLED 00749 // FIXMED: need to first cast params into float, right now, SolverType is float, but params contains dRal pointers 00750 SolverType::SolverParams params(context,&world->qs,m,nb,J,jb,body,invI,lambda,cforce,rhs,lo,hi,cfm,iMJ,Ad,findex,stepsize); 00751 parallelSolver.worldSolve( ¶ms ); 00752 #else 00753 worldSolve(context,m,nb,J,jb,body,invI,lambda,cforce,rhs,lo,hi,cfm,iMJ,Ad,findex,&world->qs); 00754 #endif 00755 00756 IFBENCHMARKING (dTimerEnd()); 00757 00758 } END_STATE_SAVE(context, lcpstate); 00759 00760 // note that the SOR method overwrites rhs and J at this point, so 00761 // they should not be used again. 00762 { 00763 IFTIMING (dTimerNow ("velocity update due to constraint forces")); 00764 // note that cforce is really not a force but an acceleration, hence there is 00765 // no premultiplying of invM here (compare to update due to external force 'facc' below) 00766 // 00767 // add stepsize * cforce to the body velocity 00768 const dReal *cforcecurr = cforce; 00769 dxBody *const *const bodyend = body + nb; 00770 for (dxBody *const *bodycurr = body; bodycurr != bodyend; cforcecurr+=6, bodycurr++) { 00771 dxBody *b = *bodycurr; 00772 for (int j=0; j<3; j++) { 00773 b->lvel[j] += stepsize * cforcecurr[j]; 00774 b->avel[j] += stepsize * cforcecurr[3+j]; 00775 } 00776 } 00777 } 00778 00779 if (mfb > 0) { 00780 // straightforward computation of joint constraint forces: 00781 // multiply related lambdas with respective J' block for joints 00782 // where feedback was requested 00783 dReal data[6]; 00784 const dReal *lambdacurr = lambda; 00785 const dReal *Jcopyrow = Jcopy; 00786 const dJointWithInfo1 *jicurr = jointiinfos; 00787 const dJointWithInfo1 *const jiend = jicurr + nj; 00788 for (; jicurr != jiend; jicurr++) { 00789 dxJoint *joint = jicurr->joint; 00790 const int infom = jicurr->info.m; 00791 00792 if (joint->feedback) { 00793 dJointFeedback *fb = joint->feedback; 00794 Multiply1_12q1 (data, Jcopyrow, lambdacurr, infom); 00795 fb->f1[0] = data[0]; 00796 fb->f1[1] = data[1]; 00797 fb->f1[2] = data[2]; 00798 fb->t1[0] = data[3]; 00799 fb->t1[1] = data[4]; 00800 fb->t1[2] = data[5]; 00801 00802 if (joint->node[1].body) 00803 { 00804 Multiply1_12q1 (data, Jcopyrow+6, lambdacurr, infom); 00805 fb->f2[0] = data[0]; 00806 fb->f2[1] = data[1]; 00807 fb->f2[2] = data[2]; 00808 fb->t2[0] = data[3]; 00809 fb->t2[1] = data[4]; 00810 fb->t2[2] = data[5]; 00811 } 00812 00813 Jcopyrow += infom * 12; 00814 } 00815 00816 lambdacurr += infom; 00817 } 00818 } 00819 } 00820 00821 { 00822 IFTIMING (dTimerNow ("compute velocity update")); 00823 // compute the velocity update: 00824 // add stepsize * invM * fe to the body velocity 00825 const dReal *invIrow = invI; 00826 dxBody *const *const bodyend = body + nb; 00827 for (dxBody *const *bodycurr = body; bodycurr != bodyend; invIrow += 12, bodycurr++) { 00828 dxBody *b = *bodycurr; 00829 dReal body_invMass_mul_stepsize = stepsize * b->invMass; 00830 for (int j=0; j<3; j++) { 00831 b->lvel[j] += body_invMass_mul_stepsize * b->facc[j]; 00832 b->tacc[j] *= stepsize; 00833 } 00834 dMultiplyAdd0_331 (b->avel, invIrow, b->tacc); 00835 } 00836 } 00837 00838 { 00839 // update the position and orientation from the new linear/angular velocity 00840 // (over the given timestep) 00841 IFTIMING (dTimerNow ("update position")); 00842 dxBody *const *const bodyend = body + nb; 00843 for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) { 00844 dxBody *b = *bodycurr; 00845 dxStepBody (b,stepsize); 00846 } 00847 } 00848 00849 { 00850 IFTIMING (dTimerNow ("tidy up")); 00851 // zero all force accumulators 00852 dxBody *const *const bodyend = body + nb; 00853 for (dxBody *const *bodycurr = body; bodycurr != bodyend; bodycurr++) { 00854 dxBody *b = *bodycurr; 00855 dSetZero (b->facc,3); 00856 dSetZero (b->tacc,3); 00857 } 00858 } 00859 00860 IFTIMING (dTimerEnd()); 00861 IFTIMING (if (m > 0) dTimerReport (stdout,1)); 00862 IFBENCHMARKING ( benchmarkIteration( world ) ); 00863 } 00864 00865 static size_t EstimateParallelSOR_LCPMemoryRequirements(int m, int n) 00866 { 00867 size_t res = dEFFICIENT_SIZE(sizeof(dReal) * 4 * m) * 4; // for ij 00868 res += dEFFICIENT_SIZE(sizeof(dReal) * 4 * m) * 4; // for j 00869 res += dEFFICIENT_SIZE(sizeof(dReal) * m); // for adcfm 00870 res += dEFFICIENT_SIZE(sizeof(dReal) * m); // for rhs 00871 res += dEFFICIENT_SIZE(sizeof(dReal) * m) * 2; // for lohi 00872 res += dEFFICIENT_SIZE(sizeof(int) * m * 4); // for bodyIDs 00873 res += dEFFICIENT_SIZE(sizeof(int) * m); // for fIDs 00874 res += dEFFICIENT_SIZE(sizeof(dReal) * m); // for lambda 00875 00876 res += dEFFICIENT_SIZE(sizeof(dReal) * m); // for iMass 00877 res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4) * 3; // for i0 00878 res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4); // for bodyFAcc 00879 res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4); // for bodyTAcc 00880 res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4) * (size_t)ParallelOptions::MAXBODYREPETITION; // for bodyFAccReduction 00881 res += dEFFICIENT_SIZE(sizeof(dReal) * m * 4) * (size_t)ParallelOptions::MAXBODYREPETITION; // for bodyTAccReduction 00882 00883 return res; 00884 } 00885 00886 size_t dxEstimateParallelStepMemoryRequirements ( 00887 dxBody * const *body, int nb, dxJoint * const *_joint, int _nj) 00888 { 00889 int nj, m, mfb; 00890 00891 { 00892 int njcurr = 0, mcurr = 0, mfbcurr = 0; 00893 dxJoint::SureMaxInfo info; 00894 dxJoint *const *const _jend = _joint + _nj; 00895 for (dxJoint *const *_jcurr = _joint; _jcurr != _jend; _jcurr++) { 00896 dxJoint *j = *_jcurr; 00897 j->getSureMaxInfo (&info); 00898 00899 int jm = info.max_m; 00900 if (jm > 0) { 00901 njcurr++; 00902 00903 mcurr += jm; 00904 if (j->feedback) 00905 mfbcurr += jm; 00906 } 00907 } 00908 nj = njcurr; m = mcurr; mfb = mfbcurr; 00909 } 00910 00911 size_t res = 0; 00912 00913 res += dEFFICIENT_SIZE(sizeof(dReal) * 3 * 4 * nb); // for invI 00914 res += dEFFICIENT_SIZE(sizeof(dReal) * nb); // for invM 00915 00916 { 00917 size_t sub1_res1 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * _nj); // for initial jointiinfos 00918 size_t sub1_res2 = dEFFICIENT_SIZE(sizeof(dJointWithInfo1) * nj); // for shrunk jointiinfos 00919 if (m > 0) { 00920 sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 12 * m); // for J 00921 sub1_res2 += 4 * dEFFICIENT_SIZE(sizeof(dReal) * m); // for cfm, lo, hi, rhs 00922 sub1_res2 += dEFFICIENT_SIZE(sizeof(int) * 12 * m); // for jb FIXME: shoulbe be 2 not 12? 00923 sub1_res2 += dEFFICIENT_SIZE(sizeof(int) * m); // for findex 00924 sub1_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 12 * mfb); // for Jcopy 00925 00926 { 00927 size_t sub2_res1 = dEFFICIENT_SIZE(sizeof(dReal) * m); // for c 00928 { 00929 size_t sub3_res1 = dEFFICIENT_SIZE(sizeof(dReal) * 6 * nb); // for tmp1 00930 00931 size_t sub3_res2 = 0; 00932 00933 sub2_res1 += (sub3_res1 >= sub3_res2) ? sub3_res1 : sub3_res2; 00934 } 00935 00936 size_t sub2_res2 = dEFFICIENT_SIZE(sizeof(dReal) * m); // for lambda 00937 sub2_res2 += dEFFICIENT_SIZE(sizeof(dReal) * 6 * nb); // for cforce 00938 { 00939 size_t sub3_res1 = EstimateParallelSOR_LCPMemoryRequirements(m,nj); // for SOR_LCP 00940 size_t sub3_res2 = 0; 00941 sub2_res2 += (sub3_res1 >= sub3_res2) ? sub3_res1 : sub3_res2; 00942 } 00943 00944 sub1_res2 += (sub2_res1 >= sub2_res2) ? sub2_res1 : sub2_res2; 00945 } 00946 } 00947 00948 res += 3 * ((sub1_res1 >= sub1_res2) ? sub1_res1 : sub1_res2); 00949 } 00950 00951 return res; 00952 } 00953 00954 static size_t BuildIslandAndEstimateStepperMemoryRequirements(dxWorldProcessContext *context, 00955 dxWorld *world, dReal stepsize, dmemestimate_fn_t stepperestimate) 00956 { 00957 const int sizeelements = 2; 00958 size_t maxreq = 0; 00959 00960 // handle auto-disabling of bodies 00961 dInternalHandleAutoDisabling (world,stepsize); 00962 00963 int nb = world->nb, nj = world->nj; 00964 // Make array for island body/joint counts 00965 int *islandsizes = context->AllocateArray<int>(sizeelements); 00966 00967 // make arrays for body and joint lists (for a single island) to go into 00968 dxBody **body = context->AllocateArray<dxBody *>(nb); 00969 dxJoint **joint = context->AllocateArray<dxJoint *>(nj); 00970 00971 BEGIN_STATE_SAVE(context, stackstate) { 00972 // allocate a stack of unvisited bodies in the island. the maximum size of 00973 // the stack can be the lesser of the number of bodies or joints, because 00974 // new bodies are only ever added to the stack by going through untagged 00975 // joints. all the bodies in the stack must be tagged! 00976 00977 int stackalloc = (nj < nb) ? nj : nb; 00978 dxBody **stack = context->AllocateArray<dxBody *>(stackalloc); 00979 00980 { 00981 // set all body/joint tags to 0 00982 for (dxBody *b=world->firstbody; b; b=(dxBody*)b->next) b->tag = 0; 00983 for (dxJoint *j=world->firstjoint; j; j=(dxJoint*)j->next) j->tag = 0; 00984 } 00985 00986 int bcount = 0, jcount =0; 00987 00988 dxBody **bodystart = body; 00989 dxJoint **jointstart = joint; 00990 for (dxBody *bb=world->firstbody; bb; bb=(dxBody*)bb->next) { 00991 // get bb = the next enabled, untagged body, and tag it 00992 if (!bb->tag) { 00993 if (!(bb->flags & dxBodyDisabled)) { 00994 bb->tag = 1; 00995 00996 dxBody **bodycurr = bodystart; 00997 dxJoint **jointcurr = jointstart; 00998 00999 // tag all bodies and joints starting from bb. 01000 *bodycurr++ = bb; 01001 01002 int stacksize = 0; 01003 dxBody *b = bb; 01004 01005 while (true) { 01006 // traverse and tag all body's joints, add untagged connected bodies 01007 // to stack 01008 for (dxJointNode *n=b->firstjoint; n; n=n->next) { 01009 dxJoint *njoint = n->joint; 01010 if (!njoint->tag) { 01011 if (njoint->isEnabled()) { 01012 njoint->tag = 1; 01013 *jointcurr++ = njoint; 01014 01015 dxBody *nbody = n->body; 01016 // Body disabled flag is not checked here. This is how auto-enable works. 01017 if (nbody && nbody->tag <= 0) { 01018 nbody->tag = 1; 01019 // Make sure all bodies are in the enabled state. 01020 nbody->flags &= ~dxBodyDisabled; 01021 stack[stacksize++] = nbody; 01022 } 01023 } else { 01024 njoint->tag = -1; // Used in Step to prevent search over disabled joints (not needed for QuickStep so far) 01025 } 01026 } 01027 } 01028 dIASSERT(stacksize <= world->nb); 01029 dIASSERT(stacksize <= world->nj); 01030 01031 if (stacksize == 0) { 01032 break; 01033 } 01034 01035 b = stack[--stacksize]; // pop body off stack 01036 *bodycurr++ = b; // put body on body list 01037 } 01038 01039 bcount += (bodycurr - bodystart); 01040 jcount += (jointcurr - jointstart); 01041 01042 bodystart = bodycurr; 01043 jointstart = jointcurr; 01044 } else { 01045 bb->tag = -1; // Not used so far (assigned to retain consistency with joints) 01046 } 01047 } 01048 } 01049 01050 islandsizes[0] = bcount; 01051 islandsizes[1] = jcount; 01052 01053 size_t islandreq = stepperestimate(body, bcount, joint, jcount); 01054 maxreq = islandreq; 01055 01056 } END_STATE_SAVE(context, stackstate); 01057 01058 # ifndef dNODEBUG 01059 // if debugging, check that all objects (except for disabled bodies, 01060 // unconnected joints, and joints that are connected to disabled bodies) 01061 // were tagged. 01062 { 01063 for (dxBody *b=world->firstbody; b; b=(dxBody*)b->next) { 01064 if (b->flags & dxBodyDisabled) { 01065 if (b->tag > 0) dDebug (0,"disabled body tagged"); 01066 } 01067 else { 01068 if (b->tag <= 0) dDebug (0,"enabled body not tagged"); 01069 } 01070 } 01071 for (dxJoint *j=world->firstjoint; j; j=(dxJoint*)j->next) { 01072 if ( (( j->node[0].body && (j->node[0].body->flags & dxBodyDisabled)==0 ) || 01073 (j->node[1].body && (j->node[1].body->flags & dxBodyDisabled)==0) ) 01074 && 01075 j->isEnabled() ) { 01076 if (j->tag <= 0) dDebug (0,"attached enabled joint not tagged"); 01077 } 01078 else { 01079 if (j->tag > 0) dDebug (0,"unattached or disabled joint tagged"); 01080 } 01081 } 01082 } 01083 # endif 01084 01085 size_t const *islandreqs = NULL; 01086 int islandcount = 1; 01087 context->SavePreallocations(islandcount, islandsizes, body, joint, islandreqs); 01088 01089 return maxreq; 01090 } 01091 01092 static size_t EstimateIslandsProcessingMemoryRequirements(dxWorld *world, size_t &sesize) 01093 { 01094 size_t res = 0; 01095 01096 size_t islandcounts = dEFFICIENT_SIZE(world->nb * 2 * sizeof(int)); 01097 res += islandcounts; 01098 01099 size_t bodiessize = dEFFICIENT_SIZE(world->nb * sizeof(dxBody*)); 01100 size_t jointssize = dEFFICIENT_SIZE(world->nj * sizeof(dxJoint*)); 01101 res += bodiessize + jointssize; 01102 01103 sesize = (bodiessize < jointssize) ? bodiessize : jointssize; 01104 return res; 01105 } 01106 01107 static size_t AdjustArenaSizeForReserveRequirements(size_t arenareq, float rsrvfactor, unsigned rsrvminimum) 01108 { 01109 float scaledarena = arenareq * rsrvfactor; 01110 size_t adjustedarena = (scaledarena < SIZE_MAX) ? (size_t)scaledarena : SIZE_MAX; 01111 size_t boundedarena = (adjustedarena > rsrvminimum) ? adjustedarena : (size_t)rsrvminimum; 01112 return dEFFICIENT_SIZE(boundedarena); 01113 } 01114 01115 static dxWorldProcessContext *InternalReallocateWorldProcessContext ( 01116 dxWorldProcessContext *oldcontext, size_t memreq, 01117 const dxWorldProcessMemoryManager *memmgr, float rsrvfactor, unsigned rsrvminimum) 01118 { 01119 dxWorldProcessContext *context = oldcontext; 01120 bool allocsuccess = false; 01121 01122 size_t oldarenasize; 01123 void *pOldArena; 01124 01125 do { 01126 size_t oldmemsize = oldcontext ? oldcontext->GetMemorySize() : 0; 01127 if (!oldcontext || oldmemsize < memreq) { 01128 oldarenasize = oldcontext ? dxWorldProcessContext::MakeArenaSize(oldmemsize) : 0; 01129 pOldArena = oldcontext ? oldcontext->m_pArenaBegin : NULL; 01130 01131 if (!dxWorldProcessContext::IsArenaPossible(memreq)) { 01132 break; 01133 } 01134 01135 size_t arenareq = dxWorldProcessContext::MakeArenaSize(memreq); 01136 size_t arenareq_with_reserve = AdjustArenaSizeForReserveRequirements(arenareq, rsrvfactor, rsrvminimum); 01137 size_t memreq_with_reserve = memreq + (arenareq_with_reserve - arenareq); 01138 01139 if (oldcontext) { 01140 01141 if (oldcontext->m_pAllocCurrent != oldcontext->m_pAllocBegin) { 01142 01143 // Save old efficient offset and meaningful data size for the case if 01144 // reallocation throws the block at different efficient offset 01145 size_t oldcontextofs = (size_t)oldcontext - (size_t)pOldArena; 01146 size_t datasize = (size_t)oldcontext->m_pAllocCurrent - (size_t)oldcontext; 01147 01148 // Extra EFFICIENT_ALIGNMENT bytes might be needed after re-allocation with different alignment 01149 size_t shrunkarenasize = dEFFICIENT_SIZE(datasize + oldcontextofs) + EFFICIENT_ALIGNMENT; 01150 if (shrunkarenasize < oldarenasize) { 01151 01152 void *pShrunkOldArena = oldcontext->m_pArenaMemMgr->m_fnShrink(pOldArena, oldarenasize, shrunkarenasize); 01153 if (!pShrunkOldArena) { 01154 break; 01155 } 01156 01157 // In case if shrinking is not supported and memory manager had to allocate-copy-free 01158 if (pShrunkOldArena != pOldArena) { 01159 dxWorldProcessContext *shrunkcontext = (dxWorldProcessContext *)dEFFICIENT_PTR(pShrunkOldArena); 01160 01161 // Preform data shift in case if efficient alignment of new block 01162 // does not match that of old block 01163 size_t shrunkcontextofs = (size_t)shrunkcontext - (size_t)pShrunkOldArena; 01164 size_t offsetdiff = oldcontextofs - shrunkcontextofs; 01165 if (offsetdiff != 0) { 01166 memmove(shrunkcontext, (void *)((size_t)shrunkcontext + offsetdiff), datasize); 01167 } 01168 01169 // Make sure allocation pointers are valid - that is necessary to 01170 // be able to calculate size and free old arena later 01171 size_t shrunkdatasize = dxWorldProcessContext::MakeBufferSize(shrunkarenasize); 01172 void *blockbegin = dEFFICIENT_PTR(shrunkcontext + 1); 01173 void *blockend = dOFFSET_EFFICIENTLY(blockbegin, shrunkdatasize); 01174 shrunkcontext->m_pAllocBegin = blockbegin; 01175 shrunkcontext->m_pAllocEnd = blockend; 01176 shrunkcontext->m_pAllocCurrent = blockend; // -- set to end to prevent possibility of further allocation 01177 shrunkcontext->m_pArenaBegin = pShrunkOldArena; 01178 01179 size_t stOffset = ((size_t)pShrunkOldArena - (size_t)pOldArena) - offsetdiff; 01180 shrunkcontext->OffsetPreallocations(stOffset); 01181 01182 oldcontext = shrunkcontext; 01183 01184 // Reassign to old arena variables for potential freeing at exit 01185 pOldArena = pShrunkOldArena; 01186 } 01187 01188 // Reassign to old arena variables for potential freeing at exit 01189 oldarenasize = shrunkarenasize; 01190 } 01191 01192 } else { 01193 oldcontext->m_pArenaMemMgr->m_fnFree(pOldArena, oldarenasize); 01194 oldcontext = NULL; 01195 01196 // Zero variables to avoid another freeing on exit 01197 pOldArena = NULL; 01198 oldarenasize = 0; 01199 } 01200 } 01201 01202 // Allocate new arena 01203 void *pNewArena = memmgr->m_fnAlloc(arenareq_with_reserve); 01204 if (!pNewArena) { 01205 break; 01206 } 01207 01208 context = (dxWorldProcessContext *)dEFFICIENT_PTR(pNewArena); 01209 01210 void *blockbegin = dEFFICIENT_PTR(context + 1); 01211 void *blockend = dOFFSET_EFFICIENTLY(blockbegin, memreq_with_reserve); 01212 01213 context->m_pAllocBegin = blockbegin; 01214 context->m_pAllocEnd = blockend; 01215 context->m_pArenaBegin = pNewArena; 01216 context->m_pAllocCurrent = blockbegin; 01217 01218 if (oldcontext) { 01219 context->CopyPreallocations(oldcontext); 01220 } else { 01221 context->ClearPreallocations(); 01222 } 01223 01224 context->m_pArenaMemMgr = memmgr; 01225 context->m_pPreallocationcContext = oldcontext; 01226 } 01227 01228 allocsuccess = true; 01229 } while (false); 01230 01231 if (!allocsuccess) { 01232 if (pOldArena) { 01233 dIASSERT(oldcontext); 01234 oldcontext->m_pArenaMemMgr->m_fnFree(pOldArena, oldarenasize); 01235 } 01236 context = NULL; 01237 } 01238 01239 return context; 01240 } 01241 01242 bool dxReallocateParallelWorldProcessContext (dxWorld *world, dReal stepsize, dmemestimate_fn_t stepperestimate) 01243 { 01244 dxStepWorkingMemory *wmem = AllocateOnDemand(world->wmem); 01245 if (!wmem) return false; 01246 01247 dxWorldProcessContext *oldcontext = wmem->GetWorldProcessingContext(); 01248 dIASSERT (!oldcontext || oldcontext->IsStructureValid()); 01249 01250 const dxWorldProcessMemoryReserveInfo *reserveinfo = wmem->SureGetMemoryReserveInfo(); 01251 const dxWorldProcessMemoryManager *memmgr = wmem->SureGetMemoryManager(); 01252 01253 dxWorldProcessContext *context = oldcontext; 01254 01255 size_t sesize; 01256 size_t islandsreq = EstimateIslandsProcessingMemoryRequirements(world, sesize); 01257 dIASSERT(islandsreq == dEFFICIENT_SIZE(islandsreq)); 01258 dIASSERT(sesize == dEFFICIENT_SIZE(sesize)); 01259 01260 size_t stepperestimatereq = islandsreq + sesize; 01261 context = InternalReallocateWorldProcessContext(context, stepperestimatereq, memmgr, 1.0f, reserveinfo->m_uiReserveMinimum); 01262 01263 if (context) 01264 { 01265 size_t stepperreq = BuildIslandAndEstimateStepperMemoryRequirements(context, world, stepsize, stepperestimate); 01266 dIASSERT(stepperreq == dEFFICIENT_SIZE(stepperreq)); 01267 01268 size_t memreq = stepperreq + islandsreq; 01269 context = InternalReallocateWorldProcessContext(context, memreq, memmgr, reserveinfo->m_fReserveFactor, reserveinfo->m_uiReserveMinimum); 01270 } 01271 01272 wmem->SetWorldProcessingContext(context); 01273 return context != NULL; 01274 } 01275 01276