parallel_quickstep: parallel_kernels_nontemplate.h Source File

00001 #ifndef PARALLEL_KERNELS_NONTEMPLATE_H
00002 #define PARALLEL_KERNELS_NONTEMPLATE_H
00003 
00004 #define C_ID(index,i)     (index+(cStride*(i)))
00005 #define B_ID(index,i)     (index+(bStride*(i)))
00006 
00007 dxGlobal void
00008 parallelZero( dxDeviceData dReal *buffer, int bufferSize )
00009 {
00010   int index = dxGlobalIdxX();
00011 
00012   if( index < bufferSize )
00013     buffer[ index ] = dParallelZero;
00014 }
00015 
00016 dxGlobal void
00017 parallelZero4( dxDeviceData dReal4 *buffer, int bufferSize )
00018 {
00019   int index = dxGlobalIdxX();
00020 
00021   if( index < bufferSize )
00022     buffer[ index ] = make_real4( dParallelZero );
00023 }
00024 
00025 dxGlobal void
00026 parallelSORLCP( dxDeviceData dReal4 *fc0_reduction,
00027                 dxDeviceData dReal4 *fc1_reduction,
00028                 dxDeviceData dReal *lambda,
00029                 dxDeviceData const int4 *bodyIDs,
00030                 dxDeviceData const int *fIDs,
00031                 dxDeviceData const dReal4 *j,
00032                 dxDeviceData const dReal4 *ij,
00033                 dxDeviceData const dReal4 *fc0,
00034                 dxDeviceData const dReal4 *fc1,
00035                 dxDeviceData const dReal *adcfm,
00036                 dxDeviceData const dReal *rhs,
00037                 dxDeviceData const dReal *lohi,
00038                 const int offset,
00039                 const int numConstraints,
00040                 const int bStride,
00041                 const int cStride )
00042 {
00043   int index = dxGlobalIdxX();
00044 
00045   if( index >= numConstraints )
00046     return;
00047 
00048   index += offset;
00049 
00050   dReal old_lambda = lambda[ index ];
00051 
00052   int4 bodyID = bodyIDs[ index ];
00053 
00054   dReal4 fc00 = fc0[ bodyID.x ];
00055   dReal4 fc01 = fc1[ bodyID.x ];
00056   dReal4 fc10 = make_real4( dParallelZero );
00057   dReal4 fc11 = make_real4( dParallelZero );
00058 
00059   dReal4 j0_temp = j[      index    ];
00060   dReal4 j1_temp = j[ C_ID(index,1) ];
00061 
00062   dReal  delta = rhs[ index ] - old_lambda * adcfm[ index ];
00063 
00064   if( bodyID.y >= 0 )  {
00065     fc10 = fc0[ bodyID.y ];
00066     fc11 = fc1[ bodyID.y ];
00067   }
00068 
00069   {
00070     delta -= dot( fc00, j0_temp );
00071     delta -= dot( fc01, j1_temp );
00072     if (bodyID.y >= 0) {
00073       dReal4 j2_temp = j[ C_ID(index,2) ];
00074       dReal4 j3_temp = j[ C_ID(index,3) ];
00075       delta -= dot( fc10, j2_temp );
00076       delta -= dot( fc11, j3_temp );
00077     }
00078   }
00079 
00080   {
00081     dReal lo_act = lohi[ index ];
00082     dReal hi_act = lohi[ C_ID(index,1) ];
00083 
00084     int fID = fIDs[ index ];
00085     if (fID >= 0) {
00086       hi_act = fabs( hi_act * lambda[ fID ]);
00087       lo_act = -hi_act;
00088     }
00089 
00090     dReal new_lambda = old_lambda + delta;
00091     dReal final_lambda = new_lambda;
00092 
00093     if (new_lambda < lo_act) {
00094       delta = lo_act-old_lambda;
00095       final_lambda = lo_act;
00096     }
00097     else if (new_lambda > hi_act) {
00098       delta = hi_act-old_lambda;
00099       final_lambda = hi_act;
00100     }
00101     lambda[ index ] = final_lambda;
00102   }
00103 
00104   j0_temp = ij[      index    ];
00105   j1_temp = ij[ C_ID(index,1) ];
00106 
00107   {
00108     j0_temp *= delta;
00109     j1_temp *= delta;
00110 
00111     fc0_reduction[ bodyID.z ] += j0_temp;
00112     fc1_reduction[ bodyID.z ] += j1_temp;
00113 
00114     if( bodyID.y >= 0 ) {
00115       dReal4 j2_temp = ij[ C_ID(index,2) ];
00116       dReal4 j3_temp = ij[ C_ID(index,3) ];
00117 
00118       j2_temp *= delta;
00119       j3_temp *= delta;
00120 
00121       fc0_reduction[ bodyID.w ] += j2_temp;
00122       fc1_reduction[ bodyID.w ] += j3_temp;
00123     }
00124   }
00125 }
00126 
00127 dxGlobal void
00128 parallelReduce( dxDeviceData dReal4 *fc0,
00129                 dxDeviceData dReal4 *fc1,
00130                 dxDeviceData const dReal4 *fc0_reduction,
00131                 dxDeviceData const dReal4 *fc1_reduction,
00132                 const int reductionStride,
00133                 const int bodySize,
00134                 const int reductionSize )
00135 {
00136   const int index = dxGlobalIdxX();
00137   if( index >= bodySize ) return;
00138 
00139   int nextIndex = index + reductionStride;
00140 
00141   dReal4 sum0 = fc0_reduction[ index ];
00142   dReal4 sum1 = fc1_reduction[ index ];
00143 
00144   while(nextIndex < reductionSize) {
00145     sum0 += fc0_reduction[ nextIndex ];
00146     sum1 += fc1_reduction[ nextIndex ];
00147     nextIndex += reductionStride;
00148   }
00149 
00150   fc0[ index ] += sum0;
00151   fc1[ index ] += sum1;
00152 }
00153 
00154 #endif