00001 #ifndef PARALLEL_KERNELS_NONTEMPLATE_H 00002 #define PARALLEL_KERNELS_NONTEMPLATE_H 00003 00004 #define C_ID(index,i) (index+(cStride*(i))) 00005 #define B_ID(index,i) (index+(bStride*(i))) 00006 00007 dxGlobal void 00008 parallelZero( dxDeviceData dReal *buffer, int bufferSize ) 00009 { 00010 int index = dxGlobalIdxX(); 00011 00012 if( index < bufferSize ) 00013 buffer[ index ] = dParallelZero; 00014 } 00015 00016 dxGlobal void 00017 parallelZero4( dxDeviceData dReal4 *buffer, int bufferSize ) 00018 { 00019 int index = dxGlobalIdxX(); 00020 00021 if( index < bufferSize ) 00022 buffer[ index ] = make_real4( dParallelZero ); 00023 } 00024 00025 dxGlobal void 00026 parallelSORLCP( dxDeviceData dReal4 *fc0_reduction, 00027 dxDeviceData dReal4 *fc1_reduction, 00028 dxDeviceData dReal *lambda, 00029 dxDeviceData const int4 *bodyIDs, 00030 dxDeviceData const int *fIDs, 00031 dxDeviceData const dReal4 *j, 00032 dxDeviceData const dReal4 *ij, 00033 dxDeviceData const dReal4 *fc0, 00034 dxDeviceData const dReal4 *fc1, 00035 dxDeviceData const dReal *adcfm, 00036 dxDeviceData const dReal *rhs, 00037 dxDeviceData const dReal *lohi, 00038 const int offset, 00039 const int numConstraints, 00040 const int bStride, 00041 const int cStride ) 00042 { 00043 int index = dxGlobalIdxX(); 00044 00045 if( index >= numConstraints ) 00046 return; 00047 00048 index += offset; 00049 00050 dReal old_lambda = lambda[ index ]; 00051 00052 int4 bodyID = bodyIDs[ index ]; 00053 00054 dReal4 fc00 = fc0[ bodyID.x ]; 00055 dReal4 fc01 = fc1[ bodyID.x ]; 00056 dReal4 fc10 = make_real4( dParallelZero ); 00057 dReal4 fc11 = make_real4( dParallelZero ); 00058 00059 dReal4 j0_temp = j[ index ]; 00060 dReal4 j1_temp = j[ C_ID(index,1) ]; 00061 00062 dReal delta = rhs[ index ] - old_lambda * adcfm[ index ]; 00063 00064 if( bodyID.y >= 0 ) { 00065 fc10 = fc0[ bodyID.y ]; 00066 fc11 = fc1[ bodyID.y ]; 00067 } 00068 00069 { 00070 delta -= dot( fc00, j0_temp ); 00071 delta -= dot( fc01, j1_temp ); 00072 if (bodyID.y >= 0) { 00073 dReal4 j2_temp = j[ C_ID(index,2) ]; 00074 dReal4 j3_temp = j[ C_ID(index,3) ]; 00075 delta -= dot( fc10, j2_temp ); 00076 delta -= dot( fc11, j3_temp ); 00077 } 00078 } 00079 00080 { 00081 dReal lo_act = lohi[ index ]; 00082 dReal hi_act = lohi[ C_ID(index,1) ]; 00083 00084 int fID = fIDs[ index ]; 00085 if (fID >= 0) { 00086 hi_act = fabs( hi_act * lambda[ fID ]); 00087 lo_act = -hi_act; 00088 } 00089 00090 dReal new_lambda = old_lambda + delta; 00091 dReal final_lambda = new_lambda; 00092 00093 if (new_lambda < lo_act) { 00094 delta = lo_act-old_lambda; 00095 final_lambda = lo_act; 00096 } 00097 else if (new_lambda > hi_act) { 00098 delta = hi_act-old_lambda; 00099 final_lambda = hi_act; 00100 } 00101 lambda[ index ] = final_lambda; 00102 } 00103 00104 j0_temp = ij[ index ]; 00105 j1_temp = ij[ C_ID(index,1) ]; 00106 00107 { 00108 j0_temp *= delta; 00109 j1_temp *= delta; 00110 00111 fc0_reduction[ bodyID.z ] += j0_temp; 00112 fc1_reduction[ bodyID.z ] += j1_temp; 00113 00114 if( bodyID.y >= 0 ) { 00115 dReal4 j2_temp = ij[ C_ID(index,2) ]; 00116 dReal4 j3_temp = ij[ C_ID(index,3) ]; 00117 00118 j2_temp *= delta; 00119 j3_temp *= delta; 00120 00121 fc0_reduction[ bodyID.w ] += j2_temp; 00122 fc1_reduction[ bodyID.w ] += j3_temp; 00123 } 00124 } 00125 } 00126 00127 dxGlobal void 00128 parallelReduce( dxDeviceData dReal4 *fc0, 00129 dxDeviceData dReal4 *fc1, 00130 dxDeviceData const dReal4 *fc0_reduction, 00131 dxDeviceData const dReal4 *fc1_reduction, 00132 const int reductionStride, 00133 const int bodySize, 00134 const int reductionSize ) 00135 { 00136 const int index = dxGlobalIdxX(); 00137 if( index >= bodySize ) return; 00138 00139 int nextIndex = index + reductionStride; 00140 00141 dReal4 sum0 = fc0_reduction[ index ]; 00142 dReal4 sum1 = fc1_reduction[ index ]; 00143 00144 while(nextIndex < reductionSize) { 00145 sum0 += fc0_reduction[ nextIndex ]; 00146 sum1 += fc1_reduction[ nextIndex ]; 00147 nextIndex += reductionStride; 00148 } 00149 00150 fc0[ index ] += sum0; 00151 fc1[ index ] += sum1; 00152 } 00153 00154 #endif