// openmp_kernels: OpenMP implementations of the parallel PGS solver kernels.
00001 #include <parallel_common.h> 00002 #include <parallel_reduce.h> 00003 #include <parallel_math.h> 00004 00005 #include "openmp_kernels.h" 00006 #include "openmp_utils.h" 00007 00008 #include <omp.h> 00009 #include <limits> 00010 00011 #include "parallel_kernels.h" 00012 00013 namespace parallel_ode 00014 { 00015 00016 template <typename T> 00017 void ompZeroVector( T *buffer, int bufferSize ) 00018 { 00019 //#pragma omp parallel for 00020 for(int index = 0; index < bufferSize; ++index) { 00021 buffer[ index ] = parallel_zero<T>(); 00022 } 00023 00024 } 00025 00026 template <typename T> 00027 void ompPGSReduce( typename vec4<T>::Type *fc0, 00028 typename vec4<T>::Type *fc1, 00029 typename vec4<T>::Type *fc0_reduction, 00030 typename vec4<T>::Type *fc1_reduction, 00031 ReduceStrategy* reduceStrategy ) 00032 { 00033 typedef typename vec4<T>::Type Vec4T; 00034 00035 const int bodySize = reduceStrategy->getBodySize( ); 00036 const int reductionSize = reduceStrategy->getBodySizeWithReduction( ); 00037 const int reductionStride = reduceStrategy->getBodyOffsetStride( ); 00038 00039 const Vec4T zeroVec = make_vec4( (T)0.0 ); 00040 00041 //#pragma omp parallel for 00042 for(int index = 0; index < bodySize; ++index) { 00043 int nextIndex = index + reductionStride; 00044 00045 Vec4T sum0 = fc0_reduction[ index ]; 00046 Vec4T sum1 = fc1_reduction[ index ]; 00047 00048 while(nextIndex < reductionSize) { 00049 sum0 += fc0_reduction[ nextIndex ]; 00050 fc0_reduction[ nextIndex ] = zeroVec; 00051 sum1 += fc1_reduction[ nextIndex ]; 00052 fc1_reduction[ nextIndex ] = zeroVec; 00053 nextIndex += reductionStride; 00054 } 00055 00056 fc0[ index ] += sum0; 00057 fc1[ index ] += sum1; 00058 } 00059 00060 } 00061 00062 template <typename T> 00063 void ompPGSSolve( int4 *bodyIDs, 00064 int *fIDs, 00065 typename vec4<T>::Type *j, 00066 typename vec4<T>::Type *ij, 00067 typename vec4<T>::Type *fc0, 00068 typename vec4<T>::Type *fc1, 00069 typename vec4<T>::Type *fc0_reduction, 00070 
typename vec4<T>::Type *fc1_reduction, 00071 T* lambda, 00072 T* adcfm, 00073 T* rhs, 00074 T* hilo, 00075 int offset, int numConstraints, bool bUseAtomics, 00076 int bStride, int cStride) 00077 { 00078 typedef typename vec4<T>::Type Vec4T; 00079 00080 //#pragma omp parallel for 00081 for(int localIndex = 0; localIndex < numConstraints; ++localIndex) { 00082 00083 const int index = localIndex + offset; 00084 00085 T old_lambda = lambda[ index ]; 00086 00087 int4 bodyID = bodyIDs[ index ]; 00088 00089 Vec4T fc00 = fc0[ bodyID.x ]; 00090 Vec4T fc01 = fc1[ bodyID.x ]; 00091 Vec4T fc10 = make_vec4( (T)0.0 ); 00092 Vec4T fc11 = make_vec4( (T)0.0 ); 00093 00094 Vec4T j0_temp = j[ index ]; 00095 Vec4T j1_temp = j[ C_ID(index,1) ]; 00096 Vec4T j2_temp = j[ C_ID(index,2) ]; 00097 Vec4T j3_temp = j[ C_ID(index,3) ]; 00098 00099 T delta = rhs[ index ] - old_lambda * adcfm[ index ]; 00100 00101 if( bodyID.y >= 0 ) { 00102 fc10 = fc0[ bodyID.y ]; 00103 fc11 = fc1[ bodyID.y ]; 00104 } 00105 00106 { 00107 delta -= dot( fc00, j0_temp ); 00108 delta -= dot( fc01, j1_temp ); 00109 if (bodyID.y >= 0) { 00110 delta -= dot( fc10, j2_temp ); 00111 delta -= dot( fc11, j3_temp ); 00112 } 00113 } 00114 00115 { 00116 T lo_act = hilo[ index ]; 00117 T hi_act = hilo[ C_ID(index,1) ]; 00118 int fID = fIDs[ index ]; 00119 00120 if (fID >= 0) { 00121 hi_act = abs( hi_act * lambda[ fID ]); 00122 lo_act = -hi_act; 00123 } 00124 00125 T new_lambda = old_lambda + delta; 00126 T final_lambda = new_lambda; 00127 00128 if (new_lambda < lo_act) { 00129 delta = lo_act-old_lambda; 00130 final_lambda = lo_act; 00131 } 00132 else if (new_lambda > hi_act) { 00133 delta = hi_act-old_lambda; 00134 final_lambda = hi_act; 00135 } 00136 lambda[ index ] = final_lambda; 00137 } 00138 00139 j0_temp = ij[ index ]; 00140 j1_temp = ij[ C_ID(index,1) ]; 00141 j2_temp = ij[ C_ID(index,2) ]; 00142 j3_temp = ij[ C_ID(index,3) ]; 00143 00144 { 00145 j0_temp *= delta; 00146 j1_temp *= delta; 00147 00148 // STORE 00149 if( 
bUseAtomics ) { 00150 myAtomicVecAdd<T>(fc0[ bodyID.x ], j0_temp); 00151 myAtomicVecAdd<T>(fc1[ bodyID.x ], j1_temp); 00152 } else { 00153 fc0_reduction[ bodyID.z ] = j0_temp; 00154 fc1_reduction[ bodyID.z ] = j1_temp; 00155 } 00156 00157 if( bodyID.y >= 0 ) { 00158 j2_temp *= delta; 00159 j3_temp *= delta; 00160 00161 // STORE 00162 if( bUseAtomics ) { 00163 myAtomicVecAdd<T>(fc0[ bodyID.y ], j2_temp); 00164 myAtomicVecAdd<T>(fc1[ bodyID.y ], j3_temp); 00165 } else { 00166 fc0_reduction[ bodyID.w ] = j2_temp; 00167 fc1_reduction[ bodyID.w ] = j3_temp; 00168 } 00169 } 00170 } 00171 } 00172 } 00173 00174 00175 // Explicit specializations needed to generate code 00176 template void ompZeroVector<dReal4>( dReal4 *buffer, 00177 int bufferSize ); 00178 00179 template void ompPGSReduce<dReal>( dReal4 *fc0, 00180 dReal4 *fc1, 00181 dReal4 *fc0_reduction, 00182 dReal4 *fc1_reduction, 00183 ReduceStrategy* reduceStrategy ); 00184 00185 template void ompPGSSolve<dReal>( int4 *bodyIDs, 00186 int *fIDs, 00187 dReal4 *j, 00188 dReal4 *ij, 00189 dReal4 *fc0, 00190 dReal4 *fc1, 00191 dReal4 *fc0_reduction, 00192 dReal4 *fc1_reduction, 00193 dReal *lambda, 00194 dReal *adcfm, 00195 dReal *rhs, 00196 dReal *hilo, 00197 int batch, int numConstraints, bool bUseAtomics, int bStride, int cStride ); 00198 00199 }