00001 #include <parallel_common.h>
00002 #include <parallel_reduce.h>
00003 #include <parallel_math.h>
00004
00005 #include "openmp_kernels.h"
00006 #include "openmp_utils.h"
00007
00008 #include <omp.h>
00009 #include <limits>
00010
00011 #include "parallel_kernels.h"
00012
00013 namespace parallel_ode
00014 {
00015
00016 template <typename T>
00017 void ompZeroVector( T *buffer, int bufferSize )
00018 {
00019
00020 for(int index = 0; index < bufferSize; ++index) {
00021 buffer[ index ] = parallel_zero<T>();
00022 }
00023
00024 }
00025
00026 template <typename T>
00027 void ompPGSReduce( typename vec4<T>::Type *fc0,
00028 typename vec4<T>::Type *fc1,
00029 typename vec4<T>::Type *fc0_reduction,
00030 typename vec4<T>::Type *fc1_reduction,
00031 ReduceStrategy* reduceStrategy )
00032 {
00033 typedef typename vec4<T>::Type Vec4T;
00034
00035 const int bodySize = reduceStrategy->getBodySize( );
00036 const int reductionSize = reduceStrategy->getBodySizeWithReduction( );
00037 const int reductionStride = reduceStrategy->getBodyOffsetStride( );
00038
00039 const Vec4T zeroVec = make_vec4( (T)0.0 );
00040
00041
00042 for(int index = 0; index < bodySize; ++index) {
00043 int nextIndex = index + reductionStride;
00044
00045 Vec4T sum0 = fc0_reduction[ index ];
00046 Vec4T sum1 = fc1_reduction[ index ];
00047
00048 while(nextIndex < reductionSize) {
00049 sum0 += fc0_reduction[ nextIndex ];
00050 fc0_reduction[ nextIndex ] = zeroVec;
00051 sum1 += fc1_reduction[ nextIndex ];
00052 fc1_reduction[ nextIndex ] = zeroVec;
00053 nextIndex += reductionStride;
00054 }
00055
00056 fc0[ index ] += sum0;
00057 fc1[ index ] += sum1;
00058 }
00059
00060 }
00061
00062 template <typename T>
00063 void ompPGSSolve( int4 *bodyIDs,
00064 int *fIDs,
00065 typename vec4<T>::Type *j,
00066 typename vec4<T>::Type *ij,
00067 typename vec4<T>::Type *fc0,
00068 typename vec4<T>::Type *fc1,
00069 typename vec4<T>::Type *fc0_reduction,
00070 typename vec4<T>::Type *fc1_reduction,
00071 T* lambda,
00072 T* adcfm,
00073 T* rhs,
00074 T* hilo,
00075 int offset, int numConstraints, bool bUseAtomics,
00076 int bStride, int cStride)
00077 {
00078 typedef typename vec4<T>::Type Vec4T;
00079
00080
00081 for(int localIndex = 0; localIndex < numConstraints; ++localIndex) {
00082
00083 const int index = localIndex + offset;
00084
00085 T old_lambda = lambda[ index ];
00086
00087 int4 bodyID = bodyIDs[ index ];
00088
00089 Vec4T fc00 = fc0[ bodyID.x ];
00090 Vec4T fc01 = fc1[ bodyID.x ];
00091 Vec4T fc10 = make_vec4( (T)0.0 );
00092 Vec4T fc11 = make_vec4( (T)0.0 );
00093
00094 Vec4T j0_temp = j[ index ];
00095 Vec4T j1_temp = j[ C_ID(index,1) ];
00096 Vec4T j2_temp = j[ C_ID(index,2) ];
00097 Vec4T j3_temp = j[ C_ID(index,3) ];
00098
00099 T delta = rhs[ index ] - old_lambda * adcfm[ index ];
00100
00101 if( bodyID.y >= 0 ) {
00102 fc10 = fc0[ bodyID.y ];
00103 fc11 = fc1[ bodyID.y ];
00104 }
00105
00106 {
00107 delta -= dot( fc00, j0_temp );
00108 delta -= dot( fc01, j1_temp );
00109 if (bodyID.y >= 0) {
00110 delta -= dot( fc10, j2_temp );
00111 delta -= dot( fc11, j3_temp );
00112 }
00113 }
00114
00115 {
00116 T lo_act = hilo[ index ];
00117 T hi_act = hilo[ C_ID(index,1) ];
00118 int fID = fIDs[ index ];
00119
00120 if (fID >= 0) {
00121 hi_act = abs( hi_act * lambda[ fID ]);
00122 lo_act = -hi_act;
00123 }
00124
00125 T new_lambda = old_lambda + delta;
00126 T final_lambda = new_lambda;
00127
00128 if (new_lambda < lo_act) {
00129 delta = lo_act-old_lambda;
00130 final_lambda = lo_act;
00131 }
00132 else if (new_lambda > hi_act) {
00133 delta = hi_act-old_lambda;
00134 final_lambda = hi_act;
00135 }
00136 lambda[ index ] = final_lambda;
00137 }
00138
00139 j0_temp = ij[ index ];
00140 j1_temp = ij[ C_ID(index,1) ];
00141 j2_temp = ij[ C_ID(index,2) ];
00142 j3_temp = ij[ C_ID(index,3) ];
00143
00144 {
00145 j0_temp *= delta;
00146 j1_temp *= delta;
00147
00148
00149 if( bUseAtomics ) {
00150 myAtomicVecAdd<T>(fc0[ bodyID.x ], j0_temp);
00151 myAtomicVecAdd<T>(fc1[ bodyID.x ], j1_temp);
00152 } else {
00153 fc0_reduction[ bodyID.z ] = j0_temp;
00154 fc1_reduction[ bodyID.z ] = j1_temp;
00155 }
00156
00157 if( bodyID.y >= 0 ) {
00158 j2_temp *= delta;
00159 j3_temp *= delta;
00160
00161
00162 if( bUseAtomics ) {
00163 myAtomicVecAdd<T>(fc0[ bodyID.y ], j2_temp);
00164 myAtomicVecAdd<T>(fc1[ bodyID.y ], j3_temp);
00165 } else {
00166 fc0_reduction[ bodyID.w ] = j2_temp;
00167 fc1_reduction[ bodyID.w ] = j3_temp;
00168 }
00169 }
00170 }
00171 }
00172 }
00173
00174
00175
00176 template void ompZeroVector<dReal4>( dReal4 *buffer,
00177 int bufferSize );
00178
00179 template void ompPGSReduce<dReal>( dReal4 *fc0,
00180 dReal4 *fc1,
00181 dReal4 *fc0_reduction,
00182 dReal4 *fc1_reduction,
00183 ReduceStrategy* reduceStrategy );
00184
00185 template void ompPGSSolve<dReal>( int4 *bodyIDs,
00186 int *fIDs,
00187 dReal4 *j,
00188 dReal4 *ij,
00189 dReal4 *fc0,
00190 dReal4 *fc1,
00191 dReal4 *fc0_reduction,
00192 dReal4 *fc1_reduction,
00193 dReal *lambda,
00194 dReal *adcfm,
00195 dReal *rhs,
00196 dReal *hilo,
00197 int batch, int numConstraints, bool bUseAtomics, int bStride, int cStride );
00198
00199 }