openmp_kernels.cpp
#include <parallel_common.h>
#include <parallel_reduce.h>
#include <parallel_math.h>

#include "openmp_kernels.h"
#include "openmp_utils.h"

#include <omp.h>
#include <limits>

#include "parallel_kernels.h"

namespace parallel_ode
{

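// Fills a buffer with the zero value for type T. The OpenMP pragma is left
// commented out, so this loop currently runs serially.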
template <typename T>
void ompZeroVector( T *buffer, int bufferSize )
{
  //#pragma omp parallel for
  for(int index = 0; index < bufferSize; ++index) {
    buffer[ index ] = parallel_zero<T>();
  }
}

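// Folds the strided per-constraint contributions held in the reduction buffers
// back onto the per-body accumulators fc0 and fc1, zeroing each slot as it is
// consumed.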
template <typename T>
void ompPGSReduce( typename vec4<T>::Type *fc0,
                   typename vec4<T>::Type *fc1,
                   typename vec4<T>::Type *fc0_reduction,
                   typename vec4<T>::Type *fc1_reduction,
                   ReduceStrategy* reduceStrategy )
{
  typedef typename vec4<T>::Type Vec4T;

  const int bodySize = reduceStrategy->getBodySize( );
  const int reductionSize = reduceStrategy->getBodySizeWithReduction( );
  const int reductionStride = reduceStrategy->getBodyOffsetStride( );

  const Vec4T zeroVec = make_vec4( (T)0.0 );

  //#pragma omp parallel for
  for(int index = 0; index < bodySize; ++index) {
    int nextIndex = index + reductionStride;

    Vec4T sum0 = fc0_reduction[ index ];
    Vec4T sum1 = fc1_reduction[ index ];

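    // Stride through the remaining reduction slots for this body,
    // accumulating each contribution and clearing the slot.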
    while(nextIndex < reductionSize) {
      sum0 += fc0_reduction[ nextIndex ];
      fc0_reduction[ nextIndex ] = zeroVec;
      sum1 += fc1_reduction[ nextIndex ];
      fc1_reduction[ nextIndex ] = zeroVec;
      nextIndex += reductionStride;
    }

    fc0[ index ] += sum0;
    fc1[ index ] += sum1;
  }
}

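// One projected Gauss-Seidel sweep over a batch of constraints: compute each
// constraint's residual against the current body forces, clamp the updated
// lambda to its bounds, and scatter the resulting delta back to the per-body
// accumulators.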
template <typename T>
void ompPGSSolve( int4 *bodyIDs,
                  int  *fIDs,
                  typename vec4<T>::Type *j,
                  typename vec4<T>::Type *ij,
                  typename vec4<T>::Type *fc0,
                  typename vec4<T>::Type *fc1,
                  typename vec4<T>::Type *fc0_reduction,
                  typename vec4<T>::Type *fc1_reduction,
                  T* lambda,
                  T* adcfm,
                  T* rhs,
                  T* hilo,
                  int offset, int numConstraints, bool bUseAtomics,
                  int bStride, int cStride)
{
  typedef typename vec4<T>::Type Vec4T;

  //#pragma omp parallel for
  for(int localIndex = 0; localIndex < numConstraints; ++localIndex) {

    const int index = localIndex + offset;

    T old_lambda = lambda[ index ];

    int4 bodyID = bodyIDs[ index ];

    Vec4T fc00 = fc0[ bodyID.x ];
    Vec4T fc01 = fc1[ bodyID.x ];
    Vec4T fc10 = make_vec4( (T)0.0 );
    Vec4T fc11 = make_vec4( (T)0.0 );

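    // Load the four Jacobian rows for this constraint (two per body).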
    Vec4T j0_temp = j[      index    ];
    Vec4T j1_temp = j[ C_ID(index,1) ];
    Vec4T j2_temp = j[ C_ID(index,2) ];
    Vec4T j3_temp = j[ C_ID(index,3) ];

    T delta = rhs[ index ] - old_lambda * adcfm[ index ];

    if( bodyID.y >= 0 ) {
      fc10 = fc0[ bodyID.y ];
      fc11 = fc1[ bodyID.y ];
    }

    {
      delta -= dot( fc00, j0_temp );
      delta -= dot( fc01, j1_temp );
      if (bodyID.y >= 0) {
        delta -= dot( fc10, j2_temp );
        delta -= dot( fc11, j3_temp );
      }
    }

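    // Clamp the updated lambda to its [lo, hi] bounds; for friction rows
    // (fID >= 0) the bounds are scaled by the lambda of the referenced constraint.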
    {
      T lo_act = hilo[ index ];
      T hi_act = hilo[ C_ID(index,1) ];
      int fID = fIDs[ index ];

      if (fID >= 0) {
        hi_act = abs( hi_act * lambda[ fID ]);
        lo_act = -hi_act;
      }

      T new_lambda = old_lambda + delta;
      T final_lambda = new_lambda;

      if (new_lambda < lo_act) {
        delta = lo_act-old_lambda;
        final_lambda = lo_act;
      }
      else if (new_lambda > hi_act) {
        delta = hi_act-old_lambda;
        final_lambda = hi_act;
      }
      lambda[ index ] = final_lambda;
    }

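    // Apply the clamped delta: scale the ij rows and accumulate them onto the
    // body accumulators, either atomically or into the per-constraint
    // reduction slots (bodyID.z / bodyID.w) that ompPGSReduce later folds back in.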
    j0_temp = ij[      index    ];
    j1_temp = ij[ C_ID(index,1) ];
    j2_temp = ij[ C_ID(index,2) ];
    j3_temp = ij[ C_ID(index,3) ];

    {
      j0_temp *= delta;
      j1_temp *= delta;

      // STORE
      if( bUseAtomics ) {
        myAtomicVecAdd<T>(fc0[ bodyID.x ], j0_temp);
        myAtomicVecAdd<T>(fc1[ bodyID.x ], j1_temp);
      } else {
        fc0_reduction[ bodyID.z ] = j0_temp;
        fc1_reduction[ bodyID.z ] = j1_temp;
      }

      if( bodyID.y >= 0 ) {
        j2_temp *= delta;
        j3_temp *= delta;

        // STORE
        if( bUseAtomics ) {
          myAtomicVecAdd<T>(fc0[ bodyID.y ], j2_temp);
          myAtomicVecAdd<T>(fc1[ bodyID.y ], j3_temp);
        } else {
          fc0_reduction[ bodyID.w ] = j2_temp;
          fc1_reduction[ bodyID.w ] = j3_temp;
        }
      }
    }
  }
}

// Explicit instantiations needed to generate code
template void ompZeroVector<dReal4>( dReal4 *buffer,
                                     int bufferSize );

template void ompPGSReduce<dReal>( dReal4 *fc0,
                                   dReal4 *fc1,
                                   dReal4 *fc0_reduction,
                                   dReal4 *fc1_reduction,
                                   ReduceStrategy* reduceStrategy );

template void ompPGSSolve<dReal>( int4 *bodyIDs,
                                  int *fIDs,
                                  dReal4 *j,
                                  dReal4 *ij,
                                  dReal4 *fc0,
                                  dReal4 *fc1,
                                  dReal4 *fc0_reduction,
                                  dReal4 *fc1_reduction,
                                  dReal *lambda,
                                  dReal *adcfm,
                                  dReal *rhs,
                                  dReal *hilo,
                                  int offset, int numConstraints, bool bUseAtomics, int bStride, int cStride );

} // namespace parallel_ode

