parallel_quickstep: cuda_solver.cpp Source File

Go to the documentation of this file.
00001 #include <cuda_solver.h>
00002 #include <parallel_utils.h>
00003 #include <cuda_utils.h>
00004 
00005 #include "cuda_kernels.cuh"
00006 
00007 namespace parallel_ode
00008 {
00009 
00010 #ifdef CUDA_DOUBLESUPPORT
00011 template class CudaPGSSolver<dReal,dReal>;
00012 #else
00013 template class CudaPGSSolver<float,dReal>;
00014 #endif
00015 
00016 template<typename CudaT,typename ParamsT>
00017 void CudaPGSSolver<CudaT,ParamsT>::initialize( )
00018 {
00019   ParallelPGSSolver<CudaT,ParamsT,ParallelTypes::CUDA>::initialize( );
00020 
00021   MemFlags flags = cudaHostAllocDefault;
00022   flags |= this->wcMemEnabled() ? cudaHostAllocWriteCombined : 0;
00023   flags |= this->pinnedMemEnabled() ? cudaHostAllocPortable : 0;
00024   this->setMemFlags( flags );
00025 }
00026 
00027 template<typename CudaT,typename ParamsT>
00028 void CudaPGSSolver<CudaT,ParamsT>::preProcessDevice( const CudaT sorParam, const CudaT stepSize )
00029 {
00031 }
00032 
00033 template<typename CudaT,typename ParamsT>
00034 void CudaPGSSolver<CudaT,ParamsT>::solveAndReduce( const int offset, const int batchSize )
00035 {
00036   // cuda stuff here should change it's double/single-ness  based on GPU support for doubles
00037   cudaPGSSolve<CudaT>( this->bodyIDs.getDeviceBuffer( ),
00038                    this->fIDs.getDeviceBuffer( ),
00039                    this->j0.getDeviceBuffer( ),
00040                    this->ij0.getDeviceBuffer( ),
00041                    this->bodyFAcc.getDeviceBuffer( ),
00042                    this->bodyTAcc.getDeviceBuffer( ),
00043                    this->bodyFAccReduction.getDeviceBuffer( ),
00044                    this->bodyTAccReduction.getDeviceBuffer( ),
00045                    this->lambda0.getDeviceBuffer( ),
00046                    this->adcfm.getDeviceBuffer( ),
00047                    this->rhs.getDeviceBuffer( ),
00048                    this->lohiD.getDeviceBuffer( ),
00049                    offset,
00050                    batchSize,
00051                    this->atomicsEnabled( ),
00052                    this->getBodyStride( ),
00053                    this->getConstraintStride( ) );
00054 
00055   if( this->reduceEnabled( ) ) {
00056     cudaPGSReduce<CudaT>( this->bodyFAcc.getDeviceBuffer( ),
00057                       this->bodyTAcc.getDeviceBuffer( ),
00058                       this->bodyFAccReduction.getDeviceBuffer( ),
00059                       this->bodyTAccReduction.getDeviceBuffer( ),
00060                       this->reduceStrategy_ );
00061   }
00062 }
00063 
00064 template<typename CudaT,typename ParamsT>
00065 void CudaPGSSolver<CudaT,ParamsT>::loadConstraints( )
00066 {
00067   ParallelPGSSolver<CudaT,ParamsT, ParallelTypes::CUDA>::loadConstraints( );
00068 
00069   // Zero out the force accumulation vector
00070   if( this->reduceEnabled( ) ) {
00071     cudaZeroVector<Vec4T>(this->bodyFAccReduction.getDeviceBuffer( ), this->bodyFAccReduction.getSize( ));
00072     cudaZeroVector<Vec4T>(this->bodyTAccReduction.getDeviceBuffer( ), this->bodyTAccReduction.getSize( ));
00073   }
00074 }
00075 
00076 }