$search
00001 #include <cuda_solver.h> 00002 #include <parallel_utils.h> 00003 #include <cuda_utils.h> 00004 00005 #include "cuda_kernels.cuh" 00006 00007 namespace parallel_ode 00008 { 00009 00010 #ifdef CUDA_DOUBLESUPPORT 00011 template class CudaPGSSolver<dReal,dReal>; 00012 #else 00013 template class CudaPGSSolver<float,dReal>; 00014 #endif 00015 00016 template<typename CudaT,typename ParamsT> 00017 void CudaPGSSolver<CudaT,ParamsT>::initialize( ) 00018 { 00019 ParallelPGSSolver<CudaT,ParamsT,ParallelTypes::CUDA>::initialize( ); 00020 00021 MemFlags flags = cudaHostAllocDefault; 00022 flags |= this->wcMemEnabled() ? cudaHostAllocWriteCombined : 0; 00023 flags |= this->pinnedMemEnabled() ? cudaHostAllocPortable : 0; 00024 this->setMemFlags( flags ); 00025 } 00026 00027 template<typename CudaT,typename ParamsT> 00028 void CudaPGSSolver<CudaT,ParamsT>::preProcessDevice( const CudaT sorParam, const CudaT stepSize ) 00029 { 00031 } 00032 00033 template<typename CudaT,typename ParamsT> 00034 void CudaPGSSolver<CudaT,ParamsT>::solveAndReduce( const int offset, const int batchSize ) 00035 { 00036 // cuda stuff here should change it's double/single-ness based on GPU support for doubles 00037 cudaPGSSolve<CudaT>( this->bodyIDs.getDeviceBuffer( ), 00038 this->fIDs.getDeviceBuffer( ), 00039 this->j0.getDeviceBuffer( ), 00040 this->ij0.getDeviceBuffer( ), 00041 this->bodyFAcc.getDeviceBuffer( ), 00042 this->bodyTAcc.getDeviceBuffer( ), 00043 this->bodyFAccReduction.getDeviceBuffer( ), 00044 this->bodyTAccReduction.getDeviceBuffer( ), 00045 this->lambda0.getDeviceBuffer( ), 00046 this->adcfm.getDeviceBuffer( ), 00047 this->rhs.getDeviceBuffer( ), 00048 this->lohiD.getDeviceBuffer( ), 00049 offset, 00050 batchSize, 00051 this->atomicsEnabled( ), 00052 this->getBodyStride( ), 00053 this->getConstraintStride( ) ); 00054 00055 if( this->reduceEnabled( ) ) { 00056 cudaPGSReduce<CudaT>( this->bodyFAcc.getDeviceBuffer( ), 00057 this->bodyTAcc.getDeviceBuffer( ), 00058 this->bodyFAccReduction.getDeviceBuffer( ), 00059 this->bodyTAccReduction.getDeviceBuffer( ), 00060 this->reduceStrategy_ ); 00061 } 00062 } 00063 00064 template<typename CudaT,typename ParamsT> 00065 void CudaPGSSolver<CudaT,ParamsT>::loadConstraints( ) 00066 { 00067 ParallelPGSSolver<CudaT,ParamsT, ParallelTypes::CUDA>::loadConstraints( ); 00068 00069 // Zero out the force accumulation vector 00070 if( this->reduceEnabled( ) ) { 00071 cudaZeroVector<Vec4T>(this->bodyFAccReduction.getDeviceBuffer( ), this->bodyFAccReduction.getSize( )); 00072 cudaZeroVector<Vec4T>(this->bodyTAccReduction.getDeviceBuffer( ), this->bodyTAccReduction.getSize( )); 00073 } 00074 } 00075 00076 }