$search
00001 #ifndef PARALLEL_MEMORY_H 00002 #define PARALLEL_MEMORY_H 00003 00004 #include "parallel_common.h" 00005 00006 namespace parallel_utils 00007 { 00008 00009 namespace CopyTypes 00010 { 00011 enum CopyType { COPY_SYNC, COPY_ASYNC, COPY_DEFAULT = COPY_ASYNC }; 00012 } 00013 typedef CopyTypes::CopyType CopyType; 00014 00015 template<typename T> 00016 static T offsetBuffer(T buffer, ArraySize offset) { 00017 return buffer + offset; 00018 } 00019 00020 template <typename T, ParallelType PT> struct MemManager { 00021 00022 typedef T* host_type; 00023 typedef T* device_type; 00024 typedef void** device_init_type; 00025 typedef int stream_type; 00026 typedef unsigned int mem_flags; 00027 00028 inline static void allocateHost( void** ptr, size_t size, int flags ) { 00029 *ptr = malloc( size ); 00030 } 00031 00032 inline static void freeHost( void* ptr ) { 00033 free( ptr ); 00034 } 00035 00036 inline static void allocateDevice( void** ptr, size_t size ) { 00037 *ptr = NULL; 00038 } 00039 00040 inline static void freeDevice( void* ptr ) { 00041 00042 } 00043 00044 inline static void copyToHost(void* dst, void* src, size_t size, CopyType copyType, stream_type stream ) { 00045 00046 } 00047 00048 inline static void copyToDevice(void* dst, void* src, size_t size, CopyType copyType, stream_type stream ) { 00049 00050 } 00051 00052 }; 00053 00054 #ifdef USE_CUDA 00055 00056 template <typename T> struct MemManager<T,ParallelTypes::CUDA> { 00057 00058 typedef T* host_type; 00059 typedef T* device_type; 00060 typedef void** device_init_type; 00061 typedef cudaStream_t stream_type; 00062 typedef unsigned int mem_flags; 00063 00064 const static mem_flags defaultAllocFlags = cudaHostAllocPortable; 00065 00066 inline static void allocateHost( void** ptr, size_t size, int flags ) { 00067 cudaHostAlloc(ptr, size, flags ); 00068 } 00069 00070 inline static void freeHost( void* ptr ) { 00071 cudaFreeHost( ptr ); 00072 } 00073 00074 inline static void allocateDevice( void** ptr, size_t size ) { 00075 cudaMalloc(ptr, size); 00076 } 00077 00078 inline static void freeDevice( void* ptr ) { 00079 cudaFree( ptr ); 00080 } 00081 00082 inline static void copyToHost(void* dst, void * src, size_t size, CopyType copyType, stream_type stream ) { 00083 if( CopyTypes::COPY_SYNC == copyType ) 00084 cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost); 00085 else 00086 cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream); 00087 } 00088 00089 inline static void copyToDevice(void* dst, void * src, size_t size, CopyType copyType, stream_type stream ) { 00090 if( CopyTypes::COPY_SYNC == copyType ) 00091 cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); 00092 else 00093 cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream); 00094 } 00095 }; 00096 00097 #elif USE_OPENCL 00098 00099 // NOTE: If you don't have OpenCL 1.1, you're out of luck here =/ 00100 template<> 00101 cl_mem offsetBuffer<cl_mem>(cl_mem buffer, ArraySize offset) { 00102 00103 if( NULL == buffer || offset == 0 ) return buffer; 00104 00105 size_t originalBufferSize; 00106 cl_int ciErrNum; 00107 ciErrNum = clGetMemObjectInfo( buffer, 00108 CL_MEM_SIZE, 00109 sizeof(size_t), 00110 &originalBufferSize, 00111 NULL ); 00112 oclCheckError(ciErrNum, CL_SUCCESS); 00113 00114 cl_mem_flags originalBufferMemflags; 00115 ciErrNum = clGetMemObjectInfo( buffer, 00116 CL_MEM_FLAGS, 00117 sizeof(cl_mem_flags), 00118 &originalBufferMemflags, 00119 NULL ); 00120 oclCheckError(ciErrNum, CL_SUCCESS); 00121 00122 cl_buffer_region subBufferRegion = { offset, originalBufferSize - offset }; 00123 cl_mem subBuffer = clCreateSubBuffer( buffer, 00124 originalBufferMemflags, 00125 CL_BUFFER_CREATE_TYPE_REGION, 00126 &subBufferRegion, 00127 &ciErrNum ); 00128 oclCheckError(ciErrNum, CL_SUCCESS); 00129 00130 return subBuffer; 00131 } 00132 00133 template <typename T> struct MemManager<T, ParallelType::OpenCL> { 00134 00135 typedef T* host_type; 00136 typedef cl_mem device_type; 00137 typedef cl_mem* device_init_type; 00138 typedef int stream_type; 00139 typedef cl_mem_flags mem_flags; 00140 00141 inline static void allocateHost( void** ptr, size_t size, int flags ) { 00142 *ptr = malloc( size ); 00143 } 00144 00145 inline static void freeHost( void* ptr ) { 00146 if( ptr ) free( ptr ); 00147 } 00148 00149 inline static void allocateDevice( cl_mem* ptr, size_t size ) { 00150 cl_int ciErrNum; 00151 *ptr = clCreateBuffer( dxGetDeviceContext(), CL_MEM_READ_WRITE, size, NULL, &ciErrNum ); 00152 oclCheckError(ciErrNum, CL_SUCCESS); 00153 } 00154 00155 inline static void freeDevice( void* ptr ) { 00156 if( ptr ) { 00157 cl_int ciErrNum; 00158 ciErrNum = clReleaseMemObject( ((device_type)ptr) ); 00159 oclCheckError(ciErrNum, CL_SUCCESS); 00160 } 00161 } 00162 00163 inline static void copyToHost(void* dst, cl_mem src, size_t size, CopyType copyType, stream_type stream ) { 00164 cl_int ciErrNum; 00165 ciErrNum = clEnqueueReadBuffer(dxGetDeviceQueue(), 00166 src, 00167 (CopyTypes::COPY_SYNC == copyType ), 00168 0, 00169 size, 00170 dst, 00171 0, 00172 NULL, 00173 NULL); 00174 oclCheckError(ciErrNum, CL_SUCCESS); 00175 } 00176 00177 inline static void copyToDevice(cl_mem dst, void* src, size_t size, CopyType copyType, stream_type stream ) { 00178 cl_int ciErrNum; 00179 ciErrNum = clEnqueueWriteBuffer(dxGetDeviceQueue(), 00180 dst, 00181 (CopyTypes::COPY_SYNC == copyType ), 00182 0, 00183 size, 00184 src, 00185 0, 00186 NULL, 00187 NULL); 00188 oclCheckError(ciErrNum, CL_SUCCESS); 00189 } 00190 }; 00191 00192 #endif 00193 00194 } 00195 00196 #endif 00197