#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H

namespace Eigen {

// Size in bytes of the scratch buffer exposed by scratchpad().
static const int kCudaScratchSize = 1024;
// Interface that abstracts the underlying CUDA stream, so that GpuDevice can
// run on top of streams managed by the caller.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const cudaStream_t& stream() const = 0;
  virtual const cudaDeviceProp& deviceProperties() const = 0;

  // Allocate memory on the device on which the computation will run.
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kCudaScratchSize bytes.
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initially set to 0, and each kernel
  // that uses it is responsible for resetting it to 0 upon completion, so
  // that the semaphore is always 0 when a kernel starts.
  virtual unsigned int* semaphore() const = 0;
};
static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;

static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
    // Make the one-time initialization as thread safe as possible:
    // std::mutex doesn't compile with nvcc, so we use an atomic flag plus
    // thread fences instead. With a pre-C++11 compiler the initialization is
    // not thread safe.
#if __cplusplus >= 201103L
    static std::atomic<bool> first(true);
    if (first.exchange(false)) {
#else
    static bool first = true;
    if (first) {
      first = false;
#endif
      // We're the first thread to get here: query and cache the properties
      // of every CUDA device in the system.
      int num_devices;
      cudaError_t status = cudaGetDeviceCount(&num_devices);
      if (status != cudaSuccess) {
        std::cerr << "Failed to get the number of CUDA devices: "
                  << cudaGetErrorString(status) << std::endl;
        assert(status == cudaSuccess);
      }
      m_deviceProperties = new cudaDeviceProp[num_devices];
      for (int i = 0; i < num_devices; ++i) {
        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
        if (status != cudaSuccess) {
          std::cerr << "Failed to initialize CUDA device #" << i << ": "
                    << cudaGetErrorString(status) << std::endl;
          assert(status == cudaSuccess);
        }
      }
#if __cplusplus >= 201103L
      std::atomic_thread_fence(std::memory_order_release);
#endif
      m_devicePropInitialized = true;
    } else {
      // Another thread is doing the initialization: wait until it's done.
      while (!m_devicePropInitialized) {
#if __cplusplus >= 201103L
        std::atomic_thread_fence(std::memory_order_acquire);
#endif
        sleep(1);
      }
    }
  }
}

static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device.
  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    cudaGetDevice(&device_);
    initializeDeviceProp();
  }
  // Use the default stream on the specified device.
  CudaStreamDevice(int device)
      : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    initializeDeviceProp();
  }
  // Use the specified stream. It is the caller's responsibility to ensure
  // that the stream can run on the specified device. If no device is
  // specified, the stream is assumed to be associated with the current
  // device.
  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      cudaGetDevice(&device_);
    } else {
      int num_devices;
      cudaError_t err = cudaGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
      assert(device < num_devices);
      device_ = device;
    }
    initializeDeviceProp();
  }

  virtual ~CudaStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const cudaStream_t& stream() const { return *stream_; }
  const cudaDeviceProp& deviceProperties() const {
    return m_deviceProperties[device_];
  }
  virtual void* allocate(size_t num_bytes) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    void* result;
    err = cudaMalloc(&result, num_bytes);
    assert(err == cudaSuccess);
    assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    assert(buffer != NULL);
    err = cudaFree(buffer);
    assert(err == cudaSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      // Reserve room for the semaphore word right after the scratch buffer.
      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      cudaError_t err =
          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
    }
    return semaphore_;
  }

 private:
  const cudaStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
struct GpuDevice {
  // The StreamInterface is not owned: the caller is responsible for its
  // initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream)
      : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks)
      : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }

  // All of these simply forward to the underlying StreamInterface.
  EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return stream_->stream(); }
  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); }
  EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); }
  EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); }
  EIGEN_STRONG_INLINE unsigned int* semaphore() const { return stream_->semaphore(); }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                      stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
  }
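  // Usage sketch (illustrative): round-trip data through the device. The
  // copies are asynchronous on the underlying stream, so synchronize()
  // before reading `host_out`. `host_in`/`host_out` are hypothetical host
  // buffers of `bytes` bytes.
  //
  //   void* dev_buf = gpu_device.allocate(bytes);
  //   gpu_device.memcpyHostToDevice(dev_buf, host_in, bytes);
  //   ... launch work on gpu_device's stream ...
  //   gpu_device.memcpyDeviceToHost(host_out, dev_buf, bytes);
  //   gpu_device.synchronize();
  //   gpu_device.deallocate(dev_buf);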

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on CUDA devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
    cudaError_t err = cudaStreamSynchronize(stream_->stream());
    if (err != cudaSuccess) {
      std::cerr << "Error detected in CUDA stream: "
                << cudaGetErrorString(err) << std::endl;
      assert(err == cudaSuccess);
    }
#else
    assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; }
  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; }
  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { return stream_->deviceProperties().maxThreadsPerMultiProcessor; }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const { return stream_->deviceProperties().sharedMemPerBlock; }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; }

  EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; }

  // Returns true iff no error has been recorded on the underlying stream.
  inline bool ok() const {
#ifdef __CUDACC__
    cudaError_t error = cudaStreamQuery(stream_->stream());
    return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
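// Usage sketch (illustrative only): evaluate a tensor expression on the GPU
// by assigning through device(). `gpu_in` and `gpu_out` are hypothetical
// device pointers obtained from gpu_device.allocate().
//
//   CudaStreamDevice stream_device;
//   GpuDevice gpu_device(&stream_device);
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > in(gpu_in, 256);
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > out(gpu_out, 256);
//   out.device(gpu_device) = in * 2.0f;  // enqueued on the device's stream
//   gpu_device.synchronize();            // wait for completion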

#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)            \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);  \
  assert(cudaGetLastError() == cudaSuccess);
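// Usage sketch (illustrative): launch a hypothetical kernel `my_kernel` on
// the device's stream and check the launch status.
//
//   LAUNCH_CUDA_KERNEL(my_kernel, /*gridsize=*/32, /*blocksize=*/256,
//                      /*sharedmem=*/0, gpu_device, data_ptr, n);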

// FIXME: this should be device- and kernel-specific.
#ifdef __CUDACC__
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
#ifndef __CUDA_ARCH__
  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  assert(status == cudaSuccess);
#endif
}
#endif
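// Usage sketch (illustrative): request 8-byte shared memory banks before
// launching double-precision kernels.
//
//   setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);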

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H