TensorDeviceGpu.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H

// This header contains the definitions of the gpu* macros, which resolve to
// their hip* or cuda* equivalents depending on the compiler in use.
// A separate header (included at the end of this file) undefines them all.
#include "TensorGpuHipCudaDefines.h"

namespace Eigen {

static const int kGpuScratchSize = 1024;

// This defines an interface that GPUDevice can take to use
// HIP / CUDA streams underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const gpuStream_t& stream() const = 0;
  virtual const gpuDeviceProp_t& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kGpuScratchSize (1024) bytes
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initialized to 0, and each kernel
  // using it is responsible for resetting it to 0 upon completion, so that
  // the semaphore is always equal to 0 when a kernel starts.
  virtual unsigned int* semaphore() const = 0;
};

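// Illustrative sketch (not part of this header): one way a kernel can honor
// the semaphore contract described above. exampleSemaphoreKernel is a
// hypothetical kernel added for documentation purposes; the last block to
// finish restores the counter to 0 so the next kernel again starts from 0.
#if defined(EIGEN_GPUCC)
static __global__ void exampleSemaphoreKernel(unsigned int* semaphore) {
  // ... per-block work would go here ...
  if (threadIdx.x == 0) {
    // Each block increments the semaphore exactly once.
    const unsigned int ticket = atomicAdd(semaphore, 1u);
    if (ticket == gridDim.x - 1) {
      // This is the last block to finish: restore the 0 invariant.
      *semaphore = 0u;
    }
  }
}
#endif
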
class GpuDeviceProperties {
 public:
  GpuDeviceProperties() :
      initialized_(false), first_(true), device_properties_(nullptr) {}

  ~GpuDeviceProperties() {
    if (device_properties_) {
      delete[] device_properties_;
    }
  }

  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
    return device_properties_[device];
  }

  EIGEN_STRONG_INLINE bool isInitialized() const {
    return initialized_;
  }

  void initialize() {
    if (!initialized_) {
      // Attempts to ensure proper behavior in the case of multiple threads
      // calling this function simultaneously. This would be trivial to
      // implement if we could use std::mutex, but unfortunately mutexes don't
      // compile with nvcc, so we resort to atomics and thread fences instead.
      // Note that if the caller uses a compiler that doesn't support C++11 we
      // can't ensure that the initialization is thread safe.
      if (first_.exchange(false)) {
        // We're the first thread to reach this point.
        int num_devices;
        gpuError_t status = gpuGetDeviceCount(&num_devices);
        if (status != gpuSuccess) {
          std::cerr << "Failed to get the number of GPU devices: "
                    << gpuGetErrorString(status)
                    << std::endl;
          gpu_assert(status == gpuSuccess);
        }
        device_properties_ = new gpuDeviceProp_t[num_devices];
        for (int i = 0; i < num_devices; ++i) {
          status = gpuGetDeviceProperties(&device_properties_[i], i);
          if (status != gpuSuccess) {
            std::cerr << "Failed to initialize GPU device #"
                      << i
                      << ": "
                      << gpuGetErrorString(status)
                      << std::endl;
            gpu_assert(status == gpuSuccess);
          }
        }

        std::atomic_thread_fence(std::memory_order_release);
        initialized_ = true;
      } else {
        // Wait for the other thread to initialize the properties.
        while (!initialized_) {
          std::atomic_thread_fence(std::memory_order_acquire);
          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
      }
    }
  }

 private:
  volatile bool initialized_;
  std::atomic<bool> first_;
  gpuDeviceProp_t* device_properties_;
};

EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
  if (!deviceProperties->isInitialized()) {
    deviceProperties->initialize();
  }
  return *deviceProperties;
}

EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
  return GetGpuDeviceProperties().get(device);
}

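// Illustrative helper (a sketch, not part of the Eigen API; the name
// exampleLogDeviceProperties is hypothetical): read a few fields of a
// device's properties through the cached accessors above.
inline void exampleLogDeviceProperties(int device) {
  const gpuDeviceProp_t& prop = GetGpuDeviceProperties(device);
  std::cerr << "GPU device #" << device << ": "
            << prop.multiProcessorCount << " multiprocessors, "
            << prop.maxThreadsPerBlock << " max threads per block"
            << std::endl;
}
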
static const gpuStream_t default_stream = gpuStreamDefault;

class GpuStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuGetDevice(&device_);
  }
  // Use the default stream on the specified device
  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. Note that it's the
  // caller's responsibility to ensure that the stream can run on
  // the specified device. If no device is specified the code
  // assumes that the stream is associated with the current GPU device.
  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      gpuGetDevice(&device_);
    } else {
      int num_devices;
      gpuError_t err = gpuGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
      gpu_assert(device < num_devices);
      device_ = device;
    }
  }

  virtual ~GpuStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const {
    return GetGpuDeviceProperties(device_);
  }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    void* result;
    err = gpuMalloc(&result, num_bytes);
    gpu_assert(err == gpuSuccess);
    gpu_assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    gpu_assert(buffer != NULL);
    err = gpuFree(buffer);
    gpu_assert(err == gpuSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
    }
    return semaphore_;
  }

 private:
  const gpuStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};

struct GpuDevice {
  // The StreamInterface is not owned: the caller is
  // responsible for its initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  // TODO(bsteiner): This is an internal API, we should not expose it.
  EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    stream_->deallocate(buffer);
  }

  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
    stream_->deallocate(buffer);
  }

  template<typename Type>
  EIGEN_STRONG_INLINE Type get(Type data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
  }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    return stream_->semaphore();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
                                    stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(dst)
    EIGEN_UNUSED_VARIABLE(src)
    EIGEN_UNUSED_VARIABLE(n)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    gpuError_t err =
        gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    gpuError_t err =
        gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the l2 cache for the time being, and
    // there is no l3 cache on hip/cuda devices.
    return firstLevelCacheSize();
  }

  EIGEN_STRONG_INLINE void synchronize() const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuStreamSynchronize(stream_->stream());
    if (err != gpuSuccess) {
      std::cerr << "Error detected in GPU stream: "
                << gpuGetErrorString(err)
                << std::endl;
      gpu_assert(err == gpuSuccess);
    }
#else
    gpu_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return stream_->deviceProperties().sharedMemPerBlock;
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
  }

  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // This function checks if the GPU runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef EIGEN_GPUCC
    gpuError_t error = gpuStreamQuery(stream_->stream());
    return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};

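// Usage sketch (illustrative only; exampleFillAndReadBack is not part of the
// Eigen API): build a GpuDevice on top of a default-stream wrapper, run an
// asynchronous memset, copy the result back and wait for the stream. The
// GpuDevice does not own the StreamInterface, so stream_device must outlive
// device.
inline void exampleFillAndReadBack() {
  GpuStreamDevice stream_device;               // default stream, current device
  GpuDevice device(&stream_device);            // non-owning view of the stream
  const size_t bytes = 256;
  void* dev_buf = device.allocate(bytes);
  unsigned char host_buf[256];
  device.memset(dev_buf, 0, bytes);                      // asynchronous
  device.memcpyDeviceToHost(host_buf, dev_buf, bytes);   // asynchronous
  device.synchronize();                                  // wait before reading host_buf
  device.deallocate(dev_buf);
}
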
#if defined(EIGEN_HIPCC)

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
  gpu_assert(hipGetLastError() == hipSuccess);

#else

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
  gpu_assert(cudaGetLastError() == cudaSuccess);

#endif

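// Illustrative use of LAUNCH_GPU_KERNEL (a sketch; exampleFillKernel and
// exampleLaunchFill are hypothetical helpers, not part of Eigen). The macro
// expands to the HIP or CUDA launch syntax above and asserts that the launch
// was accepted by the runtime.
#if defined(EIGEN_GPUCC)
static __global__ void exampleFillKernel(float* out, int n, float value) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}

static inline void exampleLaunchFill(const GpuDevice& device, float* out, int n) {
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;
  LAUNCH_GPU_KERNEL(exampleFillKernel, num_blocks, block_size, 0, device, out, n, 1.0f);
}
#endif
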
// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  gpu_assert(status == gpuSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif

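// Illustrative call (a sketch; exampleUseEightByteBanks is hypothetical and
// CUDA-only, since it uses the cudaSharedMemBankSizeEightByte enumerator from
// the CUDA runtime rather than a gpu* alias defined by Eigen).
#if defined(EIGEN_GPUCC) && !defined(EIGEN_HIPCC)
static inline void exampleUseEightByteBanks() {
  // Prefer 8-byte shared memory banks, e.g. before double-precision kernels.
  setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
}
#endif
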
} // end namespace Eigen

// undefine all the gpu* macros we defined at the beginning of the file
#include "TensorGpuHipCudaUndefines.h"

#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H