TensorDeviceSycl.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Mehdi Goli Codeplay Software Ltd.
5 // Ralph Potter Codeplay Software Ltd.
6 // Luke Iwanski Codeplay Software Ltd.
7 // Contact: <eigen@codeplay.com>
8 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
9 
10 //
11 // This Source Code Form is subject to the terms of the Mozilla
12 // Public License v. 2.0. If a copy of the MPL was not distributed
13 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
14 
15 #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
16 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
17 #include <unordered_set>
18 
19 namespace Eigen {
20 
21 namespace TensorSycl {
22 namespace internal {
23 
25 struct SyclDeviceInfo {
26  SyclDeviceInfo(cl::sycl::queue queue)
27  : local_mem_type(
28  queue.get_device()
29  .template get_info<cl::sycl::info::device::local_mem_type>()),
30  max_work_item_sizes(
31  queue.get_device()
32  .template get_info<
33  cl::sycl::info::device::max_work_item_sizes>()),
34  max_mem_alloc_size(
35  queue.get_device()
36  .template get_info<
37  cl::sycl::info::device::max_mem_alloc_size>()),
38  max_compute_units(queue.get_device()
39  .template get_info<
40  cl::sycl::info::device::max_compute_units>()),
41  max_work_group_size(
42  queue.get_device()
43  .template get_info<
44  cl::sycl::info::device::max_work_group_size>()),
45  local_mem_size(
46  queue.get_device()
47  .template get_info<cl::sycl::info::device::local_mem_size>()),
48  platform_name(queue.get_device()
49  .get_platform()
50  .template get_info<cl::sycl::info::platform::name>()),
51  device_name(queue.get_device()
52  .template get_info<cl::sycl::info::device::name>()),
53  device_vendor(
54  queue.get_device()
55  .template get_info<cl::sycl::info::device::vendor>()) {}
56 
57  cl::sycl::info::local_mem_type local_mem_type;
58  cl::sycl::id<3> max_work_item_sizes;
59  unsigned long max_mem_alloc_size;
60  unsigned long max_compute_units;
61  unsigned long max_work_group_size;
62  size_t local_mem_size;
63  std::string platform_name;
64  std::string device_name;
65  std::string device_vendor;
66 };
67 
68 } // end namespace internal
69 } // end namespace TensorSycl
70 
71 typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
72 // Any device (even an AMD CPU with the Intel OpenCL runtime) that supports
73 // OpenCL and can consume SPIR or SPIR-V can use the Eigen SYCL backend, and
74 // consequently TensorFlow via the Eigen SYCL backend.
75 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
76  -> decltype(cl::sycl::device::get_devices()) {
77 #ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
78  return {cl::sycl::device(cl::sycl::default_selector())};
79 #else
80  std::vector<cl::sycl::device> supported_devices;
81  auto platform_list = cl::sycl::platform::get_platforms();
82  for (const auto &platform : platform_list) {
83  auto device_list = platform.get_devices();
84  auto platform_name =
85  platform.template get_info<cl::sycl::info::platform::name>();
86  std::transform(platform_name.begin(), platform_name.end(),
87  platform_name.begin(), ::tolower);
88  for (const auto &device : device_list) {
89  auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
90  std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
91  bool unsupported_condition =
92  (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
93  vendor.find("apu") == std::string::npos) ||
94  (platform_name.find("experimental") != std::string::npos) ||
95  device.is_host();
96  if (!unsupported_condition) {
97  supported_devices.push_back(device);
98  }
99  }
100  }
101  return supported_devices;
102 #endif
103 }
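// Usage sketch (illustrative comment only, not part of this header): the
// helper above typically feeds the queue construction defined below, e.g.
//
//   auto devices = Eigen::get_sycl_supported_devices();   // pick a device
//   Eigen::QueueInterface queue_interface(devices[0]);    // build the queue
//   Eigen::SyclDevice sycl_device(&queue_interface);      // wrap it for Eigen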
104 
105 class QueueInterface {
106  public:
108  template <typename DeviceOrSelector>
109  explicit QueueInterface(
110  const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
111  unsigned num_threads = std::thread::hardware_concurrency())
112  : m_queue(dev_or_sel, handler),
113 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
114  m_prog(m_queue.get_context(), get_sycl_supported_devices()),
115 #endif
116  m_thread_pool(num_threads),
117  m_device_info(m_queue) {
118 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
119  m_prog.build_with_kernel_type<DeviceOrSelector>();
120  auto f = [&](cl::sycl::handler &cgh) {
121  cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
122  [=]() {});
123  };
124  EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
125 #endif
126  }
127 
128  template <typename DeviceOrSelector>
129  explicit QueueInterface(
130  const DeviceOrSelector &dev_or_sel,
131  unsigned num_threads = std::thread::hardware_concurrency())
132  : QueueInterface(dev_or_sel,
133  [this](cl::sycl::exception_list l) {
134  this->exception_caught_ = this->sycl_async_handler(l);
135  },
136  num_threads) {}
137 
138 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
139  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
140 #endif
141 
143  EIGEN_STRONG_INLINE void *attach_buffer(
144  cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
145  std::lock_guard<std::mutex> lock(pmapper_mutex_);
146  return static_cast<void *>(pMapper.add_pointer(buf));
147  }
148 
150  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
151  std::lock_guard<std::mutex> lock(pmapper_mutex_);
152  TensorSycl::internal::SYCLfree<false>(p, pMapper);
153  }
154 
163  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
164 #if EIGEN_MAX_ALIGN_BYTES > 0
165  size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
166  if (align > 0) {
167  num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
168  }
169 #endif
170  std::lock_guard<std::mutex> lock(pmapper_mutex_);
171  return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
172  }
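// Worked example of the padding above (assuming EIGEN_MAX_ALIGN_BYTES == 16):
//   num_bytes = 100  ->  align = 100 % 16 = 4  ->  num_bytes = 100 + (16 - 4) = 112
// so every request forwarded to SYCLmalloc is a multiple of the alignment.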
173 
174  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
175 #if EIGEN_MAX_ALIGN_BYTES > 0
176  size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
177  if (align > 0) {
178  num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
179  }
180 #endif
181  std::lock_guard<std::mutex> lock(pmapper_mutex_);
182 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
183  if (scratch_buffers.empty()) {
184  return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
186  } else {
187  for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
188  auto buff = pMapper.get_buffer(*it);
189  if (buff.get_size() >= num_bytes) {
190  auto ptr = *it;
191  scratch_buffers.erase(it);
192  return ptr;
193  } else {
194  ++it;
195  }
196  }
197  return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
198  }
199 #else
200  return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
201 #endif
202  }
203  template <typename data_t>
204  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
205  cl::sycl::access::mode::read_write, data_t>
206  get(data_t *data) const {
207  return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
208  }
209  template <typename data_t>
210  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
211  TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
212  data_t>
213  data) const {
214  return static_cast<data_t *>(data.get_virtual_pointer());
215  }
216 
217  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
218  std::lock_guard<std::mutex> lock(pmapper_mutex_);
219 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
220  scratch_buffers.insert(p);
221 #else
222  TensorSycl::internal::SYCLfree(p, pMapper);
223 #endif
224  }
225  template <cl::sycl::access::mode AcMd, typename T>
226  EIGEN_STRONG_INLINE void deallocate_temp(
227  const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
228  deallocate_temp(p.get_virtual_pointer());
229  }
230 
233  EIGEN_STRONG_INLINE void deallocate(void *p) const {
234  std::lock_guard<std::mutex> lock(pmapper_mutex_);
235  TensorSycl::internal::SYCLfree(p, pMapper);
236  }
237 
238  EIGEN_STRONG_INLINE void deallocate_all() const {
239  std::lock_guard<std::mutex> lock(pmapper_mutex_);
240  TensorSycl::internal::SYCLfreeAll(pMapper);
241 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
242  scratch_buffers.clear();
243 #endif
244  }
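// Allocation round-trip sketch (illustrative comment only; sycl_device is a
// hypothetical SyclDevice that forwards to this interface):
//
//   float *d_ptr = static_cast<float *>(sycl_device.allocate(n * sizeof(float)));
//   sycl_device.memcpyHostToDevice(d_ptr, host_src, n * sizeof(float));
//   // ... evaluate expressions that read/write d_ptr ...
//   sycl_device.deallocate(d_ptr);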
245 
250  EIGEN_STRONG_INLINE void memcpyHostToDevice(
251  void *dst, const void *src, size_t n,
252  std::function<void()> callback) const {
253  static const auto write_mode = cl::sycl::access::mode::discard_write;
254  static const auto global_access = cl::sycl::access::target::global_buffer;
255  typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
256  write_accessor;
257  if (n == 0) {
258  if (callback) callback();
259  return;
260  }
261  n /= sizeof(buffer_scalar_t);
262  auto f = [&](cl::sycl::handler &cgh) {
263  write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
264  buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
265  auto non_deleter = [](buffer_scalar_t const *) {};
266  std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
267  cgh.copy(s_ptr, dst_acc);
268  };
269  cl::sycl::event e;
270  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
271  synchronize_and_callback(e, callback);
272  }
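// When a callback is supplied, the copy above is asynchronous: the event is
// recorded and the callback runs on the internal thread pool once the copy
// has completed; with an empty callback the call blocks. A hedged sketch:
//
//   queue_interface.memcpyHostToDevice(dst, src, bytes,
//                                      [] { /* notify copy finished */ });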
273 
278  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
279  void *dst, const void *src, size_t n,
280  std::function<void()> callback) const {
281  static const auto read_mode = cl::sycl::access::mode::read;
282  static const auto global_access = cl::sycl::access::target::global_buffer;
283  typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
284  read_accessor;
285  if (n == 0) {
286  if (callback) callback();
287  return;
288  }
289  n /= sizeof(buffer_scalar_t);
290  auto f = [&](cl::sycl::handler &cgh) {
291  read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
292  buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
293  auto non_deleter = [](buffer_scalar_t *) {};
294  std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
295  cgh.copy(src_acc, s_ptr);
296  };
297  cl::sycl::event e;
298  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
299  synchronize_and_callback(e, callback);
300  }
301 
305  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
306  static const auto read_mode = cl::sycl::access::mode::read;
307  static const auto write_mode = cl::sycl::access::mode::discard_write;
308  if (n == 0) {
309  return;
310  }
311  n /= sizeof(buffer_scalar_t);
312  auto f = [&](cl::sycl::handler &cgh) {
313  auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
314  auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
315  cgh.copy(src_acc, dst_acc);
316  };
317  cl::sycl::event e;
318  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
319  async_synchronize(e);
320  }
321 
325  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
326  static const auto write_mode = cl::sycl::access::mode::discard_write;
327  if (n == 0) {
328  return;
329  }
330  n /= sizeof(buffer_scalar_t);
331  auto f = [&](cl::sycl::handler &cgh) {
332  auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
333  // The cast to uint8_t is here to match the behaviour of the standard
334  // memset. The cast to buffer_scalar_t is needed to match the type of the
335  // accessor (in case buffer_scalar_t is not uint8_t)
336  cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
337  };
338  cl::sycl::event e;
339  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
340  async_synchronize(e);
341  }
342 
350  template <cl::sycl::access::mode AcMd, typename T>
351  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
352  get_range_accessor(const void *ptr) const {
353  static const auto global_access = cl::sycl::access::target::global_buffer;
354  static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
355  typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
356  typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
357 
358  std::lock_guard<std::mutex> lock(pmapper_mutex_);
359 
360  auto original_buffer = pMapper.get_buffer(ptr);
361  const ptrdiff_t offset = pMapper.get_offset(ptr);
362  const ptrdiff_t typed_offset = offset / sizeof(T);
363  eigen_assert(typed_offset >= 0);
364  const auto typed_size = original_buffer.get_size() / sizeof(T);
365  auto buffer = original_buffer.template reinterpret<
366  typename Eigen::internal::remove_const<T>::type>(
367  cl::sycl::range<1>(typed_size));
368  const ptrdiff_t size = buffer.get_count() - typed_offset;
369  eigen_assert(size >= 0);
370  typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
371  1, AcMd, global_access, is_place_holder>
372  placeholder_accessor_t;
373  const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
374  return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
375  cl::sycl::id<1>(typed_offset)),
376  static_cast<size_t>(typed_offset),
377  reinterpret_cast<std::intptr_t>(start_ptr));
378  }
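// Offset arithmetic sketch for the placeholder accessor above (assuming
// T == float, so sizeof(T) == 4): if ptr points 64 bytes into its registered
// buffer, then typed_offset == 16 and the returned RangeAccess views the
// buffer reinterpreted as float starting at element 16.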
379 
382  template <cl::sycl::access::mode AcMd, typename Index>
383  EIGEN_STRONG_INLINE cl::sycl::accessor<
384  buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
385  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
386  const Index n_bytes) const {
387  static const auto global_access = cl::sycl::access::target::global_buffer;
388  eigen_assert(n_bytes >= 0);
389  std::lock_guard<std::mutex> lock(pmapper_mutex_);
390  auto buffer = pMapper.get_buffer(ptr);
391  const ptrdiff_t offset = pMapper.get_offset(ptr);
392  eigen_assert(offset >= 0);
393  eigen_assert(offset + n_bytes <= buffer.get_size());
394  return buffer.template get_access<AcMd, global_access>(
395  cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
396  }
397 
402  template <cl::sycl::access::mode AcMd>
403  EIGEN_STRONG_INLINE cl::sycl::accessor<
404  buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
405  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
406  std::lock_guard<std::mutex> lock(pmapper_mutex_);
407  return pMapper.get_buffer(ptr)
408  .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
409  cgh);
410  }
411 
412  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
413  const void *ptr) const {
414  std::lock_guard<std::mutex> lock(pmapper_mutex_);
415  return pMapper.get_buffer(ptr);
416  }
417 
418  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
419  std::lock_guard<std::mutex> lock(pmapper_mutex_);
420  return pMapper.get_offset(ptr);
421  }
422 
423  template <typename OutScalar, typename sycl_kernel, typename Lhs,
424  typename Rhs, typename OutPtr, typename Range, typename Index,
425  typename... T>
426  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
427  const Rhs &rhs, OutPtr outptr,
428  Range thread_range,
429  Index scratchSize,
430  T... var) const {
431  auto kernel_functor = [=](cl::sycl::handler &cgh) {
432  // binding the placeholder accessors to a command group handler
433  lhs.bind(cgh);
434  rhs.bind(cgh);
435  outptr.bind(cgh);
436  typedef cl::sycl::accessor<OutScalar, 1,
437  cl::sycl::access::mode::read_write,
438  cl::sycl::access::target::local>
439  LocalAccessor;
440 
441  LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
442  cgh.parallel_for(
443 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
444  program().template get_kernel<sycl_kernel>(),
445 #endif
446  thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
447  };
448  cl::sycl::event e;
449  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
450  async_synchronize(e);
451  }
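// Launch sketch (comment only; the kernel functor name is hypothetical):
// expression evaluators invoke this launcher roughly as
//
//   device.template binary_kernel_launcher<float, MyKernelFunctor>(
//       lhs, rhs, out, cl::sycl::nd_range<1>(global_range, local_range),
//       scratch_size, extra_args...);
//
// where the functor consumes the local-memory scratch accessor created above.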
452 
453  template <typename OutScalar, typename sycl_kernel, typename InPtr,
454  typename OutPtr, typename Range, typename Index, typename... T>
455  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
456  OutPtr &outptr,
457  Range thread_range,
458  Index scratchSize,
459  T... var) const {
460  auto kernel_functor = [=](cl::sycl::handler &cgh) {
461  // binding the placeholder accessors to a commandgroup handler
462  inptr.bind(cgh);
463  outptr.bind(cgh);
464  typedef cl::sycl::accessor<OutScalar, 1,
465  cl::sycl::access::mode::read_write,
466  cl::sycl::access::target::local>
467  LocalAccessor;
468 
469  LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
470  cgh.parallel_for(
471 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
472  program().template get_kernel<sycl_kernel>(),
473 #endif
474  thread_range, sycl_kernel(scratch, inptr, outptr, var...));
475  };
476  cl::sycl::event e;
477  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
478  async_synchronize(e);
479  }
480 
481  template <typename OutScalar, typename sycl_kernel, typename InPtr,
482  typename Range, typename Index, typename... T>
483  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
484  Range thread_range,
485  Index scratchSize,
486  T... var) const {
487  auto kernel_functor = [=](cl::sycl::handler &cgh) {
488  // binding the placeholder accessors to a commandgroup handler
489  inptr.bind(cgh);
490  typedef cl::sycl::accessor<OutScalar, 1,
491  cl::sycl::access::mode::read_write,
492  cl::sycl::access::target::local>
493  LocalAccessor;
494 
495  LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
496  cgh.parallel_for(
497 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
498  program().template get_kernel<sycl_kernel>(),
499 #endif
500  thread_range, sycl_kernel(scratch, inptr, var...));
501  };
502  cl::sycl::event e;
503  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
504  async_synchronize(e);
505  }
506 
507 
508  EIGEN_STRONG_INLINE void synchronize() const {
509 #ifdef EIGEN_EXCEPTIONS
510  m_queue.wait_and_throw();
511 #else
512  m_queue.wait();
513 #endif
514  }
515 
516 
517  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
518  set_latest_event(e);
519 #ifndef EIGEN_SYCL_ASYNC_EXECUTION
520  synchronize();
521 #endif
522  }
523 
524  template <typename Index>
525  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
526  Index &rng, Index &GRange) const {
527  tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
528  tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
529  EIGEN_SYCL_LOCAL_THREAD_DIM1),
530  static_cast<Index>(tileSize));
531  rng = n;
532  if (rng == 0) rng = static_cast<Index>(1);
533  GRange = rng;
534  if (tileSize > GRange)
535  tileSize = GRange;
536  else if (GRange > tileSize) {
537  Index xMode = static_cast<Index>(GRange % tileSize);
538  if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
539  }
540  }
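// Worked example of the rounding above (hypothetical numbers): with
// tileSize == 256 and n == 1000, GRange starts at 1000, xMode == 1000 % 256
// == 232, and GRange is rounded up to 1000 + (256 - 232) == 1024, so the
// global range becomes a multiple of the work-group size.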
541 
544  template <typename Index>
545  EIGEN_STRONG_INLINE void parallel_for_setup(
546  const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
547  cl::sycl::range<2> &local_range) const {
548  std::array<Index, 2> input_range = input_dim;
549  Index max_workgroup_Size =
550  static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
551  max_workgroup_Size =
552  std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
553  EIGEN_SYCL_LOCAL_THREAD_DIM1),
554  static_cast<Index>(max_workgroup_Size));
555  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
556  local_range[1] =
557  static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
558  input_range[1] = input_dim[1];
559  if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
560  global_range[1] = input_range[1];
561  if (local_range[1] > global_range[1])
562  local_range[1] = global_range[1];
563  else if (global_range[1] > local_range[1]) {
564  Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
565  if (xMode != 0)
566  global_range[1] += static_cast<Index>(local_range[1] - xMode);
567  }
568  local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
569  input_range[0] = input_dim[0];
570  if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
571  global_range[0] = input_range[0];
572  if (local_range[0] > global_range[0])
573  local_range[0] = global_range[0];
574  else if (global_range[0] > local_range[0]) {
575  Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
576  if (xMode != 0)
577  global_range[0] += static_cast<Index>(local_range[0] - xMode);
578  }
579  }
580 
583  template <typename Index>
584  EIGEN_STRONG_INLINE void parallel_for_setup(
585  const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
586  cl::sycl::range<3> &local_range) const {
587  std::array<Index, 3> input_range = input_dim;
588  Index max_workgroup_Size =
589  static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
590  max_workgroup_Size =
591  std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
592  EIGEN_SYCL_LOCAL_THREAD_DIM1),
593  static_cast<Index>(max_workgroup_Size));
594  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
595  local_range[2] =
596  static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
597  input_range[2] = input_dim[2];
598  if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
599  global_range[2] = input_range[2];
600  if (local_range[2] > global_range[2])
601  local_range[2] = global_range[2];
602  else if (global_range[2] > local_range[2]) {
603  Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
604  if (xMode != 0)
605  global_range[2] += static_cast<Index>(local_range[2] - xMode);
606  }
607  pow_of_2 = static_cast<Index>(
608  std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
609  local_range[1] =
610  static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
611  input_range[1] = input_dim[1];
612  if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
613  global_range[1] = input_range[1];
614  if (local_range[1] > global_range[1])
615  local_range[1] = global_range[1];
616  else if (global_range[1] > local_range[1]) {
617  Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
618  if (xMode != 0)
619  global_range[1] += static_cast<Index>(local_range[1] - xMode);
620  }
621  local_range[0] = static_cast<Index>(max_workgroup_Size /
622  (local_range[1] * local_range[2]));
623  input_range[0] = input_dim[0];
624  if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
625  global_range[0] = input_range[0];
626  if (local_range[0] > global_range[0])
627  local_range[0] = global_range[0];
628  else if (global_range[0] > local_range[0]) {
629  Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
630  if (xMode != 0)
631  global_range[0] += static_cast<Index>(local_range[0] - xMode);
632  }
633  }
634 
635  EIGEN_STRONG_INLINE bool has_local_memory() const {
636 #if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
637  return false;
638 #elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
639  return true;
640 #else
641  return m_device_info.local_mem_type ==
642  cl::sycl::info::local_mem_type::local;
643 #endif
644  }
645 
646  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
647  return m_device_info.max_mem_alloc_size;
648  }
649 
650  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
651  return m_device_info.max_compute_units;
652  }
653 
654  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
655  return m_device_info.max_work_group_size;
656  }
657 
658  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
659  return m_device_info.max_work_item_sizes;
660  }
661 
663  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
664 
665  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
666  // OpenCL does not have such a concept
667  return 2;
668  }
669 
670  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
671  return m_device_info.local_mem_size;
672  }
673 
674  // This function returns the nearest power-of-two work-group size that is
675  // <= the maximum device work-group size.
676  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
677  return getPowerOfTwo(m_device_info.max_work_group_size, false);
678  }
679 
680  EIGEN_STRONG_INLINE std::string getPlatformName() const {
681  return m_device_info.platform_name;
682  }
683 
684  EIGEN_STRONG_INLINE std::string getDeviceName() const {
685  return m_device_info.device_name;
686  }
687 
688  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
689  return m_device_info.device_vendor;
690  }
691 
692  // This function returns the nearest power of 2.
693  // If roundUp is true, the result is >= wGSize;
694  // otherwise the result is <= wGSize.
695  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
696  if (roundUp) --wGSize;
697  wGSize |= (wGSize >> 1);
698  wGSize |= (wGSize >> 2);
699  wGSize |= (wGSize >> 4);
700  wGSize |= (wGSize >> 8);
701  wGSize |= (wGSize >> 16);
702 #if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
703  wGSize |= (wGSize >> 32);
704 #endif
705  return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
706  }
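// Worked example (hypothetical input): getPowerOfTwo(520, false) smears the
// high bit so wGSize becomes 1023 and (wGSize - (wGSize >> 1)) == 512, while
// getPowerOfTwo(520, true) first decrements to 519, smears to 1023, and
// ++wGSize yields 1024.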
707 
708  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
709 
710  // This function checks if the runtime recorded an error for the
711  // underlying stream device.
712  EIGEN_STRONG_INLINE bool ok() const {
713  if (!exception_caught_) {
714  synchronize();
715  }
716  return !exception_caught_;
717  }
718 
719  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
720 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
721  std::lock_guard<std::mutex> lock(event_mutex_);
722  return latest_events_[std::this_thread::get_id()];
723 #else
724  eigen_assert(false);
725  return cl::sycl::event();
726 #endif
727  }
728 
729  // destructor
730  ~QueueInterface() {
731  pMapper.clear();
732 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
733  scratch_buffers.clear();
734 #endif
735  }
736 
737  protected:
738  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
739 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
740  std::lock_guard<std::mutex> lock(event_mutex_);
741  latest_events_[std::this_thread::get_id()] = e;
742 #else
743  EIGEN_UNUSED_VARIABLE(e);
744 #endif
745  }
746 
747  void synchronize_and_callback(cl::sycl::event e,
748  const std::function<void()> &callback) const {
749  set_latest_event(e);
750  if (callback) {
751  auto callback_ = [=]() {
752 #ifdef EIGEN_EXCEPTIONS
753  cl::sycl::event(e).wait_and_throw();
754 #else
755  cl::sycl::event(e).wait();
756 #endif
757  callback();
758  };
759  m_thread_pool.Schedule(std::move(callback_));
760  } else {
761 #ifdef EIGEN_EXCEPTIONS
762  m_queue.wait_and_throw();
763 #else
764  m_queue.wait();
765 #endif
766  }
767  }
768 
769  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
770  bool exception_caught = false;
771  for (const auto &e : exceptions) {
772  if (e) {
773  exception_caught = true;
774  EIGEN_THROW_X(e);
775  }
776  }
777  return exception_caught;
778  }
779 
781  bool exception_caught_ = false;
782 
783  mutable std::mutex pmapper_mutex_;
784 
785 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
786  mutable std::mutex event_mutex_;
787  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
788 #endif
789 
794  mutable TensorSycl::internal::PointerMapper pMapper;
795 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
796  mutable std::unordered_set<void *> scratch_buffers;
797 #endif
798  mutable cl::sycl::queue m_queue;
800 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
801  mutable cl::sycl::program m_prog;
802 #endif
803 
806  mutable Eigen::ThreadPool m_thread_pool;
807 
808  const TensorSycl::internal::SyclDeviceInfo m_device_info;
809 };
810 
811 struct SyclDeviceBase {
814  const QueueInterface *m_queue_stream;
815  explicit SyclDeviceBase(const QueueInterface *queue_stream)
816  : m_queue_stream(queue_stream) {}
817  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
818  return m_queue_stream;
819  }
820 };
821 
822 // Here is a SYCL device struct which accepts the SYCL queue interface
823 // as an input.
824 struct SyclDevice : public SyclDeviceBase {
825  explicit SyclDevice(const QueueInterface *queue_stream)
826  : SyclDeviceBase(queue_stream) {}
827 
828  // this is the accessor used to construct the evaluator
829  template <cl::sycl::access::mode AcMd, typename T>
830  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
831  get_range_accessor(const void *ptr) const {
832  return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
833  }
834 
835  // get sycl accessor
836  template <cl::sycl::access::mode AcMd>
837  EIGEN_STRONG_INLINE cl::sycl::accessor<
838  buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
839  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
840  return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
841  }
842 
844  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
845  const void *ptr) const {
846  return queue_stream()->get_sycl_buffer(ptr);
847  }
848 
851  template <typename Index>
852  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
853  Index &rng, Index &GRange) const {
854  queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
855  }
856 
859  template <typename Index>
860  EIGEN_STRONG_INLINE void parallel_for_setup(
861  const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
862  cl::sycl::range<2> &local_range) const {
863  queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
864  }
865 
868  template <typename Index>
869  EIGEN_STRONG_INLINE void parallel_for_setup(
870  const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
871  cl::sycl::range<3> &local_range) const {
872  queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
873  }
874 
876  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
877  return queue_stream()->allocate(num_bytes);
878  }
879 
880  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
881  return queue_stream()->allocate_temp(num_bytes);
882  }
883 
885  EIGEN_STRONG_INLINE void deallocate(void *p) const {
886  queue_stream()->deallocate(p);
887  }
888 
889  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
890  queue_stream()->deallocate_temp(buffer);
891  }
892  template <cl::sycl::access::mode AcMd, typename T>
893  EIGEN_STRONG_INLINE void deallocate_temp(
894  const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
895  queue_stream()->deallocate_temp(buffer);
896  }
897  EIGEN_STRONG_INLINE void deallocate_all() const {
898  queue_stream()->deallocate_all();
899  }
900 
901  template <typename data_t>
902  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
903  cl::sycl::access::mode::read_write, data_t>
904  get(data_t *data) const {
905  return queue_stream()->get(data);
906  }
907  template <typename data_t>
908  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
909  TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
910  data_t>
911  data) const {
912  return queue_stream()->get(data);
913  }
914 
916  EIGEN_STRONG_INLINE void *attach_buffer(
917  cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
918  return queue_stream()->attach_buffer(buf);
919  }
921  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
922  queue_stream()->detach_buffer(p);
923  }
924  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
925  return queue_stream()->get_offset(ptr);
926  }
927 
928  // some runtime conditions that can be applied here
929  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
930 
932  template <typename Index>
933  EIGEN_STRONG_INLINE void memcpyHostToDevice(
934  Index *dst, const Index *src, size_t n,
935  std::function<void()> callback = {}) const {
936  queue_stream()->memcpyHostToDevice(dst, src, n, callback);
937  }
939  template <typename Index>
940  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
941  void *dst, const Index *src, size_t n,
942  std::function<void()> callback = {}) const {
943  queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
944  }
946  template <typename Index>
947  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
948  queue_stream()->memcpy(dst, src, n);
949  }
951  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
952  queue_stream()->memset(data, c, n);
953  }
955  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
956  return queue_stream()->sycl_queue();
957  }
958 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
959  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
960  return queue_stream()->program();
961  }
962 #endif
963 
964  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
965 
966  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
967  // We won't try to take advantage of the l2 cache for the time being, and
968  // there is no l3 cache on sycl devices.
969  return firstLevelCacheSize();
970  }
971  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
972  return queue_stream()->getNumSyclMultiProcessors();
973  }
974  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
975  return queue_stream()->maxSyclThreadsPerBlock();
976  }
977  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
978  return queue_stream()->maxWorkItemSizes();
979  }
980  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
981  // OpenCL does not have such a concept
982  return queue_stream()->maxSyclThreadsPerMultiProcessor();
983  }
984  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
985  return queue_stream()->sharedMemPerBlock();
986  }
987  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
988  return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
989  }
990 
991  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
992  return queue_stream()->getPowerOfTwo(val, roundUp);
993  }
995  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
996  return queue_stream()->majorDeviceVersion();
997  }
998 
999  EIGEN_STRONG_INLINE void synchronize() const {
1000  queue_stream()->synchronize();
1001  }
1002  EIGEN_STRONG_INLINE void async_synchronize(
1003  cl::sycl::event e = cl::sycl::event()) const {
1004  queue_stream()->async_synchronize(e);
1005  }
1006  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
1007  return queue_stream()->get_latest_event();
1008  }
1009 
1010  // This function checks if the runtime recorded an error for the
1011  // underlying stream device.
1012  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
1013 
1014  EIGEN_STRONG_INLINE bool has_local_memory() const {
1015  return queue_stream()->has_local_memory();
1016  }
1017  EIGEN_STRONG_INLINE long max_buffer_size() const {
1018  return queue_stream()->max_buffer_size();
1019  }
1020  EIGEN_STRONG_INLINE std::string getPlatformName() const {
1021  return queue_stream()->getPlatformName();
1022  }
1023  EIGEN_STRONG_INLINE std::string getDeviceName() const {
1024  return queue_stream()->getDeviceName();
1025  }
1026  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
1027  return queue_stream()->getDeviceVendor();
1028  }
1029  template <typename OutScalar, typename KernelType, typename... T>
1030  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
1031  queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
1032  var...);
1033  }
1034  template <typename OutScalar, typename KernelType, typename... T>
1035  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
1036  queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
1037  var...);
1038  }
1039 
1040  template <typename OutScalar, typename KernelType, typename... T>
1041  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
1042  queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
1043  var...);
1044  }
1045 };
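// End-to-end usage sketch (illustrative comment only; assumes the unsupported
// Tensor module and a SYCL-enabled compiler):
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector());
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::Tensor<float, 1> in1(128), in2(128), out(128);
//   in1.setRandom(); in2.setRandom();
//   float *d_in1 = static_cast<float *>(sycl_device.allocate(128 * sizeof(float)));
//   float *d_in2 = static_cast<float *>(sycl_device.allocate(128 * sizeof(float)));
//   float *d_out = static_cast<float *>(sycl_device.allocate(128 * sizeof(float)));
//   sycl_device.memcpyHostToDevice(d_in1, in1.data(), 128 * sizeof(float));
//   sycl_device.memcpyHostToDevice(d_in2, in2.data(), 128 * sizeof(float));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in1(d_in1, 128);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in2(d_in2, 128);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_out(d_out, 128);
//   gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
//   sycl_device.memcpyDeviceToHost(out.data(), d_out, 128 * sizeof(float));
//   sycl_device.synchronize();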
1046 } // end namespace Eigen
1047 
1048 #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H