#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>

namespace Eigen {

namespace TensorSycl {
namespace internal {
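
/// Cache of the device properties that the backend queries repeatedly. All
/// get_info<>() calls happen once, at queue construction time, and the hot
/// paths below read the cached values instead of querying the runtime again.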
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_item_sizes>()),
        max_mem_alloc_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device()
                          .get_platform()
                          .template get_info<
                              cl::sycl::info::platform::name>()),
        device_name(queue.get_device()
                        .template get_info<
                            cl::sycl::info::device::name>()),
        device_vendor(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};

}  // end namespace internal
}  // end namespace TensorSycl
typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;

/// Enumerate the SYCL devices this backend is willing to use, filtering out
/// configurations known not to work (see the conditions below).
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
    -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name =
        platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(),
                   platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      bool unsupported_condition =
          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
           vendor.find("apu") == std::string::npos) ||
          (platform_name.find("experimental") != std::string::npos) ||
          device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}
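
// QueueInterface owns the SYCL queue, the virtual-pointer mapper and the
// thread pool used to run copy callbacks asynchronously. A typical setup
// (illustrative sketch, not part of this header) is:
//
//   auto devices = get_sycl_supported_devices();
//   Eigen::QueueInterface queue_interface(devices[0]);
//   Eigen::SyclDevice sycl_device(&queue_interface);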
class QueueInterface {
 public:
  /// Create the queue from a cl::sycl device selector or a cl::sycl::device.
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
        m_thread_pool(num_threads),
        m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    m_prog.build_with_kernel_type<DeviceOrSelector>();
    auto f = [&](cl::sycl::handler &cgh) {
      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
                                        [=]() {});
    };
    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
  }
  /// Delegating constructor: installs an async handler that records any
  /// asynchronous SYCL exception in exception_caught_.
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel,
      unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(dev_or_sel,
                       [this](cl::sycl::exception_list l) {
                         this->exception_caught_ = this->sycl_async_handler(l);
                       },
                       num_threads) {}
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif

  /// Attach an existing SYCL buffer to the pointer mapper and return the
  /// virtual pointer that now refers to it.
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return static_cast<void *>(pMapper.add_pointer(buf));
  }

  /// Detach a previously attached buffer without destroying it.
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree<false>(p, pMapper);
  }
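
  /// Device "pointers" returned by allocate()/allocate_temp() are virtual: an
  /// 8-byte host-side key registered in the pointer mapper, which owns the
  /// real SYCL buffer. They must be released with deallocate() or
  /// deallocate_temp(), never with free().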
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    // Round the request up to the Eigen alignment.
    const size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
  }

  /// Temporary allocations may reuse previously released scratch buffers.
  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    const size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    if (scratch_buffers.empty()) {
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    } else {
      // Reuse the first scratch buffer that is large enough.
      for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
        auto buff = pMapper.get_buffer(*it);
        if (buff.get_size() >= num_bytes) {
          void *ptr = *it;
          scratch_buffers.erase(it);
          return ptr;
        } else {
          ++it;
        }
      }
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    }
#else
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#endif
  }
  /// Convert a virtual pointer into a read-write range accessor.
  template <typename data_t>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(
        data);
  }
  /// Convert a range accessor back into its virtual pointer.
  template <typename data_t>
  EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return static_cast<data_t *>(data.get_virtual_pointer());
  }
  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    // Keep the buffer around so that a later allocate_temp can reuse it.
    scratch_buffers.insert(p);
#else
    TensorSycl::internal::SYCLfree(p, pMapper);
#endif
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
    deallocate_temp(p.get_virtual_pointer());
  }

  /// Free the device buffer associated with the virtual pointer p.
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree(p, pMapper);
  }

  EIGEN_STRONG_INLINE void deallocate_all() const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }
  /// Copy n bytes from a host pointer to a device virtual pointer. The copy is
  /// asynchronous; the callback (if any) runs once the copy has completed.
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
        write_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
      // Wrap the host pointer in a non-owning shared_ptr as required by
      // cgh.copy.
      auto non_deleter = [](buffer_scalar_t const *) {};
      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(s_ptr, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }
  /// Copy n bytes from a device virtual pointer to a host pointer.
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
        read_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
      auto non_deleter = [](buffer_scalar_t *) {};
      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(src_acc, s_ptr);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }
  /// Device-to-device copy of n bytes between two virtual pointers.
  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) return;
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      cgh.copy(src_acc, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }
  /// Fill n bytes of the device buffer behind `data` with the byte value c.
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) return;
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
      // Cast to uint8_t first to match the semantics of std::memset, then to
      // buffer_scalar_t to match the accessor's element type.
      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }
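
  // All four memory operations above resolve their virtual pointers through
  // get_range_accessor and are submitted asynchronously. A typical round trip
  // (illustrative sketch) is:
  //
  //   void *gpu_ptr = allocate(bytes);
  //   memcpyHostToDevice(gpu_ptr, host_in, bytes, /*callback=*/{});
  //   memcpyDeviceToHost(host_out, gpu_ptr, bytes, /*callback=*/{});
  //   deallocate(gpu_ptr);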
  /// Return a placeholder range accessor covering the device memory behind the
  /// virtual pointer ptr, reinterpreted as elements of type T.
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;

    std::lock_guard<std::mutex> lock(pmapper_mutex_);

    auto original_buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    const ptrdiff_t typed_offset = offset / sizeof(T);
    eigen_assert(typed_offset >= 0);
    const auto typed_size = original_buffer.get_size() / sizeof(T);
    auto buffer = original_buffer.template reinterpret<
        typename Eigen::internal::remove_const<T>::type>(
        cl::sycl::range<1>(typed_size));
    const ptrdiff_t size = buffer.get_count() - typed_offset;
    eigen_assert(size >= 0);
    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
                               1, AcMd, global_access, is_place_holder>
        placeholder_accessor_t;
    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
                                           cl::sycl::id<1>(typed_offset)),
                    static_cast<size_t>(typed_offset),
                    reinterpret_cast<std::intptr_t>(start_ptr));
  }
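
  // Offset arithmetic example (assuming T = float, sizeof(float) == 4): for a
  // virtual pointer 64 bytes past the start of its buffer, offset is 64,
  // typed_offset is 16, and the returned accessor starts at element 16 of the
  // reinterpreted float buffer.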
  /// Return an accessor to n_bytes of the buffer behind ptr, bound to the
  /// given command-group handler.
  template <cl::sycl::access::mode AcMd, typename Index>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
                     const Index n_bytes) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    eigen_assert(n_bytes >= 0);
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    auto buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    eigen_assert(offset >= 0);
    eigen_assert((offset + n_bytes) <= buffer.get_size());
    return buffer.template get_access<AcMd, global_access>(
        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
  }
  /// Create a SYCL accessor for the buffer mapped to ptr. If ptr is not yet
  /// in the pointer map, a buffer is created and registered for it.
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr)
        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
            cgh);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr);
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_offset(ptr);
  }
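
  /// The three kernel launchers below differ only in the number of tensor
  /// arguments. Each one binds its placeholder accessors to the command-group
  /// handler, allocates the requested local (scratch) memory and submits the
  /// kernel functor over the given thread range.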
  template <typename OutScalar, typename sycl_kernel, typename Lhs,
            typename Rhs, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
                                                  const Rhs &rhs, OutPtr outptr,
                                                  Range thread_range,
                                                  Index scratchSize,
                                                  T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to the command-group handler.
      lhs.bind(cgh);
      rhs.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename OutPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
                                                 OutPtr &outptr,
                                                 Range thread_range,
                                                 Index scratchSize,
                                                 T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to the command-group handler.
      inptr.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
                                                   Range thread_range,
                                                   Index scratchSize,
                                                   T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessor to the command-group handler.
      inptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }

  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
    set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
    synchronize();
#endif
  }
  /// Prepare the 1D thread range: tileSize threads per work-group and GRange
  /// total threads, with GRange rounded up to a multiple of tileSize.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
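
  // Example (assuming the device's power-of-two work-group limit is 64 and the
  // EIGEN_SYCL_LOCAL_THREAD_DIM defaults are larger): for n = 100, tileSize is
  // 64 and GRange is rounded up from 100 to 128 so that the global range is a
  // multiple of the tile size.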
  /// Prepare a 2D global/local range pair for a SYCL kernel launch.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
  /// Prepare a 3D global/local range pair for a SYCL kernel launch.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0)
        global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(
        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size /
                                        (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
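
  // The work-group budget is split across dimensions through powers of two:
  // dimension 2 gets roughly the cube root of max_workgroup_Size, dimension 1
  // half of the remaining exponent, and dimension 0 whatever is left, so that
  // local_range[0] * local_range[1] * local_range[2] never exceeds the
  // device's maximum work-group size.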
  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type ==
           cl::sycl::info::local_mem_type::local;
#endif
  }
  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return m_device_info.max_mem_alloc_size;
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return m_device_info.max_compute_units;
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return m_device_info.max_work_group_size;
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return m_device_info.max_work_item_sizes;
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return m_device_info.local_mem_size;
  }
  /// Nearest power-of-two work-group size that is <= the device maximum.
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return m_device_info.platform_name;
  }
  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return m_device_info.device_name;
  }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return m_device_info.device_vendor;
  }
  /// Round wGSize to a power of two: the next one above if roundUp is true,
  /// otherwise the nearest one below. The shifts smear the highest set bit
  /// into all lower positions.
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }
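
  // Example: getPowerOfTwo(1000, false) returns 512 (nearest power of two
  // below), while getPowerOfTwo(1000, true) returns 1024 (nearest power of
  // two above).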
  /// Returns true if no asynchronous SYCL exception has been recorded.
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }

  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    return latest_events_[std::this_thread::get_id()];
#else
    return cl::sycl::event();
#endif
  }

  ~QueueInterface() {
    pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }

  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    latest_events_[std::this_thread::get_id()] = e;
#else
    EIGEN_UNUSED_VARIABLE(e);
#endif
  }
 protected:
  void synchronize_and_callback(cl::sycl::event e,
                                const std::function<void()> &callback) const {
    set_latest_event(e);
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }
  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }

  /// class members:
  bool exception_caught_ = false;
  mutable std::mutex pmapper_mutex_;

#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
  mutable std::mutex event_mutex_;
  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif

  /// Maps virtual host pointers to the underlying SYCL buffers.
  mutable TensorSycl::internal::PointerMapper pMapper;
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
  mutable std::unordered_set<void *> scratch_buffers;
#endif
  mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  mutable cl::sycl::program m_prog;
#endif

  /// Thread pool used to wait on events and run callbacks asynchronously.
  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};
struct SyclDeviceBase {
  /// The QueueInterface is not owned; the caller is responsible for its
  /// lifetime.
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream)
      : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    return m_queue_stream;
  }
};
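
// SyclDevice is the device type the tensor evaluators are parameterized on.
// It is a thin, copyable wrapper that forwards every call to the shared (and
// not owned) QueueInterface.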
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream)
      : SyclDeviceBase(queue_stream) {}

  /// Accessor used to construct the evaluators.
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
  }

  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    return queue_stream()->get_sycl_buffer(ptr);
  }
  /// Forward the thread/work-group setup helpers to the queue interface.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }
  /// Allocation and deallocation of device memory.
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    return queue_stream()->allocate(num_bytes);
  }
  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return queue_stream()->allocate_temp(num_bytes);
  }
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    queue_stream()->deallocate(p);
  }
  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  EIGEN_STRONG_INLINE void deallocate_all() const {
    queue_stream()->deallocate_all();
  }
  template <typename data_t>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return queue_stream()->get(data);
  }
  template <typename data_t>
  EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return queue_stream()->get(data);
  }

  /// Attach and detach existing buffers.
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    return queue_stream()->attach_buffer(buf);
  }
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    queue_stream()->detach_buffer(p);
  }
  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    return queue_stream()->get_offset(ptr);
  }
  /// Memory operations, forwarded to the queue interface.
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      Index *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    queue_stream()->memset(data, c, n);
  }
  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    return queue_stream()->sycl_queue();
  }
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
    return queue_stream()->program();
  }
#endif

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // No usable L2/L3 cache information is exposed, so reuse the L1 value.
    return firstLevelCacheSize();
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return queue_stream()->maxSyclThreadsPerBlock();
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return queue_stream()->maxWorkItemSizes();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return queue_stream()->sharedMemPerBlock();
  }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return queue_stream()->majorDeviceVersion();
  }
  EIGEN_STRONG_INLINE void synchronize() const {
    queue_stream()->synchronize();
  }
  EIGEN_STRONG_INLINE void async_synchronize(
      cl::sycl::event e = cl::sycl::event()) const {
    queue_stream()->async_synchronize(e);
  }
  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
    return queue_stream()->get_latest_event();
  }
  EIGEN_STRONG_INLINE bool has_local_memory() const {
    return queue_stream()->has_local_memory();
  }
  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return queue_stream()->max_buffer_size();
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return queue_stream()->getPlatformName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return queue_stream()->getDeviceName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return queue_stream()->getDeviceVendor();
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H