#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>

namespace Eigen {

namespace TensorSycl {
namespace internal {
/// Cached device properties queried once from the SYCL queue's device.
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_item_sizes>()),
        max_mem_alloc_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(queue.get_device()
                              .template get_info<
                                  cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(
            queue.get_device()
                .template get_info<cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device()
                          .get_platform()
                          .template get_info<cl::sycl::info::platform::name>()),
        device_name(queue.get_device()
                        .template get_info<cl::sycl::info::device::name>()),
        device_vendor(
            queue.get_device()
                .template get_info<cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};
}  // end namespace internal
}  // end namespace TensorSycl

typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
// Returns the list of SYCL devices usable by Eigen, filtering out
// known-problematic platform/device combinations.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
    -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name =
        platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(),
                   platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      bool unsupported_condition =
          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
           vendor.find("apu") == std::string::npos) ||
          (platform_name.find("experimental") != std::string::npos) ||
          device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}
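// Illustrative sketch (not part of the original header): inspecting the
// filtered device list before building a queue. The variable names are
// hypothetical and <iostream> would be needed for the printout.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   for (const auto &d : devices) {
//     std::cout << d.template get_info<cl::sycl::info::device::name>() << "\n";
//   }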
class QueueInterface {
 public:
  /// Create the queue from a SYCL selector or device, with a user-provided
  /// asynchronous exception handler.
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
        m_thread_pool(num_threads),
        m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    m_prog.build_with_kernel_type<DeviceOrSelector>();
    auto f = [&](cl::sycl::handler &cgh) {
      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
                                        [=]() {});
    };
    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
  }

  /// Convenience constructor: forwards asynchronous SYCL exceptions into
  /// exception_caught_ via sycl_async_handler.
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel,
      unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(dev_or_sel,
                       [this](cl::sycl::exception_list l) {
                         this->exception_caught_ = this->sycl_async_handler(l);
                       },
                       num_threads) {}
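  // Illustrative sketch (not part of the original header): the delegating
  // constructor above is the common entry point. For example:
  //
  //   Eigen::QueueInterface queue_interface(cl::sycl::default_selector());
  //
  // Passing a custom cl::sycl::async_handler to the first constructor is an
  // alternative when the caller wants to log or rethrow SYCL errors itself.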
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif

  /// Attach an existing SYCL buffer to the pointer mapper.
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return static_cast<void *>(pMapper.add_pointer(buf));
  }

  /// Detach a previously attached buffer without freeing it.
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree<false>(p, pMapper);
  }
  /// Allocate num_bytes on the device. The returned pointer is a host-side
  /// key into the pointer mapper, not a raw device address.
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
  }

  /// Allocate temporary (scratch) memory, reusing a previously released
  /// scratch buffer of sufficient size when buffer reuse is enabled.
  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    for (auto it = scratch_buffers.begin(); it != scratch_buffers.end(); ++it) {
      auto buff = pMapper.get_buffer(*it);
      if (buff.get_size() >= num_bytes) {
        void *ptr = *it;
        scratch_buffers.erase(it);
        return ptr;
      }
    }
#endif
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
  }
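  // Illustrative sketch (not part of the original header): allocate_temp and
  // deallocate_temp (defined below) are meant to be paired so intermediate
  // buffers used during expression evaluation are recycled instead of being
  // re-created for every kernel. Hypothetical usage:
  //
  //   void *scratch = queue_interface.allocate_temp(1024);
  //   // ... use scratch as a kernel workspace ...
  //   queue_interface.deallocate_temp(scratch);  // returned to scratch_buffers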
  template <typename data_t>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(
        data);
  }
  template <typename data_t>
  EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return static_cast<data_t *>(data.get_virtual_pointer());
  }

  /// Release a temporary allocation; with buffer reuse enabled the buffer is
  /// kept in scratch_buffers for later allocate_temp calls.
  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.insert(p);
#else
    TensorSycl::internal::SYCLfree(p, pMapper);
#endif
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
    deallocate_temp(p.get_virtual_pointer());
  }

  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree(p, pMapper);
  }

  EIGEN_STRONG_INLINE void deallocate_all() const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }
  /// Copy n bytes from host memory to device memory. The copy is submitted to
  /// the queue; if a callback is provided it runs after the copy completes,
  /// otherwise the call blocks (see synchronize_and_callback).
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
        write_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
      auto non_deleter = [](buffer_scalar_t const *) {};
      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(s_ptr, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }
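  // Illustrative sketch (not part of the original header): copying a host
  // array into a device allocation obtained from allocate(). The variable
  // names are hypothetical.
  //
  //   std::vector<float> host(256, 1.0f);
  //   void *device_ptr = queue_interface.allocate(host.size() * sizeof(float));
  //   queue_interface.memcpyHostToDevice(device_ptr, host.data(),
  //                                      host.size() * sizeof(float),
  //                                      /*callback=*/{});  // empty => blocking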
  /// Copy n bytes from device memory back to host memory. As above, an empty
  /// callback makes the call blocking.
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
        read_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
      auto non_deleter = [](buffer_scalar_t *) {};
      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(src_acc, s_ptr);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }
  /// Device-to-device copy of n bytes between two virtual pointers.
  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      cgh.copy(src_acc, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }
  /// Fill n bytes of device memory with the byte value c (as std::memset).
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
      // The cast to uint8_t matches std::memset semantics; the outer cast
      // matches the accessor's buffer_scalar_t element type.
      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }
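  // Illustrative note (not part of the original header): as with std::memset
  // only the low byte of c is used, so zero-initialising a float buffer works,
  // but memset(ptr, 1, n) writes the byte pattern 0x01 everywhere, not 1.0f.
  //
  //   queue_interface.memset(device_ptr, 0, 256 * sizeof(float));  // all zeros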
  /// Return a placeholder range accessor covering the device memory from the
  /// virtual pointer to the end of its buffer.
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;

    std::lock_guard<std::mutex> lock(pmapper_mutex_);

    auto original_buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    const ptrdiff_t typed_offset = offset / sizeof(T);
    const auto typed_size = original_buffer.get_size() / sizeof(T);
    auto buffer = original_buffer.template reinterpret<
        typename Eigen::internal::remove_const<T>::type>(
        cl::sycl::range<1>(typed_size));
    const ptrdiff_t size = buffer.get_count() - typed_offset;
    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
                               1, AcMd, global_access, is_place_holder>
        placeholder_accessor_t;
    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
                                           cl::sycl::id<1>(typed_offset)),
                    static_cast<size_t>(typed_offset),
                    reinterpret_cast<std::intptr_t>(start_ptr));
  }
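  // Illustrative note (not part of the original header): the returned
  // RangeAccess wraps a *placeholder* accessor plus the typed offset of ptr
  // inside its buffer, so evaluators can keep it by value and attach it to a
  // command group later via bind(cgh), as the kernel launchers below do.
  // Hypothetical usage:
  //
  //   auto in_acc = queue_interface
  //       .get_range_accessor<cl::sycl::access::mode::read, const float>(ptr);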
  /// Return an accessor covering n_bytes elements of the underlying byte
  /// buffer starting at ptr, registered with the given command-group handler.
  template <cl::sycl::access::mode AcMd, typename Index>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
                     const Index n_bytes) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    auto buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    return buffer.template get_access<AcMd, global_access>(
        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
  }
  /// Return an accessor to the whole buffer that contains the virtual pointer.
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr)
        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
            cgh);
  }
  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr);
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_offset(ptr);
  }
  template <typename OutScalar, typename sycl_kernel, typename Lhs,
            typename Rhs, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
                                                  const Rhs &rhs, OutPtr outptr,
                                                  Range thread_range,
                                                  Index scratchSize,
                                                  T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to this command-group handler.
      lhs.bind(cgh);
      rhs.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename OutPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
                                                 OutPtr &outptr,
                                                 Range thread_range,
                                                 Index scratchSize,
                                                 T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessors to this command-group handler.
      inptr.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
                                                   Range thread_range,
                                                   Index scratchSize,
                                                   T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // Bind the placeholder accessor to this command-group handler.
      inptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }

  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
    set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
    synchronize();
#endif
  }

  /// Compute the 1D work-group size (tileSize) and global range (GRange) for
  /// a kernel over n elements.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
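  // Illustrative worked example (not part of the original header), assuming
  // getNearestPowerOfTwoWorkGroupSize() == 256 and
  // EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 >= 256:
  //
  //   Index tileSize, rng, GRange;
  //   queue_interface.parallel_for_setup(Index(1000), tileSize, rng, GRange);
  //   // tileSize == 256, rng == 1000, and GRange is rounded up to the next
  //   // multiple of the tile size, i.e. GRange == 1024.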
  /// Compute the 2D global and local (work-group) ranges for a kernel over
  /// input_dim, splitting the work-group size across both dimensions.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
  /// Compute the 3D global and local (work-group) ranges for a kernel over
  /// input_dim, splitting the work-group size across all three dimensions.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0)
        global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(
        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size /
                                        (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type ==
           cl::sycl::info::local_mem_type::local;
#endif
  }
  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return m_device_info.max_mem_alloc_size;
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return m_device_info.max_compute_units;
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return m_device_info.max_work_group_size;
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return m_device_info.max_work_item_sizes;
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return m_device_info.local_mem_size;
  }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return m_device_info.platform_name;
  }
  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return m_device_info.device_name;
  }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return m_device_info.device_vendor;
  }
  /// Return the nearest power of two: the result is >= wGSize when roundUp is
  /// true and <= wGSize otherwise.
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }
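  // Illustrative worked example (not part of the original header): the shift
  // chain smears the highest set bit into every lower position, so for
  // wGSize = 1000 the ORs yield 1023. With roundUp == false the result is
  // 1023 - 511 = 512 (largest power of two <= 1000); with roundUp == true the
  // pre-decrement/increment pair gives 1024 (smallest power of two >= 1000).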
  /// Return true when no asynchronous SYCL error has been recorded. The call
  /// synchronizes first so pending errors get reported by the handler.
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }

  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    return latest_events_[std::this_thread::get_id()];
#else
    return cl::sycl::event();
#endif
  }

  ~QueueInterface() {
    pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }
  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    latest_events_[std::this_thread::get_id()] = e;
#else
    EIGEN_UNUSED_VARIABLE(e);
#endif
  }
  void synchronize_and_callback(cl::sycl::event e,
                                const std::function<void()> &callback) const {
    set_latest_event(e);
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }
  /// The async handler records whether any exception was raised on the queue.
  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
      }
    }
    return exception_caught;
  }
  /// class members:
  bool exception_caught_ = false;

  mutable std::mutex pmapper_mutex_;

#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
  mutable std::mutex event_mutex_;
  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif

  /// The pointer mapper guarantees a single buffer per virtual pointer and
  /// ties the buffers' lifetime to this QueueInterface.
  mutable TensorSycl::internal::PointerMapper pMapper;
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
  mutable std::unordered_set<void *> scratch_buffers;
#endif
  /// The SYCL queue all operations are submitted to.
  mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  mutable cl::sycl::program m_prog;
#endif

  /// Thread pool used to wait on events and run callbacks asynchronously.
  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};
struct SyclDeviceBase {
  /// QueueInterface is not owned; the caller is responsible for destroying it.
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream)
      : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    return m_queue_stream;
  }
};
// The device object passed to tensor expressions; it is internal and forwards
// every operation to the QueueInterface it wraps.
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream)
      : SyclDeviceBase(queue_stream) {}

  /// Accessor used to construct evaluators on top of a virtual pointer.
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
  }
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    return queue_stream()->get_sycl_buffer(ptr);
  }
  /// Determine the number of threads and the work-group size for 1D kernels.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  /// Determine the global and local ranges for 2D kernels.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  /// Determine the global and local ranges for 3D kernels.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }
  /// Memory management, forwarded to the QueueInterface.
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    return queue_stream()->allocate(num_bytes);
  }
  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return queue_stream()->allocate_temp(num_bytes);
  }
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    queue_stream()->deallocate(p);
  }
  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  EIGEN_STRONG_INLINE void deallocate_all() const {
    queue_stream()->deallocate_all();
  }
  template <typename data_t>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return queue_stream()->get(data);
  }
  template <typename data_t>
  EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return queue_stream()->get(data);
  }

  /// Attach a buffer to the pointer mapper.
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    return queue_stream()->attach_buffer(buf);
  }
  /// Detach a buffer from the pointer mapper.
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    queue_stream()->detach_buffer(p);
  }
  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    return queue_stream()->get_offset(ptr);
  }
  /// Copy data from host to device (forwarded to the QueueInterface).
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      Index *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }
  /// Copy data from device to host.
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }
  /// Device-to-device copy.
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }
  /// Fill device memory with the byte value c.
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    queue_stream()->memset(data, c, n);
  }
  /// Return the underlying SYCL queue.
  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    return queue_stream()->sycl_queue();
  }
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
    return queue_stream()->program();
  }
#endif
  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // No L2/L3 cache is modelled for SYCL devices; reuse the L1 size.
    return firstLevelCacheSize();
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return queue_stream()->maxSyclThreadsPerBlock();
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return queue_stream()->maxWorkItemSizes();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return queue_stream()->sharedMemPerBlock();
  }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return queue_stream()->majorDeviceVersion();
  }
  EIGEN_STRONG_INLINE void synchronize() const {
    queue_stream()->synchronize();
  }
  EIGEN_STRONG_INLINE void async_synchronize(
      cl::sycl::event e = cl::sycl::event()) const {
    queue_stream()->async_synchronize(e);
  }
  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
    return queue_stream()->get_latest_event();
  }
  EIGEN_STRONG_INLINE bool has_local_memory() const {
    return queue_stream()->has_local_memory();
  }
  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return queue_stream()->max_buffer_size();
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return queue_stream()->getPlatformName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return queue_stream()->getDeviceName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return queue_stream()->getDeviceVendor();
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
};
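// Illustrative sketch (not part of the original header): the usual way the
// tensor module is pointed at a SYCL device. The host arrays host_in/host_out
// and the size n are assumed to exist; any TensorMap expression works the same.
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector());
//   Eigen::SyclDevice sycl_device(&queue_interface);
//
//   float *gpu_in = static_cast<float *>(sycl_device.allocate(n * sizeof(float)));
//   float *gpu_out = static_cast<float *>(sycl_device.allocate(n * sizeof(float)));
//   sycl_device.memcpyHostToDevice(gpu_in, host_in, n * sizeof(float));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> in(gpu_in, n), out(gpu_out, n);
//   out.device(sycl_device) = in * 2.0f;
//
//   sycl_device.memcpyDeviceToHost(host_out, gpu_out, n * sizeof(float));
//   sycl_device.deallocate(gpu_in);
//   sycl_device.deallocate(gpu_out);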
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H