#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP

namespace TensorSycl {

#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
#endif
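
// ScanParameters bundles the geometry of one scan launch: the problem sizes and
// strides plus the thread, block and group decomposition used by the kernels below.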
template <typename index_t>
struct ScanParameters {
  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
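
// ScanKernelFunctor implements a work-efficient (Blelloch-style) block scan: each
// thread scans ScanPerThread elements in registers, the per-thread totals are then
// scanned in local memory, and the block totals are written to a temporary buffer
// so a later pass can stitch the blocks together.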
template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
          scan_step stp>
struct ScanKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
                    OutAccessor out_accessor_, OutAccessor temp_accessor_,
                    const ScanParameters<Index> scanParameters_, Op accumulator_,
                    const bool inclusive_)
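
  // The read() overloads are selected on the scan step: the first pass reads the
  // expression evaluator via coeff(), later passes index the temporary buffer of
  // partial results with operator[].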
  template <scan_step sst = stp, typename Input>
  read(const Input &inpt, Index global_id) {
    return inpt.coeff(global_id);
  template <scan_step sst = stp, typename Input>
  read(const Input &inpt, Index global_id) {
    return inpt[global_id];
  template <scan_step sst = stp, typename InclusiveOp>
  template <scan_step sst = stp, typename InclusiveOp>
    auto scratch_ptr = scratch.get_pointer().get();
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      CoeffReturnType inclusive_scan;
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
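      // Each thread gathers its ScanPerThread inputs into private registers,
      // stepping through global memory by scan_stride via next_elements.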
      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
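      // In-register scan of each packet: the up-sweep builds partial sums, the
      // packet total is parked in local scratch, and the down-sweep turns the packet
      // into an exclusive prefix.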
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
        scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
            private_scan[PacketSize - 1 + packetIndex];
          private_offset >>= 1;
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            private_scan[ai] = private_scan[bi];
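      // Up-sweep (reduce) phase over the per-packet totals held in local memory;
      // the barriers keep the work-group in step between tree levels.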
      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
        itemID.barrier(cl::sycl::access::fence_space::local_space);
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
      itemID.barrier(cl::sycl::access::fence_space::local_space);
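      // One work-item per block publishes the block total to the temporary buffer
      // for the later adjustment pass and resets the last scratch slot to the
      // accumulator identity before the down-sweep.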
          tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
        scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
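      // Down-sweep phase: pairs are swapped and combined so that scratch ends up
      // holding the exclusive prefix of the per-packet totals.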
      for (Index d = 1; d < scratch_stride; d *= 2) {
        itemID.barrier(cl::sycl::access::fence_space::local_space);
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          scratch_ptr[ai] = scratch_ptr[bi];
      itemID.barrier(cl::sycl::access::fence_space::local_space);
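      // Fold the block-level prefix held in scratch back into each thread's private
      // results.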
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
          CoeffReturnType accum = private_scan[packetIndex + i];
          private_scan[packetIndex + i] = accumulator.finalize(accum);
          private_scan[0] = accumulator.finalize(inclusive_scan);
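      // Write the scanned values back to global memory; private_id selects the
      // register slot that corresponds to the inclusive or exclusive result.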
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
          out_ptr[global_id] = private_scan[private_id];
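
// ScanAdjustmentKernelFunctor runs after the per-block scan: it reads the scanned
// block totals and adds the matching offset to every element its block produced in
// the first pass.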
template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
struct ScanAdjustmentKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
                              OutAccessor out_accessor_,
                              const ScanParameters<Index> scanParameters_,
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      CoeffReturnType adjust_val = in_ptr[in_id];
      Index next_elements = 0;
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
          CoeffReturnType accum = adjust_val;
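
// ScanInfo derives the launch configuration (threads per panel, group and block,
// elements per block, local and global ranges) from the scan sizes and the device
// work-group limits.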
template <typename Index>
struct ScanInfo {
  const Eigen::SyclDevice &dev;
           const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
           const Eigen::SyclDevice &dev_)
                          Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
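
// SYCLAdjustBlockOffset launches the block-offset adjustment kernel with the
// geometry computed by ScanInfo.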
template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
struct SYCLAdjustBlockOffset {
                                   Reducer &accumulator, const Index total_size,
                                   const Index scan_size, const Index panel_size,
                                   const Index non_scan_stride, const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
                                                                      scan_info.max_elements_per_block,
                                                                      scan_info.get_scan_parameter(), accumulator);
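
// The device-side scan driver below runs the block-scan kernel over the input,
// scans the per-block totals in a second pass when more than one block is needed,
// then adds the block offsets back onto the partial results and frees the
// temporary buffer.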
template <typename CoeffReturnType, scan_step stp>
  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
                                              const Index non_scan_stride, const bool inclusive,
                                              const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
    CoeffReturnType *temp_pointer =
        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
        in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
        scan_info.get_scan_parameter(), accumulator, inclusive);
    if (scan_info.block_size > 1) {
          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
          non_scan_size, Index(1), scan_info.block_size, false, dev);
          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
          non_scan_stride, dev);
    dev.deallocate_temp(temp_pointer);
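
// Partial specialization of ScanLauncher for Eigen::SyclDevice: it derives the scan
// geometry (scan size and stride, panel and non-scan sizes for the current layout)
// from the evaluator and forwards everything to the device-side driver above.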
template <typename Self, typename Reducer, bool vectorize>
struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
    const Index scan_size = self.size();
    const Index scan_stride = self.stride();
    auto accumulator = self.accumulator();
    auto inclusive = !self.exclusive();
    auto consume_dim = self.consume_dim();
    auto dev = self.device();
    auto dims = self.inner().dimensions();
    Index non_scan_size = 1;
    Index panel_size = 1;
    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < consume_dim; i++) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
        panel_size *= dims[i];
      }
    } else {
      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim - 1; i >= 0; i--) {
        panel_size *= dims[i];
      }
    }
    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
    auto eval_impl = self.inner();
        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP