#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP

namespace Eigen {
namespace TensorSycl {
namespace internal {
#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
#endif

// ScanParameters bundles the sizes, strides and thread counts describing one partial-scan launch.
template <typename index_t>
struct ScanParameters {
  // Number of elements each work-item scans; must be a power of two.
  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
  const index_t total_size;
  const index_t non_scan_size;
  const index_t scan_size;
  const index_t non_scan_stride;
  const index_t scan_stride;
  const index_t panel_threads;
  const index_t group_threads;
  const index_t block_threads;
  const index_t elements_per_group;
  const index_t elements_per_block;
  const index_t loop_range;

  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
      : total_size(total_size_), non_scan_size(non_scan_size_), scan_size(scan_size_),
        non_scan_stride(non_scan_stride_), scan_stride(scan_stride_), panel_threads(panel_threads_),
        group_threads(group_threads_), block_threads(block_threads_), elements_per_group(elements_per_group_),
        elements_per_block(elements_per_block_), loop_range(loop_range_) {}
};
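// The host-side sketch below (illustrative only, not part of Eigen) shows how the size/stride
// fields above describe a partial scan: for a ColMajor matrix of shape (rows, cols) scanned
// along dimension 0, scan_size = rows, scan_stride = 1, non_scan_size = cols and
// non_scan_stride = rows. The function name and loop are hypothetical reference code for the
// computation the kernels below perform on the device.
#if 0
#include <cstddef>
#include <vector>

void reference_inclusive_scan(const std::vector<float> &in, std::vector<float> &out,
                              std::ptrdiff_t scan_size, std::ptrdiff_t scan_stride,
                              std::ptrdiff_t non_scan_size, std::ptrdiff_t non_scan_stride) {
  for (std::ptrdiff_t j = 0; j < non_scan_size; ++j) {  // one independent scan per non-scan slot
    float running = 0.f;
    for (std::ptrdiff_t i = 0; i < scan_size; ++i) {  // walk along the scanned dimension
      const std::ptrdiff_t idx = j * non_scan_stride + i * scan_stride;
      running += in[idx];
      out[idx] = running;
    }
  }
}
#endif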
enum class scan_step { first, second };

// ScanKernelFunctor performs the per-block scan: in the first step it reads its input through
// the tensor evaluator; in the second step it re-scans the block totals in the temporary buffer.
template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
          scan_step stp>
struct ScanKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;

  LocalAccessor scratch;
  Evaluator dev_eval;
  OutAccessor out_accessor;
  OutAccessor temp_accessor;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  const bool inclusive;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
                                                          const bool inclusive_)
      : scratch(scratch_), dev_eval(dev_eval_), out_accessor(out_accessor_), temp_accessor(temp_accessor_),
        scanParameters(scanParameters_), accumulator(accumulator_), inclusive(inclusive_) {}
  // read: loads data from global memory. In the first step the data comes from the tensor
  // evaluator; in the second step it comes from the temporary buffer.
  template <scan_step sst = stp, typename Input>
  typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
      EIGEN_STRONG_INLINE
      read(const Input &inpt, Index global_id) {
    return inpt.coeff(global_id);
  }
  template <scan_step sst = stp, typename Input>
  typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
      EIGEN_STRONG_INLINE
      read(const Input &inpt, Index global_id) {
    return inpt[global_id];
  }
  // first_step_inclusive_Operation only runs its argument in the first scan step; the second
  // step always performs an exclusive scan of the block totals.
  template <scan_step sst = stp, typename InclusiveOp>
  typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  first_step_inclusive_Operation(InclusiveOp inclusive_op) {
    inclusive_op();
  }

  template <scan_step sst = stp, typename InclusiveOp>
  typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  first_step_inclusive_Operation(InclusiveOp) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    auto out_ptr = out_accessor.get_pointer();
    auto tmp_ptr = temp_accessor.get_pointer();
    auto scratch_ptr = scratch.get_pointer().get();

    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      // Decompose the flat thread id into panel / group / block / local coordinates.
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;
      // One element per packet goes to local (scratch) memory.
      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
      CoeffReturnType inclusive_scan;
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      Index next_elements = 0;
      // Load ScanPerThread elements into registers, padding out-of-range slots with the identity.
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
                           (global_id < scanParameters.total_size))
                              ? read(dev_eval, global_id)
                              : accumulator.initialize();
        next_elements += scanParameters.scan_stride;
      }
      // For an inclusive scan in the first step, remember the last loaded element.
      first_step_inclusive_Operation([&]() {
        if (inclusive) {
          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
        }
      });
      // Per-thread Blelloch scan: an up-sweep followed by a down-sweep over each packet.
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        Index private_offset = 1;
        // Build the sums in place up the tree.
        EIGEN_UNROLL_LOOP
        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[bi] = accumulator.finalize(accum);
          }
          private_offset *= 2;
        }
        // Publish the packet total to local memory and clear the last element.
        scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
            private_scan[PacketSize - 1 + packetIndex];
        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
        // Traverse down the tree and build the exclusive scan of the packet.
        EIGEN_UNROLL_LOOP
        for (Index d = 1; d < PacketSize; d *= 2) {
          private_offset >>= 1;
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[ai] = private_scan[bi];
            private_scan[bi] = accumulator.finalize(accum);
          }
        }
      }
      // Work-group level Blelloch scan over the per-packet totals held in local memory.
      Index offset = 1;
      // Build the sums in place up the tree.
      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch_ptr[ai], &accum);
          accumulator.reduce(scratch_ptr[bi], &accum);
          scratch_ptr[bi] = accumulator.finalize(accum);
        }
        offset *= 2;
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // The first work-item saves the block total for the next scan step and clears the last element.
      if (local_id == 0) {
        if ((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1) {
          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
                                    scanParameters.non_scan_size +
                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
                                block_id;
          tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
        }
        scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
      }
      // Traverse down the tree and build the exclusive scan.
      for (Index d = 1; d < scratch_stride; d *= 2) {
        offset >>= 1;
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch_ptr[ai], &accum);
          accumulator.reduce(scratch_ptr[bi], &accum);
          scratch_ptr[ai] = scratch_ptr[bi];
          scratch_ptr[bi] = accumulator.finalize(accum);
        }
      }
      // Synchronise before reading the scanned totals back.
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // Fold the work-group prefix for this packet into the per-thread results.
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        EIGEN_UNROLL_LOOP
        for (Index i = 0; i < PacketSize; i++) {
          CoeffReturnType accum = private_scan[packetIndex + i];
          accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
          private_scan[packetIndex + i] = accumulator.finalize(accum);
        }
      }
      // For an inclusive first-step scan, complete the total of the last element.
      first_step_inclusive_Operation([&]() {
        if (inclusive) {
          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
          private_scan[0] = accumulator.finalize(inclusive_scan);
        }
      });
      next_elements = 0;
      // Write the scanned values back to global memory (shifted by one slot for inclusive scans).
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * inclusive);
          out_ptr[global_id] = private_scan[private_id];
        }
        next_elements += scanParameters.scan_stride;
      }
    }
  }
};
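// The loop nests in operator() above implement the work-efficient (Blelloch) scan: an up-sweep
// builds partial reductions in a binary tree, then a down-sweep turns them into an exclusive
// prefix. A minimal host-side sketch of the same pattern on a power-of-two array, with a plain
// sum standing in for the generic initialize()/reduce()/finalize() accumulator (illustrative
// only, not Eigen API):
#if 0
#include <cstddef>
#include <vector>

void blelloch_exclusive_scan(std::vector<float> &data) {
  const std::ptrdiff_t n = static_cast<std::ptrdiff_t>(data.size());  // assumed to be a power of two
  std::ptrdiff_t offset = 1;
  // Up-sweep: build the sums in place up the tree.
  for (std::ptrdiff_t d = n >> 1; d > 0; d >>= 1) {
    for (std::ptrdiff_t l = 0; l < d; ++l) {
      const std::ptrdiff_t ai = offset * (2 * l + 1) - 1;
      const std::ptrdiff_t bi = offset * (2 * l + 2) - 1;
      data[bi] += data[ai];
    }
    offset *= 2;
  }
  data[n - 1] = 0.f;  // clear the last element (identity of the reduction)
  // Down-sweep: traverse back down and build the exclusive scan.
  for (std::ptrdiff_t d = 1; d < n; d *= 2) {
    offset >>= 1;
    for (std::ptrdiff_t l = 0; l < d; ++l) {
      const std::ptrdiff_t ai = offset * (2 * l + 1) - 1;
      const std::ptrdiff_t bi = offset * (2 * l + 2) - 1;
      const float t = data[ai];
      data[ai] = data[bi];
      data[bi] += t;
    }
  }
}
#endif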
// ScanAdjustmentKernelFunctor adds the scanned block totals back into the per-block partial
// results so that they become a single global scan.
template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
struct ScanAdjustmentKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  InAccessor in_accessor;
  OutAccessor out_accessor;
  const ScanParameters<Index> scanParameters;
  Op accumulator;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
                                                                    OutAccessor out_accessor_,
                                                                    const ScanParameters<Index> scanParameters_,
                                                                    Op accumulator_)
      : in_accessor(in_accessor_),
        out_accessor(out_accessor_),
        scanParameters(scanParameters_),
        accumulator(accumulator_) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    auto in_ptr = in_accessor.get_pointer();
    auto out_ptr = out_accessor.get_pointer();

    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      // Decompose the flat thread id into panel / group / block / local coordinates.
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      // Locate this block's scanned total in the temporary buffer.
      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
      CoeffReturnType adjust_val = in_ptr[in_id];
      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          CoeffReturnType accum = adjust_val;
          accumulator.reduce(out_ptr[global_id], &accum);
          out_ptr[global_id] = accumulator.finalize(accum);
        }
        next_elements += scanParameters.scan_stride;
      }
    }
  }
};
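// What the adjustment kernel computes, reduced to its essentials on the host: each block already
// holds its local scan, and block_offsets[b] holds the scanned total of all preceding blocks;
// adding that offset to every element of block b yields the global scan. Function and parameter
// names are hypothetical (illustrative only):
#if 0
#include <cstddef>
#include <vector>

void adjust_block_offsets(std::vector<float> &out, const std::vector<float> &block_offsets,
                          std::ptrdiff_t elements_per_block) {
  // Assumes out.size() == block_offsets.size() * elements_per_block.
  for (std::size_t b = 0; b < block_offsets.size(); ++b) {
    for (std::ptrdiff_t i = 0; i < elements_per_block; ++i) {
      out[b * elements_per_block + i] += block_offsets[b];
    }
  }
}
#endif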
// ScanInfo derives the launch configuration (work-group size, elements per block/group/panel,
// loop range) from the scan sizes and the device limits.
template <typename Index>
struct ScanInfo {
  const Index &total_size;
  const Index &scan_size;
  const Index &panel_size;
  const Index &non_scan_size;
  const Index &scan_stride;
  const Index &non_scan_stride;
  Index max_elements_per_block;
  Index block_size;
  Index panel_threads;
  Index group_threads;
  Index block_threads;
  Index elements_per_group;
  Index elements_per_block;
  Index loop_range;
  Index global_range;
  Index local_range;
  const Eigen::SyclDevice &dev;

  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
                               const Eigen::SyclDevice &dev_)
      : total_size(total_size_),
        scan_size(scan_size_),
        panel_size(panel_size_),
        non_scan_size(non_scan_size_),
        scan_stride(scan_stride_),
        non_scan_stride(non_scan_stride_),
        dev(dev_) {
    // The work-group size must be a power of two.
    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
    // Round the scan size up to a power of two so the binary-tree scan applies.
    elements_per_group =
        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
    const Index elements_per_panel = elements_per_group * non_scan_size;
    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
    block_size = elements_per_group / elements_per_block;
#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
#else
    const Index max_threads = panel_threads * panel_size;
#endif
    global_range = roundUp(max_threads, local_range);
    loop_range = Index(
        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
  }

  EIGEN_STRONG_INLINE ScanParameters<Index> get_scan_parameter() {
    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
  }
  EIGEN_STRONG_INLINE cl::sycl::nd_range<1> get_thread_range() {
    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
  }
};
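// A worked example of the arithmetic above, under stated assumptions: ScanPerThread == 8, a
// device whose nearest power-of-two work-group size is 256, and a per-block cap of
// local_range * ScanPerThread. All numbers and the function name are hypothetical
// (illustrative only):
#if 0
#include <algorithm>
#include <cmath>
#include <cstdint>

void scan_info_walkthrough() {
  const std::int64_t scan_per_thread = 8;
  const std::int64_t local_range = 256;
  const std::int64_t scan_size = 1000, non_scan_size = 4, panel_size = 1;

  // roundUp(1000, 8) == 1000; the next power of two is 1024.
  const std::int64_t rounded = ((scan_size + scan_per_thread - 1) / scan_per_thread) * scan_per_thread;
  std::int64_t elements_per_group = 1;
  while (elements_per_group < rounded) elements_per_group <<= 1;                                        // 1024
  const std::int64_t elements_per_panel = elements_per_group * non_scan_size;                           // 4096
  const std::int64_t elements_per_block = std::min(elements_per_group, local_range * scan_per_thread);  // 1024
  const std::int64_t panel_threads = elements_per_panel / scan_per_thread;                              // 512
  const std::int64_t global_range =
      ((panel_threads * panel_size + local_range - 1) / local_range) * local_range;                     // 512
  const std::int64_t loop_range = static_cast<std::int64_t>(
      std::ceil(double(elements_per_panel * panel_size) / double(global_range * scan_per_thread)));     // 1
  (void)elements_per_block;
  (void)loop_range;
}
#endif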
// Launches ScanAdjustmentKernelFunctor with the configuration computed by ScanInfo.
template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
struct SYCLAdjustBlockOffset {
  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
                                                           Reducer &accumulator, const Index total_size,
                                                           const Index scan_size, const Index panel_size,
                                                           const Index non_scan_size, const Index scan_stride,
                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);

    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
        AdjustFuctor;
    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
                                                                      scan_info.max_elements_per_block,
                                                                      scan_info.get_scan_parameter(), accumulator);
  }
};
// ScanLauncher_impl drives the multi-pass scan: it scans each block, stores the block totals in a
// temporary buffer, recursively scans those totals, and then adjusts the per-block results.
template <typename CoeffReturnType, scan_step stp>
struct ScanLauncher_impl {
  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
                                             const Index total_size, const Index scan_size, const Index panel_size,
                                             const Index non_scan_size, const Index scan_stride,
                                             const Index non_scan_stride, const bool inclusive,
                                             const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
    CoeffReturnType *temp_pointer =
        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);

    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
        in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
        scan_info.get_scan_parameter(), accumulator, inclusive);

    if (scan_info.block_size > 1) {
      // Scan the per-block totals (always an exclusive scan with stride 1)...
      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
          non_scan_size, Index(1), scan_info.block_size, false, dev);
      // ...and fold them back into the per-block results.
      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
          non_scan_stride, dev);
    }
    dev.deallocate_temp(temp_pointer);
  }
};
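// The control flow above is a three-phase strategy: (1) scan each block and stash the block
// totals in a temporary buffer, (2) scan those totals with a second, exclusive pass
// (scan_step::second, stride 1), (3) fold the scanned totals back into the per-block results.
// Once block_size == 1 the data fits in a single block and no adjustment is needed. A compact
// host-side sketch of the same idea with a plain sum (illustrative only, names are hypothetical):
#if 0
#include <algorithm>
#include <cstddef>
#include <vector>

void two_level_inclusive_scan(std::vector<float> &data, std::ptrdiff_t block_len) {
  const std::ptrdiff_t total = static_cast<std::ptrdiff_t>(data.size());
  const std::ptrdiff_t num_blocks = (total + block_len - 1) / block_len;
  std::vector<float> block_totals(num_blocks, 0.f);
  // Phase 1: independent per-block scans, recording each block's total.
  for (std::ptrdiff_t b = 0; b < num_blocks; ++b) {
    float running = 0.f;
    for (std::ptrdiff_t i = b * block_len; i < std::min((b + 1) * block_len, total); ++i) {
      running += data[i];
      data[i] = running;
    }
    block_totals[b] = running;
  }
  // Phase 2: exclusive scan of the block totals (the "second step" scan).
  float carry = 0.f;
  for (std::ptrdiff_t b = 0; b < num_blocks; ++b) {
    const float t = block_totals[b];
    block_totals[b] = carry;
    carry += t;
  }
  // Phase 3: adjustment, add each block's offset back into its elements.
  for (std::ptrdiff_t b = 0; b < num_blocks; ++b) {
    for (std::ptrdiff_t i = b * block_len; i < std::min((b + 1) * block_len, total); ++i) {
      data[i] += block_totals[b];
    }
  }
}
#endif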
}  // namespace internal
}  // namespace TensorSycl

namespace internal {
// Partial specialization of the generic ScanLauncher for the SYCL device: it maps the tensor
// dimensions onto scan/non-scan/panel sizes and strides and forwards to the implementation above.
template <typename Self, typename Reducer, bool vectorize>
struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
  typedef typename Self::Index Index;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;

  void operator()(Self &self, EvaluatorPointerType data) {
    const Index total_size = array_prod(self.dimensions());
    const Index scan_size = self.size();
    const Index scan_stride = self.stride();
    // this is the scan op (can be sum or ...)
    auto accumulator = self.accumulator();
    auto inclusive = !self.exclusive();
    auto consume_dim = self.consume_dim();
    auto dev = self.device();

    auto dims = self.inner().dimensions();

    Index non_scan_size = 1;
    Index panel_size = 1;
    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < consume_dim; i++) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
        panel_size *= dims[i];
      }
    } else {
      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim - 1; i >= 0; i--) {
        panel_size *= dims[i];
      }
    }
    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
    auto eval_impl = self.inner();
    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
        inclusive, dev);
  }
};
}  // namespace internal
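// Usage sketch: user code never calls the launcher directly; a cumulative-sum expression
// evaluated on a SyclDevice dispatches to the specialization above. This assumes a SYCL-enabled
// Eigen build; the device-selection helpers and the tensor sizes below are illustrative, not a
// fixed API contract.
#if 0
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  auto devices = Eigen::get_sycl_supported_devices();
  Eigen::QueueInterface queue_interface(devices[0]);
  Eigen::SyclDevice sycl_device(&queue_interface);

  Eigen::Tensor<float, 2> in(64, 32), out(64, 32);
  in.setRandom();

  float *d_in = static_cast<float *>(sycl_device.allocate(in.size() * sizeof(float)));
  float *d_out = static_cast<float *>(sycl_device.allocate(out.size() * sizeof(float)));
  Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_in(d_in, 64, 32), gpu_out(d_out, 64, 32);

  sycl_device.memcpyHostToDevice(d_in, in.data(), in.size() * sizeof(float));
  // cumsum along dimension 0 routes through ScanLauncher<..., Eigen::SyclDevice, ...>.
  gpu_out.device(sycl_device) = gpu_in.cumsum(0);
  sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));

  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_out);
  return 0;
}
#endif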
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP