#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
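// OpDefiner maps the user-facing reducer onto the operator that actually runs on
// the device. For most reducers it is a pass-through; for MeanReducer it swaps in
// a SumReducer and performs the division by the element count in finalise_op.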
template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    // The mean is computed as a sum followed by a single division by the number of reduced elements.
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;

  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};
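// Second phase of a full reduction: the partial results produced by the first
// kernel (one per work-group) are combined into a single value. The kernel is
// launched with exactly one work-group, so each thread reads one partial result
// and a local-memory tree reduction collapses them.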
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
  void operator()(cl::sycl::nd_item<1> itemID) {
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI.get_pointer() + localid;
    auto aOutPtr = outAcc.get_pointer();
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    // Local-memory tree reduction; local_range is a power of two.
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};
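// First phase of a full reduction. Every thread strides over the flattened input,
// accumulating packets when both the reducer and the input support packet access
// and plain scalars otherwise; the work-group then collapses its partial results
// through a local-memory tree reduction and writes one value per group.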
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef OpDefiner<OpType, CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;
  typedef
      typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
                                              PacketReturnType, CoeffReturnType>::type OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // Vectorised part: each thread strides over the input in whole packets.
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // The tail that does not fill a whole packet is folded in coefficient by coefficient.
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // Local-memory tree reduction; local_range is a power of two.
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }
  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    // Scalar path: no packet access, so reduce one coefficient at a time.
    CoeffReturnType accumulator = op.initialize();
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
    // Local-memory tree reduction; local_range is a power of two.
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};
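// Generic per-output reduction kernel: each work-item computes one output
// coefficient by sequentially folding every input value that maps onto it
// (see GenericReducer further down, which launches it).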
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;

  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_), output_accessor(output_accessor_), functor(OpDef::get_op(functor_)), range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    auto output_accessor_ptr = output_accessor.get_pointer();
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }
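// First phase of a partial (per-output) reduction. Each work-group owns a
// (LocalThreadSizeP x LocalThreadSizeR) tile: the P direction walks the preserved
// coefficients and the R direction splits the reduced dimension, with rt selecting
// whether the inner-most or outer-most dimension is the one being reduced.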
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    // Each thread strides over its share of the reduced dimension, folding the values into one accumulator.
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    // Map the linear local id onto a (P, R) tile; the mapping is transposed depending
    // on which dimension is being reduced.
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    auto scratchPtr = scratch.get_pointer().get();
    auto outPtr =
        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // Tree reduction along the R direction of the tile; LocalThreadSizeR is a power of two.
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        *out_scratch_ptr = op.finalize(accumulator);
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};
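// Second phase of a partial reduction: when the reduced dimension was split across
// several work-groups, every preserved coefficient still has rNumGroups partial
// values in a temporary buffer; one thread per preserved coefficient folds them
// into the final output.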
template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, OutputAccessor output_accessor_,
                             OpType op_, const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    const Index globalId = itemID.get_global_id(0);
    if (globalId >= num_coeffs_to_preserve) return;
    auto in_ptr = input_accessor.get_pointer() + globalId;
    OutScalar accumulator = op.initialize();
    // The partial results for one preserved coefficient are strided by num_coeffs_to_preserve in the temporary buffer.
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
  }
};
template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};
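// Host-side launcher for partial reductions. It sizes the (P, R) grid, runs
// PartialReductionKernel, and, when the reduced dimension had to be split over
// more than one group, runs SecondStepPartialReduction on the temporaries.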
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;
  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;

    // Empirically, each thread should reduce at least 64 elements before splitting the
    // reduced dimension over several groups pays off.
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups =
        (num_coeffs_to_preserve + PannelParameters::LocalThreadSizeP - 1) / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      // Several groups reduce into the same preserved coefficients, so the partial
      // results go to a temporary buffer and a second kernel folds them together.
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);

      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;

      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
          temp_accessor, output,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
          reducer, num_coeffs_to_preserve, rNumGroups);

      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);
    }
    return false;
  }
};

}  // namespace internal
}  // namespace TensorSycl

namespace internal {
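// Entry point used by TensorReductionOp when every dimension of the input is
// reduced on a SyclDevice. The first kernel writes one partial result per
// work-group; if more than one group was needed, SecondStepFullReducer collapses
// the partial results into the single output coefficient.
//
// For illustration, a host-side expression that ends up in this specialisation
// (a sketch only; buffer names and sizes are made up, and it assumes the SYCL
// queue and device are set up as usual for the Eigen SYCL backend):
//
//   Eigen::QueueInterface queue_interface{cl::sycl::default_selector{}};
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   float *gpu_in = static_cast<float *>(sycl_device.allocate(512 * 512 * sizeof(float)));
//   float *gpu_out = static_cast<float *>(sycl_device.allocate(sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(gpu_in, 512, 512);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(gpu_out);
//   out.device(sycl_device) = in.sum();  // dispatches to FullReducer below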
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::Index Index;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // Empirically, each thread should fold roughly 2048 elements before a second
    // reduction step is worthwhile; the number of work-groups is a power of two and
    // is capped at local_range so the second step fits in a single work-group.
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    // OutType is a packet when the kernel runs vectorised, a plain scalar otherwise.
    typedef typename conditional<Vectorizable, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      // Two-step reduction: write one partial result per work-group into a temporary
      // buffer, then let SecondStepFullReducer combine them.
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer);

      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
          tmp_global_accessor, data,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
          reducer);

      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer);
    }
  }
};
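// Partial reductions over the outer-most and inner-most dimensions both funnel
// into PartialReducerLauncher; only the reduction_dim tag differs, which selects
// how threads are mapped onto the preserved and reduced coordinates.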
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
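// Generic fallback for reductions that are neither full nor purely inner/outer:
// it launches GenericNondeterministicReducer with one work-item per preserved
// coefficient.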
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
    return false;
  }
};
}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP