28 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP 29 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP 31 namespace TensorSycl {
// NOTE(review): the struct declarations these template headers introduce are
// not visible in this listing; only the headers, two typedefs and one return
// statement survive. The first parameter list matches the way
// OpDefiner<OpType, CoeffReturnType, Index, bool> is instantiated further
// down in this file -- confirm against the full header.
34 template <
typename Op,
typename CoeffReturnType,
typename Index,
bool Vectorizable>
// Packet type selected by Vectorise for CoeffReturnType on Eigen::SyclDevice,
// parameterised on the Vectorizable flag.
36 typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType
PacketReturnType;
// Two-parameter variant (its header has no Op / Vectorizable parameters).
46 template <
typename CoeffReturnType,
typename Index>
// Combines the accumulator with `scale` (converted to CoeffReturnType) through
// quotient_op and returns the result.
56 return quotient_op(accumulator, CoeffReturnType(scale));
// Second two-parameter variant; its packet typedef fixes the Vectorise flag to true.
60 template <
typename CoeffReturnType,
typename Index>
62 typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType
PacketReturnType;
// Kernel functor fragment; the symbol index at the end of this listing names
// its constructor SecondStepFullReducer.
// NOTE(review): the struct header, the last template parameter, the
// operator() header and the loop that defines `offset` are missing here.
74 template <
typename CoeffReturnType,
typename OpType,
typename InputAccessor,
typename OutputAccessor,
typename Index,
// Read-write accessor to work-group local memory holding CoeffReturnType values.
77 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
// Constructor initialiser list: keeps the three accessors and stores the
// reducer returned by OpDef::get_op(op_).
86 : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::
get_op(op_)) {}
// Each work-item seeds its accumulator with the input element at its local id.
96 const Index localid = itemID.get_local_id(0);
97 auto aInPtr = aI.get_pointer() + localid;
98 auto aOutPtr = outAcc.get_pointer();
99 CoeffReturnType *scratchptr = scratch.get_pointer();
100 CoeffReturnType accumulator = *aInPtr;
// Publish the finalised value to the local scratch slot of this work-item.
102 scratchptr[localid] = op.finalize(accumulator);
// Work-group barrier with a local-memory fence before scratch is read back.
104 itemID.barrier(cl::sycl::access::fence_space::local_space);
// Fold the scratch entry `offset` slots away into this work-item's accumulator
// and write the finalised value back to its own slot.
106 op.reduce(scratchptr[localid +
offset], &accumulator);
107 scratchptr[localid] = op.finalize(accumulator);
// Only the work-item with local id 0 writes the result, to the first output element.
110 if (localid == 0) *aOutPtr = op.finalize(accumulator);
// Kernel functor fragment; the symbol index at the end of this listing names
// its constructor FullReductionKernelFunctor. It carries two
// compute_reduction overloads whose template default is the flag
// (ReducerTraits::PacketAccess & InputPacketAccess).
// NOTE(review): the struct header, the enable_if return types, the loop
// headers that define `offset`, the guards around the final write and all
// closing braces are missing from this listing.
116 template <
typename Evaluator,
typename OpType,
typename Evaluator::Index local_range>
// OpDefiner instantiation whose last argument is the packet-access flag.
121 typedef OpDefiner<OpType,
typename Evaluator::CoeffReturnType,
Index,
122 (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
// Type selection on the same flag (the two alternatives are not visible on this line).
129 typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
// Read-write accessor to work-group local memory holding OutType values.
131 typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
// Constructor tail: stores the scratch accessor, evaluator, output pointer and
// element count, plus the reducer returned by OpDef::get_op(op_).
140 Index rng_, OpType
op_)
141 : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(
OpDef::
get_op(op_)) {}
// Kernel entry point: forwards the work-item to compute_reduction.
143 void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
// ---- Packet variant of compute_reduction ----
145 template <
bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
147 const cl::sycl::nd_item<1> &itemID) {
148 auto output_ptr = final_output.get_pointer();
// rng rounded down to a multiple of PacketSize (integer division).
149 Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
150 Index globalid = itemID.get_global_id(0);
151 Index localid = itemID.get_local_id(0);
// Each work-item handles one packet per pass and then jumps ahead by one
// packet for every work-item in the global range.
152 Index
step = Evaluator::PacketSize * itemID.get_global_range(0);
153 Index start = Evaluator::PacketSize * globalid;
155 PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
// Packet loop over [start, VectorizedRange); packets are loaded with packet<Unaligned>.
156 for (Index
i = start;
i < VectorizedRange;
i +=
step) {
157 op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(
i), &packetAccumulator);
// Shift the id past the packet-covered region so the next loop visits the
// remaining coefficients in [VectorizedRange, rng) one at a time.
159 globalid += VectorizedRange;
161 for (Index
i = globalid;
i <
rng;
i += itemID.get_global_range(0)) {
// NOTE(review): the middle of this call is missing from the listing; the
// visible arguments are coeff(i) and op.initialize().
162 op.template reducePacket<PacketReturnType>(
164 evaluator.impl().coeff(
i), op.initialize()),
// Finalise the packet accumulator, pass it through OpDef::finalise_op together
// with rng, and publish it to this work-item's scratch slot.
167 scratch[localid] = packetAccumulator =
168 OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
// Work-group barrier with a local-memory fence before scratch is read back.
172 itemID.barrier(cl::sycl::access::fence_space::local_space);
// Fold the scratch entry `offset` slots away into this work-item's packet
// accumulator and write the finalised packet back to its own slot.
174 op.template reducePacket<PacketReturnType>(scratch[localid +
offset], &packetAccumulator);
175 scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
// One value per work-group: the packet accumulator is collapsed through
// finalizeBoth and stored at the index of this work-group.
179 output_ptr[itemID.get_group(0)] =
180 op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
// ---- Scalar variant of compute_reduction ----
184 template <
bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
186 const cl::sycl::nd_item<1> &itemID) {
187 auto output_ptr = final_output.get_pointer();
188 Index globalid = itemID.get_global_id(0);
189 Index localid = itemID.get_local_id(0);
191 CoeffReturnType accumulator = op.initialize();
// Each work-item visits coefficients globalid, globalid + global range, ... below rng.
193 for (Index
i = globalid;
i <
rng;
i += itemID.get_global_range(0)) {
194 op.reduce(evaluator.impl().coeff(
i), &accumulator);
// Finalise, pass through OpDef::finalise_op together with rng, and publish to scratch.
196 scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator),
rng);
// Work-group barrier with a local-memory fence before scratch is read back.
201 itemID.barrier(cl::sycl::access::fence_space::local_space);
// Fold the scratch entry `offset` slots away into this work-item's accumulator.
203 op.reduce(scratch[localid +
offset], &accumulator);
204 scratch[localid] = op.finalize(accumulator);
// One value per work-group, stored at the index of this work-group.
208 output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
// Kernel functor fragment; the symbol index at the end of this listing names
// its constructor GenericNondeterministicReducer.
// NOTE(review): the struct header, the start of the constructor signature and
// the operator() header are missing from this listing.
213 template <
typename Evaluator,
typename OpType>
// Constructor template; per the symbol index its first argument is an unnamed Scratch.
221 template <
typename Scratch>
// Constructor tail: stores the evaluator, the output pointer, the reducer
// returned by OpDef::get_op(functor_) and the number of values per output.
223 Index range_, Index num_values_to_reduce_)
224 : evaluator(evaluator_),
225 output_accessor(output_accessor_),
226 functor(OpDef::
get_op(functor_)),
228 num_values_to_reduce(num_values_to_reduce_) {}
// Kernel body: one output element per work-item, indexed by the global linear id.
231 auto output_accessor_ptr = output_accessor.get_pointer();
233 Index globalid =
static_cast<Index
>(itemID.get_global_linear_id());
// Only work-items whose id is below `range` produce an output.
234 if (globalid <
range) {
235 CoeffReturnType accum = functor.initialize();
// NOTE(review): the callee name is missing from the listing; it receives the
// evaluator, the first input index for this output (evaluator.firstInput),
// the reducer and a pointer to the accumulator.
237 evaluator, evaluator.firstInput(globalid), functor, &accum);
// Store the finalised accumulator after OpDef::finalise_op is applied with
// num_values_to_reduce.
238 output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
// Kernel functor fragment; the symbol index at the end of this listing names
// its constructor PartialReductionKernel. Work-items are addressed by a
// "preserve" coordinate (P) and a "reduce" coordinate (R).
// NOTE(review): the struct header, the heads of the conditional expressions
// that pick the P/R decomposition (presumably keyed on `rt`), the
// operator() header and all closing braces are missing from this listing.
252 template <
typename Evaluator,
typename OpType,
typename PannelParameters, reduction_dim rt>
// Read-write accessor to work-group local memory holding CoeffReturnType values.
259 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
// Constructor tail: stores the evaluator, output pointer, the group counts in
// each direction and the preserved / reduced coefficient counts.
271 const Index preserve_elements_num_groups_,
const Index reduce_elements_num_groups_,
272 const Index num_coeffs_to_preserve_,
const Index num_coeffs_to_reduce_)
274 evaluator(evaluator_),
275 output_accessor(output_accessor_),
277 preserve_elements_num_groups(preserve_elements_num_groups_),
278 reduce_elements_num_groups(reduce_elements_num_groups_),
279 num_coeffs_to_preserve(num_coeffs_to_preserve_),
280 num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
// Tail of the element_wise_reduce signature (name taken from the symbol
// index): accumulates this work-item's share of the reduced dimension.
283 CoeffReturnType &accumulator) {
// Guard for P ids past the preserved extent; the guarded statement is not visible.
284 if (globalPId >= num_coeffs_to_preserve) {
// Second alternative of a conditional whose head is missing from the listing.
288 : globalRId + (globalPId * num_coeffs_to_reduce);
289 Index localOffset = globalRId;
// Stride along R between successive visits of the same work-item:
// LocalThreadSizeR times the number of reduce groups.
291 const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
292 const Index per_thread_global_stride =
// Walk the reduced dimension from globalRId in steps of per_thread_local_stride,
// reducing the coefficient at global_offset and advancing both offsets.
294 for (Index
i = globalRId;
i < num_coeffs_to_reduce;
i += per_thread_local_stride) {
295 op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
296 localOffset += per_thread_local_stride;
297 global_offset += per_thread_global_stride;
// Kernel body: split the linear local id and the group id into P and R parts.
// Only the second alternative of each conditional is visible below.
301 const Index linearLocalThreadId = itemID.get_local_id(0);
303 : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
305 : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
307 : itemID.get_group(0) / reduce_elements_num_groups;
309 : itemID.get_group(0) % reduce_elements_num_groups;
311 Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
312 const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
313 auto scratchPtr = scratch.get_pointer().get();
// With more than one reduce group, each R group writes to its own slab of
// num_coeffs_to_preserve outputs; otherwise the output base is used directly.
315 output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
316 CoeffReturnType accumulator = op.initialize();
318 element_wise_reduce(globalRId, globalPId, accumulator);
// Finalise the per-work-item result and apply OpDef::finalise_op with the reduced extent.
320 accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
// Scratch is indexed as [R][P] with a row pitch of LocalThreadSizeP + BC.
// NOTE(review): BC looks like padding against local-memory bank conflicts -- confirm.
321 scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
// Re-derive the local ids with P as the remainder and R as the quotient of
// the linear local id, and recompute globalPId accordingly.
324 pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
325 rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
326 globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
331 auto out_scratch_ptr =
332 scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
// Work-group barrier with a local-memory fence before scratch is read back.
333 itemID.barrier(cl::sycl::access::fence_space::local_space);
335 accumulator = *out_scratch_ptr;
// Halving loop over R: at each step, work-items with rLocalThreadId < offset
// fold in the scratch entry `offset` rows further down and store the result.
339 for (Index
offset = PannelParameters::LocalThreadSizeR >> 1;
offset > 0;
offset >>= 1) {
340 if (rLocalThreadId <
offset) {
341 op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) *
offset], &accumulator);
344 *out_scratch_ptr = op.finalize(accumulator);
// Barrier so the stores of this step are visible before the next step reads them.
351 itemID.barrier(cl::sycl::access::fence_space::local_space);
// The work-item with R-local id 0 writes the result, provided its P id is in range.
354 if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
355 outPtr[globalPId] = op.finalize(accumulator);
// Kernel functor fragment; the symbol index at the end of this listing names
// its constructor SecondStepPartialReduction. Each work-item produces one
// output by reducing num_coeffs_to_reduce input values that lie
// num_coeffs_to_preserve elements apart.
// NOTE(review): the struct header, the start of the constructor signature and
// the operator() header are missing from this listing.
360 template <
typename OutScalar,
typename Index,
typename InputAccessor,
typename OutputAccessor,
typename OpType>
// Read-write accessor to work-group local memory holding OutScalar values.
364 typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
// Constructor tail: stores both accessors and the two extents.
373 OutputAccessor output_accessor_, OpType
op_,
374 const Index num_coeffs_to_preserve_,
375 const Index num_coeffs_to_reduce_)
376 : input_accessor(input_accessor_),
377 output_accessor(output_accessor_),
379 num_coeffs_to_preserve(num_coeffs_to_preserve_),
380 num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
383 const Index globalId = itemID.get_global_id(0);
// Work-items past the preserved extent have nothing to do.
385 if (globalId >= num_coeffs_to_preserve)
return;
// Start at this work-item's own element of the input.
387 auto in_ptr = input_accessor.get_pointer() + globalId;
389 OutScalar accumulator = op.initialize();
// Visit num_coeffs_to_reduce values, stepping by num_coeffs_to_preserve each time.
391 for (Index
i = 0;
i < num_coeffs_to_reduce;
i++) {
392 op.reduce(*in_ptr, &accumulator);
393 in_ptr += num_coeffs_to_preserve;
// Write the finalised result to this work-item's output slot.
395 output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
// NOTE(review): only this template header is visible. The symbol index at the
// end of this listing instantiates ReductionPannel<Index,
// EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>, which
// matches this parameter list; LTP / LTR presumably feed the
// LocalThreadSizeP / LocalThreadSizeR constants and BC_ the BC constant used
// by the partial reduction code -- confirm against the full header.
399 template <
typename Index, Index LTP, Index LTR,
bool BC_>
// Host-side launcher fragment for the partial reduction kernels
// (referred to elsewhere in this file as PartialReducerLauncher<Self, Op, rt>).
// NOTE(review): the struct header, its typedefs, the `else` between the two
// launch paths and the closing braces are missing from this listing.
406 template <
typename Self,
typename Op, TensorSycl::
internal::reduction_dim rt>
// run(): picks the launch geometry, then launches either a single kernel
// writing straight to `output` or two kernels going through a temporary buffer.
417 static bool run(
const Self &
self,
const Op &reducer,
const Eigen::SyclDevice &dev, EvaluatorPointerType output,
418 Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
// Preserved extent rounded up via roundUp to LocalThreadSizeP.
419 Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
// x & (x - 1) is zero only for powers of two (and zero); the work-group size
// LocalThreadSizeP * LocalThreadSizeR must satisfy that.
424 static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
425 (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
426 "The Local thread size must be a power of 2 for the reduction " 429 EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
// The reduced dimension is split across several groups only when it exceeds
// reductionPerThread * localRange elements.
437 const Index reductionPerThread = 64;
// Number of multiprocessors reported by the device, passed through getPowerOfTwo.
438 Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(),
true);
439 const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
// Ceiling division of cu by the number of P groups.
440 Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
// Number of groups along R: capped at localRange, or 1 for short reductions.
441 const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ?
std::min(rGroups, localRange) : 1;
442 const Index globalRange = pNumGroups * rNumGroups * localRange;
// Scratch element count: LocalThreadSizeR rows of (LocalThreadSizeP + BC) entries.
445 PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
446 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
// Two-step path: each R group writes num_coeffs_to_preserve partial results
// into a temporary buffer, which a second kernel then reduces into `output`.
447 if (rNumGroups > 1) {
448 CoeffReturnType *temp_pointer =
static_cast<CoeffReturnType *
>(
449 dev.allocate_temp(num_coeffs_to_preserve * rNumGroups *
sizeof(CoeffReturnType)));
450 EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
451 dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
452 self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
453 num_coeffs_to_reduce);
456 SecondStepPartialReductionKernel;
// Second kernel: covers pNumGroups * localRange work-items and reduces
// rNumGroups partial values per preserved coefficient; Index(1) is passed
// where the first launch passes scratchSize.
458 dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
459 temp_accessor, output,
460 cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
Index(1),
461 reducer, num_coeffs_to_preserve, rNumGroups);
// NOTE(review): the buffer is allocated through `dev` but released through
// self.device(); presumably both refer to the same device -- confirm.
463 self.device().deallocate_temp(temp_pointer);
// Single-step path: the kernel writes directly to `output`.
465 dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
466 self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
467 num_coeffs_to_reduce);
// Host-side launcher fragment for the full (all-elements) reduction.
// NOTE(review): the struct header, its typedefs, the `else` between the two
// launch paths, the tails of two launch calls and the closing braces are
// missing from this listing.
477 template <
typename Self,
typename Op,
bool Vectorizable>
// run(): reduces every input element into the location given by `data`.
483 static void run(
const Self &
self, Op &reducer,
const Eigen::SyclDevice &dev, EvaluatorPointerType
data) {
// x & (x - 1) is zero only for powers of two (and zero); the work-group size
// EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 must satisfy that.
485 static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
486 (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
487 "The Local thread size must be a power of 2 for the reduction " 489 EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
// Total number of input coefficients.
491 typename Self::Index inputSize =
self.impl().dimensions().TotalSize();
// Target amount of work per work-item used to size the launch.
495 const Index reductionPerThread = 2048;
// Ceiling of inputSize / (reductionPerThread * local_range), passed through getPowerOfTwo.
497 Index reductionGroup = dev.getPowerOfTwo(
498 (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range),
true);
// Never more work-groups than local_range, so the second step below can run
// as a single work-group of num_work_group items.
499 const Index num_work_group =
std::min(reductionGroup, local_range);
503 const Index global_range = num_work_group * local_range;
505 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
// Two-step path: one partial result per work-group goes to a temporary
// buffer, which a second kernel reduces into `data`.
507 if (num_work_group > 1) {
508 CoeffReturnType *temp_pointer =
509 static_cast<CoeffReturnType *
>(dev.allocate_temp(num_work_group *
sizeof(CoeffReturnType)));
510 typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
511 dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(
self, tmp_global_accessor, thread_range,
512 local_range, inputSize, reducer);
// Tail of the typedef that names the second-step kernel type (GenericRKernel).
515 EvaluatorPointerType,
Index, local_range>
// Second kernel: a single work-group of num_work_group items
// (global size == local size == num_work_group).
517 dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
518 tmp_global_accessor,
data,
519 cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
522 dev.deallocate_temp(temp_pointer);
// Single-step path: the kernel writes directly to `data`.
524 dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(
self,
data, thread_range, local_range, inputSize,
// Reducer entry point for reduction_dim::outer_most.
// NOTE(review): the struct header and the last parameter of run()
// (num_coeffs_to_preserve, which the body uses) are missing from this listing.
531 template <
typename Self,
typename Op>
// run(): forwards all arguments unchanged to PartialReducerLauncher
// instantiated with reduction_dim::outer_most and returns its result.
535 static bool run(
const Self &
self,
const Op &reducer,
const Eigen::SyclDevice &dev,
536 typename Self::EvaluatorPointerType output,
typename Self::Index num_coeffs_to_reduce,
538 return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
539 Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>
::run(
self, reducer, dev, output,
540 num_coeffs_to_reduce,
541 num_coeffs_to_preserve);
// Reducer entry point for reduction_dim::inner_most.
// NOTE(review): the struct header and the last parameter of run()
// (num_coeffs_to_preserve, which the body uses) are missing from this listing.
545 template <
typename Self,
typename Op>
// run(): forwards all arguments unchanged to PartialReducerLauncher
// instantiated with reduction_dim::inner_most and returns its result.
549 static bool run(
const Self &
self,
const Op &reducer,
const Eigen::SyclDevice &dev,
550 typename Self::EvaluatorPointerType output,
typename Self::Index num_coeffs_to_reduce,
552 return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
553 Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>
::run(
self, reducer, dev, output,
554 num_coeffs_to_reduce,
555 num_coeffs_to_preserve);
// GenericReducer specialisation for Eigen::SyclDevice.
// NOTE(review): the last parameter of run() (num_coeffs_to_preserve), the
// declarations of tileSize / range / GRange, the kernel type argument of the
// launch and the closing braces are missing from this listing.
562 template <
typename Self,
typename Op>
563 struct GenericReducer<Self, Op,
Eigen::SyclDevice> {
// run(): launches one kernel sized for num_coeffs_to_preserve outputs.
565 static bool run(
const Self &
self,
const Op &reducer,
const Eigen::SyclDevice &dev,
566 typename Self::EvaluatorPointerType output,
typename Self::Index num_values_to_reduce,
// The device fills in tileSize, range and GRange for num_coeffs_to_preserve items.
569 dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
571 dev.template unary_kernel_launcher<
typename Self::CoeffReturnType,
// Global size GRange, work-group size tileSize; Index(1) is passed where the
// other launchers in this file pass their scratch size.
573 self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
Index(1),
574 reducer,
// A zero num_values_to_reduce is replaced by 1 before it reaches the kernel.
// NOTE(review): presumably to keep a later division by this count well
// defined for empty reductions -- confirm.
range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
582 #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP Eigen::internal::SumReducer< CoeffReturnType > type
GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, Index range_, Index num_values_to_reduce_)
def step(data, isam, result, truth, currPoseIndex)
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_coeffs_to_preserve)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator, const Index &scale)
#define EIGEN_STRONG_INLINE
Evaluator::EvaluatorPointerType EvaluatorPointerType
Operator implementation generator.
Vectorise< CoeffReturnType, Eigen::SyclDevice, Vectorizable >::PacketReturnType PacketReturnType
Evaluator::CoeffReturnType CoeffReturnType
Evaluator::CoeffReturnType CoeffReturnType
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &self, typename Self::Index firstIndex, Op &reducer, typename Self::CoeffReturnType *accum)
EvaluatorPointerType output_accessor
cl::sycl::accessor< OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > ScratchAccessor
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve)
OpDefiner< OpType, CoeffReturnType, Index, true > OpDef
EvaluatorPointerType output_accessor
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer< CoeffReturnType > &)
cl::sycl::accessor< OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
void operator()(cl::sycl::nd_item< 1 > itemID)
Self::EvaluatorPointerType EvaluatorPointerType
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Eigen::internal::SumReducer< CoeffReturnType > type
Namespace containing all symbols from the Eigen library.
const Index num_coeffs_to_reduce
OpDefiner< OpType, CoeffReturnType, Index, false > OpDef
Self::CoeffReturnType CoeffReturnType
void operator()(cl::sycl::nd_item< 1 > itemID)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, const Index &scale)
Evaluator::CoeffReturnType CoeffReturnType
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_coeffs_to_preserve)
Self::EvaluatorPointerType EvaluatorPointerType
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op)
void operator()(cl::sycl::nd_item< 1 > itemID)
typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess &Evaluator::InputPacketAccess), PacketReturnType, CoeffReturnType >::type OutType
OpDefiner< OpType, OutScalar, Index, false > OpDef
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, Index num_coeffs_to_reduce, Index num_coeffs_to_preserve)
Self::CoeffReturnType CoeffReturnType
ReductionPannel< typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true > PannelParameters
EvaluatorPointerType final_output
PartialReductionKernel< Self, Op, PannelParameters, rt > SyclReducerKerneType
const Index num_coeffs_to_preserve
FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, Index rng_, OpType op_)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer< CoeffReturnType > &)
Vectorise< CoeffReturnType, Eigen::SyclDevice, true >::PacketReturnType PacketReturnType
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
const Index preserve_elements_num_groups
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, const Index &)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
InputAccessor input_accessor
static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data)
Evaluator::EvaluatorPointerType EvaluatorPointerType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, CoeffReturnType &accumulator)
#define EIGEN_DEVICE_FUNC
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, OutputAccessor output_accessor_, OpType op_, const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ::Eigen::internal::enable_if<!Vect >::type compute_reduction(const cl::sycl::nd_item< 1 > &itemID)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ::Eigen::internal::enable_if< Vect >::type compute_reduction(const cl::sycl::nd_item< 1 > &itemID)
Evaluator::PacketReturnType PacketReturnType
Double_ range(const Point2_ &p, const Point2_ &q)
Evaluator::EvaluatorPointerType EvaluatorPointerType
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics scale
const Index num_coeffs_to_reduce
static EIGEN_DEVICE_FUNC PacketReturnType convert_to_packet_type(Scalar in, Scalar)
OutputAccessor output_accessor
Generic expression where a coefficient-wise unary operator is applied to an expression.
const Index num_coeffs_to_preserve
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > ScratchAcc
const Index reduce_elements_num_groups
OpDefiner< OpType, CoeffReturnType, Index, false > OpDef
#define EIGEN_UNROLL_LOOP
Index num_values_to_reduce
PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)