TensorReductionSycl.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Mehdi Goli Codeplay Software Ltd.
5 // Ralph Potter Codeplay Software Ltd.
6 // Luke Iwanski Codeplay Software Ltd.
7 // Contact: <eigen@codeplay.com>
8 //
9 // This Source Code Form is subject to the terms of the Mozilla
10 // Public License v. 2.0. If a copy of the MPL was not distributed
11 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
12 
13 /*****************************************************************
14  * TensorReductionSycl.h
15  *
16  * \brief:
17  * This is the specialization of the reduction operation. Two phase reduction approach
18  * is used since the GPU does not have Global Synchronization for global memory among
19  * different work-group/thread block. To solve the problem, we need to create two kernels
20  * to reduce the data, where the first kernel reduce the data locally and each local
21  * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
22  * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element.
23  * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
24  * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
25  *
26  *****************************************************************/
27 
28 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
29 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
30 namespace Eigen {
31 namespace TensorSycl {
32 namespace internal {
33 
34 template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
35 struct OpDefiner {
36  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
37  typedef Op type;
38  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
39 
41  const Index &) {
42  return accumulator;
43  }
44 };
45 
46 template <typename CoeffReturnType, typename Index>
47 struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
50  return type();
51  }
52 
53  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
54  const Index &scale) {
56  return quotient_op(accumulator, CoeffReturnType(scale));
57  }
58 };
59 
60 template <typename CoeffReturnType, typename Index>
61 struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
62  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
65  return type();
66  }
67 
69  const Index &scale) {
70  return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
71  }
72 };
73 
74 template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
75  Index local_range>
76 struct SecondStepFullReducer {
77  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
80  typedef typename OpDef::type Op;
82  InputAccessor aI;
83  OutputAccessor outAcc;
84  Op op;
85  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
86  : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
87 
88  void operator()(cl::sycl::nd_item<1> itemID) {
89  // Our empirical research shows that the best performance will be achieved
90  // when there is only one element per thread to reduce in the second step.
91  // in this step the second step reduction time is almost negligible.
92  // Hence, in the second step of reduction the input size is fixed to the
93  // local size, thus, there is only one element read per thread. The
94  // algorithm must be changed if the number of reduce per thread in the
95  // second step is greater than 1. Otherwise, the result will be wrong.
96  const Index localid = itemID.get_local_id(0);
97  auto aInPtr = aI.get_pointer() + localid;
98  auto aOutPtr = outAcc.get_pointer();
99  CoeffReturnType *scratchptr = scratch.get_pointer();
100  CoeffReturnType accumulator = *aInPtr;
101 
102  scratchptr[localid] = op.finalize(accumulator);
103  for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
104  itemID.barrier(cl::sycl::access::fence_space::local_space);
105  if (localid < offset) {
106  op.reduce(scratchptr[localid + offset], &accumulator);
107  scratchptr[localid] = op.finalize(accumulator);
108  }
109  }
110  if (localid == 0) *aOutPtr = op.finalize(accumulator);
111  }
112 };
113 
114 // Full reduction first phase. In this version the vectorization is true and the reduction accept
115 // any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ).
116 template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
118  public:
119  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
120  typedef typename Evaluator::Index Index;
121  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
122  (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
124 
125  typedef typename OpDef::type Op;
126  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
127  typedef typename Evaluator::PacketReturnType PacketReturnType;
128  typedef
129  typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
131  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
134  Evaluator evaluator;
136  Index rng;
137  Op op;
138 
139  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
140  Index rng_, OpType op_)
141  : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
142 
143  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
144 
145  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
147  const cl::sycl::nd_item<1> &itemID) {
148  auto output_ptr = final_output.get_pointer();
149  Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
150  Index globalid = itemID.get_global_id(0);
151  Index localid = itemID.get_local_id(0);
152  Index step = Evaluator::PacketSize * itemID.get_global_range(0);
153  Index start = Evaluator::PacketSize * globalid;
154  // vectorizable parts
155  PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
156  for (Index i = start; i < VectorizedRange; i += step) {
157  op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
158  }
159  globalid += VectorizedRange;
160  // non vectorizable parts
161  for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
162  op.template reducePacket<PacketReturnType>(
164  evaluator.impl().coeff(i), op.initialize()),
165  &packetAccumulator);
166  }
167  scratch[localid] = packetAccumulator =
168  OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
169  // reduction parts // Local size is always power of 2
171  for (Index offset = local_range / 2; offset > 0; offset /= 2) {
172  itemID.barrier(cl::sycl::access::fence_space::local_space);
173  if (localid < offset) {
174  op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
175  scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
176  }
177  }
178  if (localid == 0) {
179  output_ptr[itemID.get_group(0)] =
180  op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
181  }
182  }
183 
184  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
186  const cl::sycl::nd_item<1> &itemID) {
187  auto output_ptr = final_output.get_pointer();
188  Index globalid = itemID.get_global_id(0);
189  Index localid = itemID.get_local_id(0);
190  // vectorizable parts
191  CoeffReturnType accumulator = op.initialize();
192  // non vectorizable parts
193  for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
194  op.reduce(evaluator.impl().coeff(i), &accumulator);
195  }
196  scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
197 
198  // reduction parts. the local size is always power of 2
200  for (Index offset = local_range / 2; offset > 0; offset /= 2) {
201  itemID.barrier(cl::sycl::access::fence_space::local_space);
202  if (localid < offset) {
203  op.reduce(scratch[localid + offset], &accumulator);
204  scratch[localid] = op.finalize(accumulator);
205  }
206  }
207  if (localid == 0) {
208  output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
209  }
210  }
211 };
212 
213 template <typename Evaluator, typename OpType>
214 class GenericNondeterministicReducer {
215  public:
216  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
217  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
218  typedef typename Evaluator::Index Index;
219  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
220  typedef typename OpDef::type Op;
221  template <typename Scratch>
222  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
223  Index range_, Index num_values_to_reduce_)
224  : evaluator(evaluator_),
225  output_accessor(output_accessor_),
226  functor(OpDef::get_op(functor_)),
227  range(range_),
228  num_values_to_reduce(num_values_to_reduce_) {}
229 
230  void operator()(cl::sycl::nd_item<1> itemID) {
231  auto output_accessor_ptr = output_accessor.get_pointer();
233  Index globalid = static_cast<Index>(itemID.get_global_linear_id());
234  if (globalid < range) {
235  CoeffReturnType accum = functor.initialize();
237  evaluator, evaluator.firstInput(globalid), functor, &accum);
238  output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
239  }
240  }
241 
242  private:
243  Evaluator evaluator;
245  Op functor;
246  Index range;
248 };
249 
250 enum class reduction_dim { inner_most, outer_most };
251 // default is preserver
252 template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
254  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
255  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
256  typedef typename Evaluator::Index Index;
258  typedef typename OpDef::type Op;
259  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
260  ScratchAcc;
262  Evaluator evaluator;
264  Op op;
269 
270  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
271  const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
272  const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
273  : scratch(scratch_),
274  evaluator(evaluator_),
275  output_accessor(output_accessor_),
276  op(OpDef::get_op(op_)),
277  preserve_elements_num_groups(preserve_elements_num_groups_),
278  reduce_elements_num_groups(reduce_elements_num_groups_),
279  num_coeffs_to_preserve(num_coeffs_to_preserve_),
280  num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
281 
283  CoeffReturnType &accumulator) {
284  if (globalPId >= num_coeffs_to_preserve) {
285  return;
286  }
287  Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
288  : globalRId + (globalPId * num_coeffs_to_reduce);
289  Index localOffset = globalRId;
290 
291  const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
292  const Index per_thread_global_stride =
293  rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
294  for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
295  op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
296  localOffset += per_thread_local_stride;
297  global_offset += per_thread_global_stride;
298  }
299  }
300  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
301  const Index linearLocalThreadId = itemID.get_local_id(0);
302  Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
303  : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
304  Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
305  : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
306  const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
307  : itemID.get_group(0) / reduce_elements_num_groups;
308  const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
309  : itemID.get_group(0) % reduce_elements_num_groups;
310 
311  Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
312  const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
313  auto scratchPtr = scratch.get_pointer().get();
314  auto outPtr =
315  output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
316  CoeffReturnType accumulator = op.initialize();
317 
318  element_wise_reduce(globalRId, globalPId, accumulator);
319 
320  accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
321  scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
322  accumulator;
323  if (rt == reduction_dim::inner_most) {
324  pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
325  rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
326  globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
327  }
328 
329  /* Apply the reduction operation between the current local
330  * id and the one on the other half of the vector. */
331  auto out_scratch_ptr =
332  scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
333  itemID.barrier(cl::sycl::access::fence_space::local_space);
334  if (rt == reduction_dim::inner_most) {
335  accumulator = *out_scratch_ptr;
336  }
337  // The Local LocalThreadSizeR is always power of 2
339  for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
340  if (rLocalThreadId < offset) {
341  op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
342  // The result has already been divided for mean reducer in the
343  // previous reduction so no need to divide furthermore
344  *out_scratch_ptr = op.finalize(accumulator);
345  }
346  /* All threads collectively read from global memory into local.
347  * The barrier ensures all threads' IO is resolved before
348  * execution continues (strictly speaking, all threads within
349  * a single work-group - there is no co-ordination between
350  * work-groups, only work-items). */
351  itemID.barrier(cl::sycl::access::fence_space::local_space);
352  }
353 
354  if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
355  outPtr[globalPId] = op.finalize(accumulator);
356  }
357  }
358 };
359 
360 template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
361 struct SecondStepPartialReduction {
362  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
363  typedef typename OpDef::type Op;
364  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
366  InputAccessor input_accessor;
367  OutputAccessor output_accessor;
368  Op op;
371 
373  OutputAccessor output_accessor_, OpType op_,
374  const Index num_coeffs_to_preserve_,
375  const Index num_coeffs_to_reduce_)
376  : input_accessor(input_accessor_),
377  output_accessor(output_accessor_),
378  op(OpDef::get_op(op_)),
379  num_coeffs_to_preserve(num_coeffs_to_preserve_),
380  num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
381 
382  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
383  const Index globalId = itemID.get_global_id(0);
384 
385  if (globalId >= num_coeffs_to_preserve) return;
386 
387  auto in_ptr = input_accessor.get_pointer() + globalId;
388 
389  OutScalar accumulator = op.initialize();
390 // num_coeffs_to_reduce is not bigger that 256
391  for (Index i = 0; i < num_coeffs_to_reduce; i++) {
392  op.reduce(*in_ptr, &accumulator);
393  in_ptr += num_coeffs_to_preserve;
394  }
395  output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
396  }
397 }; // namespace internal
398 
399 template <typename Index, Index LTP, Index LTR, bool BC_>
403  static EIGEN_CONSTEXPR bool BC = BC_;
404 };
405 
406 template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
408  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
409  typedef typename Self::CoeffReturnType CoeffReturnType;
410  typedef typename Self::Storage Storage;
411  typedef typename Self::Index Index;
414 
416 
417  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
418  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
419  Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
420 
421  // getPowerOfTwo makes sure local range is power of 2 and <=
422  // maxSyclThreadPerBlock this will help us to avoid extra check on the
423  // kernel
426  "The Local thread size must be a power of 2 for the reduction "
427  "operation");
428 
430  // In this step, we force the code not to be more than 2-step reduction:
431  // Our empirical research shows that if each thread reduces at least 64
432  // elemnts individually, we get better performance. However, this can change
433  // on different platforms. In this step we force the code not to be
434  // morthan step reduction: Our empirical research shows that for inner_most
435  // dim reducer, it is better to have 8 group in a reduce dimension for sizes
436  // > 1024 to achieve the best performance.
437  const Index reductionPerThread = 64;
438  Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
439  const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
440  Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
441  const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
442  const Index globalRange = pNumGroups * rNumGroups * localRange;
443 
444  EIGEN_CONSTEXPR Index scratchSize =
446  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
447  if (rNumGroups > 1) {
448  CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
449  dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
450  EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
451  dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
452  self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
453  num_coeffs_to_reduce);
454 
456  SecondStepPartialReductionKernel;
457 
458  dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
459  temp_accessor, output,
460  cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
461  reducer, num_coeffs_to_preserve, rNumGroups);
462 
463  self.device().deallocate_temp(temp_pointer);
464  } else {
465  dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
466  self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
467  num_coeffs_to_reduce);
468  }
469  return false;
470  }
471 };
472 } // namespace internal
473 } // namespace TensorSycl
474 
475 namespace internal {
476 
477 template <typename Self, typename Op, bool Vectorizable>
478 struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
479  typedef typename Self::CoeffReturnType CoeffReturnType;
480  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
481  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
482  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
483  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
485  static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
486  (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
487  "The Local thread size must be a power of 2 for the reduction "
488  "operation");
489  EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
490 
491  typename Self::Index inputSize = self.impl().dimensions().TotalSize();
492  // In this step we force the code not to be more than 2-step reduction:
493  // Our empirical research shows that if each thread reduces at least 512
494  // elemnts individually, we get better performance.
495  const Index reductionPerThread = 2048;
496  // const Index num_work_group =
497  Index reductionGroup = dev.getPowerOfTwo(
498  (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
499  const Index num_work_group = std::min(reductionGroup, local_range);
500  // 1
501  // ? local_range
502  // : 1);
503  const Index global_range = num_work_group * local_range;
504 
505  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
507  if (num_work_group > 1) {
508  CoeffReturnType *temp_pointer =
509  static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
510  typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
511  dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
512  local_range, inputSize, reducer);
513 
515  EvaluatorPointerType, Index, local_range>
516  GenericRKernel;
517  dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
518  tmp_global_accessor, data,
519  cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
520  reducer);
521 
522  dev.deallocate_temp(temp_pointer);
523  } else {
524  dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
525  reducer);
526  }
527  }
528 };
529 // vectorizable inner_most most dim preserver
530 // col reduction
531 template <typename Self, typename Op>
532 struct OuterReducer<Self, Op, Eigen::SyclDevice> {
533  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
534 
535  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
536  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
537  typename Self::Index num_coeffs_to_preserve) {
538  return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
539  Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
540  num_coeffs_to_reduce,
541  num_coeffs_to_preserve);
542  }
543 };
544 // row reduction
545 template <typename Self, typename Op>
546 struct InnerReducer<Self, Op, Eigen::SyclDevice> {
547  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
548 
549  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
550  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
551  typename Self::Index num_coeffs_to_preserve) {
552  return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
553  Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
554  num_coeffs_to_reduce,
555  num_coeffs_to_preserve);
556  }
557 };
558 
559 // ArmgMax uses this kernel for partial reduction//
560 // TODO(@mehdi.goli) come up with a better kernel
561 // generic partial reduction
562 template <typename Self, typename Op>
563 struct GenericReducer<Self, Op, Eigen::SyclDevice> {
564  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
565  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
566  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
567  typename Self::Index num_coeffs_to_preserve) {
568  typename Self::Index range, GRange, tileSize;
569  dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
570 
571  dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
573  self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
574  reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
575  return false;
576  }
577 };
578 
579 } // namespace internal
580 } // namespace Eigen
581 
582 #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
Eigen::TensorSycl::internal::reduction_dim::outer_most
@ outer_most
Eigen::TensorSycl::internal::SecondStepFullReducer::OpDef
OpDefiner< OpType, CoeffReturnType, Index, true > OpDef
Definition: TensorReductionSycl.h:118
Eigen::TensorSycl::internal::SecondStepPartialReduction::output_accessor
OutputAccessor output_accessor
Definition: TensorReductionSycl.h:406
Eigen::TensorSycl::internal::FullReductionKernelFunctor::OutType
typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess &Evaluator::InputPacketAccess), PacketReturnType, CoeffReturnType >::type OutType
Definition: TensorReductionSycl.h:169
Eigen::TensorSycl::internal::PartialReductionKernel::Op
OpDef::type Op
Definition: TensorReductionSycl.h:297
Eigen::TensorSycl::internal::PartialReductionKernel::preserve_elements_num_groups
const Index preserve_elements_num_groups
Definition: TensorReductionSycl.h:304
EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
Eigen
Namespace containing all symbols from the Eigen library.
Definition: jet.h:637
Eigen::TensorSycl::internal::SecondStepPartialReduction::num_coeffs_to_preserve
const Index num_coeffs_to_preserve
Definition: TensorReductionSycl.h:408
Eigen::TensorSycl::internal::SecondStepPartialReduction::operator()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
Definition: TensorReductionSycl.h:421
Eigen::TensorSycl::internal::PartialReductionKernel::operator()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
Definition: TensorReductionSycl.h:339
Eigen::TensorSycl::internal::OpDefiner
Definition: TensorReductionSycl.h:74
Eigen::TensorSycl::internal::PartialReducerLauncher::run
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, Index num_coeffs_to_reduce, Index num_coeffs_to_preserve)
Definition: TensorReductionSycl.h:456
Eigen::TensorSycl::internal::PacketWrapper
Definition: InteropHeaders.h:132
Eigen::TensorSycl::internal::PartialReductionKernel::num_coeffs_to_preserve
const Index num_coeffs_to_preserve
Definition: TensorReductionSycl.h:306
Eigen::TensorSycl::internal::SecondStepPartialReduction::num_coeffs_to_reduce
const Index num_coeffs_to_reduce
Definition: TensorReductionSycl.h:409
Eigen::TensorSycl::internal::SecondStepPartialReduction::ScratchAccessor
cl::sycl::accessor< OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > ScratchAccessor
Definition: TensorReductionSycl.h:404
gtsam.examples.DogLegOptimizerExample.type
type
Definition: DogLegOptimizerExample.py:111
Eigen::internal::scalar_quotient_op
Definition: BinaryFunctors.h:378
Eigen::TensorSycl::internal::FullReductionKernelFunctor::operator()
void operator()(cl::sycl::nd_item< 1 > itemID)
Definition: TensorReductionSycl.h:182
Eigen::internal::SumReducer
Definition: TensorFunctors.h:68
Eigen::TensorSycl::internal::SecondStepFullReducer::operator()
void operator()(cl::sycl::nd_item< 1 > itemID)
Definition: TensorReductionSycl.h:127
Eigen::internal::conditional::type
Then type
Definition: Meta.h:109
Eigen::TensorSycl::internal::FullReductionKernelFunctor::FullReductionKernelFunctor
FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, Index rng_, OpType op_)
Definition: TensorReductionSycl.h:178
Eigen::TensorSycl::internal::PartialReducerLauncher::SyclReducerKerneType
PartialReductionKernel< Self, Op, PannelParameters, rt > SyclReducerKerneType
Definition: TensorReductionSycl.h:454
Eigen::TensorSycl::internal::FullReductionKernelFunctor::LocalAccessor
cl::sycl::accessor< OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition: TensorReductionSycl.h:171
EIGEN_CONSTEXPR
#define EIGEN_CONSTEXPR
Definition: Macros.h:787
Eigen::TensorSycl::internal::SecondStepPartialReduction::op
Op op
Definition: TensorReductionSycl.h:407
type
Definition: pytypes.h:1525
Eigen::internal::pdiv
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:244
Eigen::TensorSycl::internal::FullReductionKernelFunctor::op
Op op
Definition: TensorReductionSycl.h:176
Eigen::TensorSycl::internal::FullReductionKernelFunctor::PacketReturnType
Evaluator::PacketReturnType PacketReturnType
Definition: TensorReductionSycl.h:166
Eigen::TensorSycl::internal::GenericNondeterministicReducer::range
Index range
Definition: TensorReductionSycl.h:285
Eigen::TensorSycl::internal::FullReductionKernelFunctor::EvaluatorPointerType
Evaluator::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:165
op_
Operator implementation generator.
Definition: attr.h:174
Eigen::TensorSycl::internal::PartialReducerLauncher
Definition: TensorReductionSycl.h:446
Eigen::TensorSycl::internal::PartialReductionKernel
Definition: TensorReductionSycl.h:292
Eigen::TensorSycl::internal::SecondStepFullReducer::SecondStepFullReducer
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
Definition: TensorReductionSycl.h:124
Eigen::TensorSycl::internal::GenericNondeterministicReducer::output_accessor
EvaluatorPointerType output_accessor
Definition: TensorReductionSycl.h:283
Eigen::TensorSycl::internal::PartialReductionKernel::scratch
ScratchAcc scratch
Definition: TensorReductionSycl.h:300
Eigen::TensorSycl::internal::PartialReductionKernel::reduce_elements_num_groups
const Index reduce_elements_num_groups
Definition: TensorReductionSycl.h:305
Eigen::TensorSycl::internal::SecondStepPartialReduction::Op
OpDef::type Op
Definition: TensorReductionSycl.h:402
Eigen::TensorSycl::internal::GenericNondeterministicReducer::Index
Evaluator::Index Index
Definition: TensorReductionSycl.h:257
gtsam::range
Double_ range(const Point2_ &p, const Point2_ &q)
Definition: slam/expressions.h:30
Eigen::internal::FullReducer< Self, Op, Eigen::SyclDevice, Vectorizable >::EvaluatorPointerType
Self::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:493
Eigen::TensorSycl::internal::ReductionPannel::BC
static EIGEN_CONSTEXPR bool BC
Definition: TensorReductionSycl.h:442
Eigen::TensorSycl::internal::PartialReductionKernel::element_wise_reduce
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, CoeffReturnType &accumulator)
Definition: TensorReductionSycl.h:321
Eigen::TensorSycl::internal::GenericNondeterministicReducer::EvaluatorPointerType
Evaluator::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:256
Eigen::TensorSycl::internal::PartialReductionKernel::EvaluatorPointerType
Evaluator::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:294
data
int data[]
Definition: Map_placement_new.cpp:1
Eigen::TensorSycl::internal::PartialReductionKernel::num_coeffs_to_reduce
const Index num_coeffs_to_reduce
Definition: TensorReductionSycl.h:307
Eigen::TensorSycl::internal::FullReductionKernelFunctor::evaluator
Evaluator evaluator
Definition: TensorReductionSycl.h:173
scale
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics scale
Definition: gnuplot_common_settings.hh:54
Eigen::TensorSycl::internal::PartialReductionKernel::ScratchAcc
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > ScratchAcc
Definition: TensorReductionSycl.h:299
Eigen::TensorSycl::internal::FullReductionKernelFunctor::final_output
EvaluatorPointerType final_output
Definition: TensorReductionSycl.h:174
Eigen::TensorSycl::internal::SecondStepFullReducer::scratch
LocalAccessor scratch
Definition: TensorReductionSycl.h:120
Eigen::TensorSycl::internal::FullReductionKernelFunctor::Op
OpDef::type Op
Definition: TensorReductionSycl.h:164
Eigen::TensorSycl::internal::PartialReductionKernel::CoeffReturnType
Evaluator::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:293
EIGEN_STRONG_INLINE
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
Eigen::TensorSycl::internal::SecondStepPartialReduction::SecondStepPartialReduction
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, OutputAccessor output_accessor_, OpType op_, const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
Definition: TensorReductionSycl.h:411
EIGEN_UNROLL_LOOP
#define EIGEN_UNROLL_LOOP
Definition: Macros.h:1461
Eigen::internal::InnerReducer::HasOptimizedImplementation
static const bool HasOptimizedImplementation
Definition: TensorReduction.h:396
gtsam::utils.visual_isam.step
def step(data, isam, result, truth, currPoseIndex, isamArgs=())
Definition: visual_isam.py:82
Eigen::internal::InnerReducer
Definition: TensorReduction.h:395
Eigen::TensorSycl::internal::reduction_dim
reduction_dim
Definition: TensorReductionSycl.h:289
Eigen::TensorSycl::internal::FullReductionKernelFunctor::Index
Evaluator::Index Index
Definition: TensorReductionSycl.h:159
Eigen::TensorSycl::internal::GenericNondeterministicReducer::operator()
void operator()(cl::sycl::nd_item< 1 > itemID)
Definition: TensorReductionSycl.h:269
Eigen::TensorSycl::internal::SecondStepFullReducer
Definition: TensorReductionSycl.h:115
gtsam.examples.DogLegOptimizerExample.run
def run(args)
Definition: DogLegOptimizerExample.py:21
Eigen::TensorSycl::internal::ReductionPannel::LocalThreadSizeP
static EIGEN_CONSTEXPR Index LocalThreadSizeP
Definition: TensorReductionSycl.h:440
Eigen::TensorSycl::internal::SecondStepFullReducer::op
Op op
Definition: TensorReductionSycl.h:123
Eigen::TensorSycl::internal::SecondStepFullReducer::outAcc
OutputAccessor outAcc
Definition: TensorReductionSycl.h:122
Eigen::TensorSycl::internal::GenericNondeterministicReducer
Definition: TensorReductionSycl.h:253
Eigen::TensorSycl::internal::PartialReductionKernel::Index
Evaluator::Index Index
Definition: TensorReductionSycl.h:295
Eigen::TensorSycl::internal::PartialReducerLauncher::Index
Self::Index Index
Definition: TensorReductionSycl.h:450
Eigen::TensorSycl::internal::GenericNondeterministicReducer::Op
OpDef::type Op
Definition: TensorReductionSycl.h:259
Eigen::TensorSycl::internal::PartialReducerLauncher::PannelParameters
ReductionPannel< typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true > PannelParameters
Definition: TensorReductionSycl.h:452
Eigen::TensorSycl::internal::PartialReductionKernel::output_accessor
EvaluatorPointerType output_accessor
Definition: TensorReductionSycl.h:302
Eigen::TensorSycl::internal::OpDefiner::get_op
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op)
Definition: TensorReductionSycl.h:90
offset
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Definition: gnuplot_common_settings.hh:64
Eigen::TensorSycl::internal::SecondStepFullReducer::LocalAccessor
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition: TensorReductionSycl.h:117
Eigen::internal::OuterReducer::HasOptimizedImplementation
static const bool HasOptimizedImplementation
Definition: TensorReduction.h:407
Eigen::TensorSycl::internal::FullReductionKernelFunctor::compute_reduction
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ::Eigen::internal::enable_if< Vect >::type compute_reduction(const cl::sycl::nd_item< 1 > &itemID)
Definition: TensorReductionSycl.h:185
Eigen::TensorSycl::internal::reduction_dim::inner_most
@ inner_most
Eigen::TensorSycl::internal::PartialReductionKernel::PartialReductionKernel
PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
Definition: TensorReductionSycl.h:309
Eigen::TensorSycl::internal::FullReductionKernelFunctor::rng
Index rng
Definition: TensorReductionSycl.h:175
Eigen::internal::FullReducer< Self, Op, Eigen::SyclDevice, Vectorizable >::CoeffReturnType
Self::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:492
Eigen::TensorSycl::internal::SecondStepFullReducer::Op
OpDef::type Op
Definition: TensorReductionSycl.h:119
Eigen::internal::FullReducer::run
static EIGEN_DEVICE_FUNC void run(const Self &self, Op &reducer, const Device &, typename Self::EvaluatorPointerType output)
Definition: TensorReduction.h:314
Eigen::TensorSycl::internal::PartialReductionKernel::evaluator
Evaluator evaluator
Definition: TensorReductionSycl.h:301
Eigen::TensorSycl::internal::FullReductionKernelFunctor
Definition: TensorReductionSycl.h:156
Eigen::TensorSycl::internal::PartialReductionKernel::op
Op op
Definition: TensorReductionSycl.h:303
Eigen::TensorSycl::internal::ReductionPannel::LocalThreadSizeR
static EIGEN_CONSTEXPR Index LocalThreadSizeR
Definition: TensorReductionSycl.h:441
Eigen::internal::MeanReducer
Definition: TensorFunctors.h:111
Eigen::TensorSycl::internal::GenericNondeterministicReducer::OpDef
OpDefiner< OpType, CoeffReturnType, Index, false > OpDef
Definition: TensorReductionSycl.h:258
min
#define min(a, b)
Definition: datatypes.h:19
Eigen::TensorSycl::internal::OpDefiner::PacketReturnType
Vectorise< CoeffReturnType, Eigen::SyclDevice, Vectorizable >::PacketReturnType PacketReturnType
Definition: TensorReductionSycl.h:88
Eigen::TensorSycl::internal::SecondStepFullReducer::aI
InputAccessor aI
Definition: TensorReductionSycl.h:121
Eigen::TensorSycl::internal::PartialReducerLauncher::EvaluatorPointerType
Self::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:447
Eigen::TensorSycl::internal::GenericNondeterministicReducer::CoeffReturnType
Evaluator::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:255
Eigen::TensorSycl::internal::GenericNondeterministicReducer::num_values_to_reduce
Index num_values_to_reduce
Definition: TensorReductionSycl.h:286
Eigen::TensorSycl::internal::OpDefiner::finalise_op
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, const Index &)
Definition: TensorReductionSycl.h:92
internal
Definition: BandTriangularSolver.h:13
Eigen::TensorSycl::internal::FullReductionKernelFunctor::CoeffReturnType
Evaluator::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:158
Eigen::TensorSycl::internal::GenericNondeterministicReducer::GenericNondeterministicReducer
GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, Index range_, Index num_values_to_reduce_)
Definition: TensorReductionSycl.h:261
Eigen::TensorSycl::internal::GenericNondeterministicReducer::evaluator
Evaluator evaluator
Definition: TensorReductionSycl.h:282
Eigen::TensorSycl::internal::SecondStepPartialReduction
Definition: TensorReductionSycl.h:400
Eigen::TensorSycl::internal::FullReductionKernelFunctor::scratch
LocalAccessor scratch
Definition: TensorReductionSycl.h:172
Eigen::TensorSycl::internal::PartialReducerLauncher::CoeffReturnType
Self::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:448
Eigen::internal::GenericDimReducer::reduce
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &self, typename Self::Index firstIndex, Op &reducer, typename Self::CoeffReturnType *accum)
Definition: TensorReduction.h:144
Eigen::TensorSycl::internal::PartialReducerLauncher::Storage
Self::Storage Storage
Definition: TensorReductionSycl.h:449
Eigen::TensorSycl::internal::GenericNondeterministicReducer::functor
Op functor
Definition: TensorReductionSycl.h:284
Eigen::TensorSycl::internal::SecondStepPartialReduction::input_accessor
InputAccessor input_accessor
Definition: TensorReductionSycl.h:405
Eigen::TensorSycl::internal::SecondStepPartialReduction::OpDef
OpDefiner< OpType, OutScalar, Index, false > OpDef
Definition: TensorReductionSycl.h:401
Eigen::internal::FullReducer::HasOptimizedImplementation
static const bool HasOptimizedImplementation
Definition: TensorReduction.h:312
Eigen::TensorSycl::internal::ReductionPannel
Definition: TensorReductionSycl.h:439
i
int i
Definition: BiCGSTAB_step_by_step.cpp:9
Eigen::internal::OuterReducer::run
static EIGEN_DEVICE_FUNC bool run(const Self &, Op &, const Device &, typename Self::CoeffReturnType *, typename Self::Index, typename Self::Index)
Definition: TensorReduction.h:409
Eigen::TensorSycl::internal::PartialReductionKernel::OpDef
OpDefiner< OpType, CoeffReturnType, Index, false > OpDef
Definition: TensorReductionSycl.h:296
Eigen::internal::InnerReducer::run
static EIGEN_DEVICE_FUNC bool run(const Self &, Op &, const Device &, typename Self::CoeffReturnType *, typename Self::Index, typename Self::Index)
Definition: TensorReduction.h:398
Eigen::TensorSycl::internal::OpDefiner::type
Op type
Definition: TensorReductionSycl.h:89
Eigen::Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74


gtsam
Author(s):
autogenerated on Wed Jan 22 2025 04:05:37