TensorReduction.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 // Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
12 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
13 
14 // clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
15 // so we'll use a macro to make clang happy.
16 #ifndef KERNEL_FRIEND
17 #if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
18 #define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
19 #else
20 #define KERNEL_FRIEND friend
21 #endif
22 #endif
23 
24 
25 namespace Eigen {
26 
27 
35 namespace internal {
36  template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ >
37  struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
38  : traits<XprType>
39 {
41  typedef typename XprTraits::Scalar Scalar;
42  typedef typename XprTraits::StorageKind StorageKind;
43  typedef typename XprTraits::Index Index;
44  typedef typename XprType::Nested Nested;
45  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
46  static const int Layout = XprTraits::Layout;
47  typedef typename XprTraits::PointerType PointerType;
48 
49  template <class T> struct MakePointer {
50  // Intermediate typedef to workaround MSVC issue.
51  typedef MakePointer_<T> MakePointerT;
52  typedef typename MakePointerT::Type Type;
53  };
54 };
55 
56 template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
57 struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
58 {
60 };
61 
62 template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
63 struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
64 {
66 };
67 
68 
69 template <typename OutputDims> struct DimInitializer {
70  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
71  static void run(const InputDims& input_dims,
72  const array<bool, internal::array_size<InputDims>::value>& reduced,
73  OutputDims* output_dims, ReducedDims* reduced_dims) {
74  const int NumInputDims = internal::array_size<InputDims>::value;
75  int outputIndex = 0;
76  int reduceIndex = 0;
77  for (int i = 0; i < NumInputDims; ++i) {
78  if (reduced[i]) {
79  (*reduced_dims)[reduceIndex] = input_dims[i];
80  ++reduceIndex;
81  } else {
82  (*output_dims)[outputIndex] = input_dims[i];
83  ++outputIndex;
84  }
85  }
86  }
87 };
88 
89 template <> struct DimInitializer<Sizes<> > {
90  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
91  static void run(const InputDims& input_dims, const array<bool, Rank>&,
92  Sizes<>*, array<Index, Rank>* reduced_dims) {
93  const int NumInputDims = internal::array_size<InputDims>::value;
94  for (int i = 0; i < NumInputDims; ++i) {
95  (*reduced_dims)[i] = input_dims[i];
96  }
97  }
98 };
99 
100 
101 template <typename ReducedDims, int NumTensorDims, int Layout>
103  static const bool value = false;
104 };
105 template <typename ReducedDims, int NumTensorDims, int Layout>
107  static const bool value = false;
108 };
109 
110 #if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
111 template <typename ReducedDims, int NumTensorDims>
112 struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
113  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
114  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
115  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
116  static const bool value = tmp1 & tmp2 & tmp3;
117 };
118 template <typename ReducedDims, int NumTensorDims>
119 struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
120  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
121  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
122  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
123  static const bool value = tmp1 & tmp2 & tmp3;
124 
125 };
126 template <typename ReducedDims, int NumTensorDims>
127 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
128  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
129  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
130  static const bool value = tmp1 & tmp2;
131 
132 };
133 template <typename ReducedDims, int NumTensorDims>
134 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
135  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
136  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
137  static const bool value = tmp1 & tmp2;
138 };
139 #endif
140 
141 
142 template <int DimIndex, typename Self, typename Op>
144  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
145  EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
146  for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
147  const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
148  GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
149  }
150  }
151 };
152 template <typename Self, typename Op>
153 struct GenericDimReducer<0, Self, Op> {
154  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
155  for (int j = 0; j < self.m_reducedDims[0]; ++j) {
156  const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
157  reducer.reduce(self.m_impl.coeff(input), accum);
158  }
159  }
160 };
161 template <typename Self, typename Op>
162 struct GenericDimReducer<-1, Self, Op> {
163  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
164  reducer.reduce(self.m_impl.coeff(index), accum);
165  }
166 };
167 
168 template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
169  bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
170  !Self::ReducerTraits::IsExactlyAssociative)>
172  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
173  typename Self::CoeffReturnType accum = reducer.initialize();
174  for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
175  reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
176  }
177  return reducer.finalize(accum);
178  }
179 };
180 
181 template <typename Self, typename Op>
182 struct InnerMostDimReducer<Self, Op, true, false> {
183  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
185  const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
186  typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
187  for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
188  reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
189  }
190  typename Self::CoeffReturnType accum = reducer.initialize();
191  for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
192  reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
193  }
194  return reducer.finalizeBoth(accum, paccum);
195  }
196 };
197 
198 #if !defined(EIGEN_HIPCC)
199 static const int kLeafSize = 1024;
200 
201 template <typename Self, typename Op>
202 struct InnerMostDimReducer<Self, Op, false, true> {
203  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
204  reduce(const Self& self, typename Self::Index firstIndex,
205  typename Self::Index numValuesToReduce, Op& reducer) {
206  typename Self::CoeffReturnType accum = reducer.initialize();
207  if (numValuesToReduce > kLeafSize) {
208  const typename Self::Index half = numValuesToReduce / 2;
209  reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
210  reducer.reduce(
211  reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
212  &accum);
213  } else {
214  for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
215  reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
216  }
217  }
218  return reducer.finalize(accum);
219  }
220 };
221 
222 template <typename Self, typename Op>
223 struct InnerMostDimReducer<Self, Op, true, true> {
224  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
225  reduce(const Self& self, typename Self::Index firstIndex,
226  typename Self::Index numValuesToReduce, Op& reducer) {
227  const typename Self::Index packetSize =
229  typename Self::CoeffReturnType accum = reducer.initialize();
230  if (numValuesToReduce > packetSize * kLeafSize) {
231  // Make sure the split point is aligned on a packet boundary.
232  const typename Self::Index split =
233  packetSize *
234  divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
235  packetSize);
236  const typename Self::Index num_left =
237  numext::mini(split - firstIndex, numValuesToReduce);
238  reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
239  if (num_left < numValuesToReduce) {
240  reducer.reduce(
241  reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
242  }
243  return reducer.finalize(accum);
244  } else {
245  const typename Self::Index UnrollSize =
246  (numValuesToReduce / (2*packetSize)) * 2*packetSize;
247  const typename Self::Index VectorizedSize =
248  (numValuesToReduce / packetSize) * packetSize;
249  typename Self::PacketReturnType paccum =
250  reducer.template initializePacket<typename Self::PacketReturnType>();
251  typename Self::PacketReturnType paccum2 =
252  reducer.template initializePacket<typename Self::PacketReturnType>();
253  for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
254  reducer.reducePacket(
255  self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
256  reducer.reducePacket(
257  self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
258  &paccum2);
259  }
260  for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
261  reducer.reducePacket(self.m_impl.template packet<Unaligned>(
262  firstIndex + j), &paccum);
263  }
264  reducer.reducePacket(paccum2, &paccum);
265  for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
266  ++j) {
267  reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
268  }
269  return reducer.finalizeBoth(accum, paccum);
270  }
271  }
272 };
273 #endif
274 
275 template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
277  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
278  eigen_assert(false && "should never be called");
279  }
280 };
281 
282 template <int DimIndex, typename Self, typename Op>
283 struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
284  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
285  EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
286  for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
287  const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
288  InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
289  }
290  }
291 };
292 
293 template <typename Self, typename Op>
294 struct InnerMostDimPreserver<0, Self, Op, true> {
295  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
296  for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
297  const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
298  reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
299  }
300  }
301 };
302 template <typename Self, typename Op>
303 struct InnerMostDimPreserver<-1, Self, Op, true> {
304  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
305  eigen_assert(false && "should never be called");
306  }
307 };
308 
309 // Default full reducer
310 template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
311 struct FullReducer {
312  static const bool HasOptimizedImplementation = false;
313 
314  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
315  const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
316  *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
317  }
318 };
319 
320 
321 #ifdef EIGEN_USE_THREADS
322 // Multithreaded full reducers
323 template <typename Self, typename Op,
324  bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
325 struct FullReducerShard {
326  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
327  typename Self::Index numValuesToReduce, Op& reducer,
328  typename Self::CoeffReturnType* output) {
330  self, firstIndex, numValuesToReduce, reducer);
331  }
332 };
333 
334 // Multithreaded full reducer
335 template <typename Self, typename Op, bool Vectorizable>
337  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
338  static const Index PacketSize =
340 
341  // launch one reducer per thread and accumulate the result.
342  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
343  typename Self::CoeffReturnType* output) {
344  typedef typename Self::Index Index;
345  const Index num_coeffs = array_prod(self.m_impl.dimensions());
346  if (num_coeffs == 0) {
347  *output = reducer.finalize(reducer.initialize());
348  return;
349  }
350  const TensorOpCost cost =
351  self.m_impl.costPerCoeff(Vectorizable) +
353  PacketSize);
354  const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
355  num_coeffs, cost, device.numThreads());
356  if (num_threads == 1) {
357  *output =
358  InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
359  return;
360  }
361  const Index blocksize =
362  std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
363  const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
364  eigen_assert(num_coeffs >= numblocks * blocksize);
365 
366  Barrier barrier(internal::convert_index<unsigned int>(numblocks));
367  MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
368  for (Index i = 0; i < numblocks; ++i) {
369  device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
370  self, i * blocksize, blocksize, reducer,
371  &shards[i]);
372  }
373  typename Self::CoeffReturnType finalShard;
374  if (numblocks * blocksize < num_coeffs) {
376  self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
377  reducer);
378  } else {
379  finalShard = reducer.initialize();
380  }
381  barrier.Wait();
382 
383  for (Index i = 0; i < numblocks; ++i) {
384  reducer.reduce(shards[i], &finalShard);
385  }
386  *output = reducer.finalize(finalShard);
387  }
388 };
389 
390 #endif
391 
392 
393 // Default inner reducer
394 template <typename Self, typename Op, typename Device>
395 struct InnerReducer {
396  static const bool HasOptimizedImplementation = false;
397 
398  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
399  eigen_assert(false && "Not implemented");
400  return true;
401  }
402 };
403 
404 // Default outer reducer
405 template <typename Self, typename Op, typename Device>
406 struct OuterReducer {
407  static const bool HasOptimizedImplementation = false;
408 
409  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
410  eigen_assert(false && "Not implemented");
411  return true;
412  }
413 };
414 
415 #ifdef EIGEN_USE_SYCL
416 // Default Generic reducer
417 template <typename Self, typename Op, typename Device>
418 struct GenericReducer {
419  static const bool HasOptimizedImplementation = false;
420 
421  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
422  eigen_assert(false && "Not implemented");
423  return true;
424  }
425 };
426 #endif
427 
428 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
429 template <int B, int N, typename S, typename R, typename I_>
430 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
431 
432 
433 #if defined(EIGEN_HAS_GPU_FP16)
434 template <typename S, typename R, typename I_>
435 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
436 template <int B, int N, typename S, typename R, typename I_>
437 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
438 template <int NPT, typename S, typename R, typename I_>
439 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
440 
441 #endif
442 
443 template <int NPT, typename S, typename R, typename I_>
444 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
445 
446 template <int NPT, typename S, typename R, typename I_>
447 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
448 #endif
449 
458 template <typename Op, typename CoeffReturnType>
460 #if defined(EIGEN_USE_SYCL)
462 #else
464 #endif
465 };
466 
467 } // end namespace internal
468 
469 
470 template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
471 class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
472  public:
479 
481  TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
482  { }
484  TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
485  { }
486 
488  const XprType& expression() const { return m_expr; }
490  const Dims& dims() const { return m_dims; }
492  const Op& reducer() const { return m_reducer; }
493 
494  protected:
495  typename XprType::Nested m_expr;
496  const Dims m_dims;
497  const Op m_reducer;
498 };
499 
500 template<typename ArgType, typename Device>
502 
503 // Eval as rvalue
504 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
505 struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
506 {
508  typedef Dims ReducedDims;
510  typedef typename XprType::Index Index;
511  typedef ArgType ChildType;
513  static const int NumInputDims = internal::array_size<InputDimensions>::value;
514  static const int NumReducedDims = internal::array_size<Dims>::value;
515  static const int NumOutputDims = NumInputDims - NumReducedDims;
517  typedef typename XprType::Scalar Scalar;
519  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
522  static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;
523 
527 
528  // Subset of strides of the input tensor for the non-reduced dimensions.
529  // Indexed by output dimensions.
530  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
531 
532  enum {
533  IsAligned = false,
534  PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
535  BlockAccess = false,
536  PreferBlockAccess = true,
538  CoordAccess = false, // to be implemented
539  RawAccess = false
540  };
541 
543 
544  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
546  //===--------------------------------------------------------------------===//
547 
548  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
550  static const bool RunningFullReduction = (NumOutputDims==0);
551 
552  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
553  : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
554  {
555  EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
556  EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
557  YOU_MADE_A_PROGRAMMING_MISTAKE);
558 
559  // Build the bitmap indicating if an input dimension is reduced or not.
560  for (int i = 0; i < NumInputDims; ++i) {
561  m_reduced[i] = false;
562  }
563  for (int i = 0; i < NumReducedDims; ++i) {
564  eigen_assert(op.dims()[i] >= 0);
565  eigen_assert(op.dims()[i] < NumInputDims);
566  m_reduced[op.dims()[i]] = true;
567  }
568 
569  const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
570  internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
571 
572  // Precompute output strides.
573  if (NumOutputDims > 0) {
574  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
575  m_outputStrides[0] = 1;
576  for (int i = 1; i < NumOutputDims; ++i) {
577  m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
578  m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
579  }
580  } else {
581  m_outputStrides[NumOutputDims - 1] = 1;
582  for (int i = NumOutputDims - 2; i >= 0; --i) {
583  m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
584  m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
585  }
586  }
587  }
588 
589  // Precompute input strides.
590  if (NumInputDims > 0) {
591  array<Index, NumInputDims> input_strides;
592  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
593  input_strides[0] = 1;
594  for (int i = 1; i < NumInputDims; ++i) {
595  input_strides[i] = input_strides[i-1] * input_dims[i-1];
596  }
597  } else {
598  input_strides.back() = 1;
599  for (int i = NumInputDims - 2; i >= 0; --i) {
600  input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
601  }
602  }
603 
604  int outputIndex = 0;
605  int reduceIndex = 0;
606  for (int i = 0; i < NumInputDims; ++i) {
607  if (m_reduced[i]) {
608  m_reducedStrides[reduceIndex] = input_strides[i];
609  ++reduceIndex;
610  } else {
611  m_preservedStrides[outputIndex] = input_strides[i];
612  m_output_to_input_dim_map[outputIndex] = i;
613  ++outputIndex;
614  }
615  }
616  }
617 
618  // Special case for full reductions
619  if (NumOutputDims == 0) {
620  m_preservedStrides[0] = internal::array_prod(input_dims);
621  }
622 
623  m_numValuesToReduce =
624  NumOutputDims == 0
625  ? internal::array_prod(input_dims)
626  : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
627  ? m_preservedStrides[0]
628  : m_preservedStrides[NumOutputDims - 1];
629  }
630 
631  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
632 
634  bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
635  // Use the FullReducer if possible.
636  if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
638  ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
639  !RunningOnGPU))) {
640  bool need_assign = false;
641  if (!data) {
642  m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
643  data = m_result;
644  need_assign = true;
645  }
646  Op reducer(m_reducer);
647  internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
648  return need_assign;
649  }
650 
651  // Attempt to use an optimized reduction.
652  else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
653  bool reducing_inner_dims = true;
654  for (int i = 0; i < NumReducedDims; ++i) {
655  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
656  reducing_inner_dims &= m_reduced[i];
657  } else {
658  reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
659  }
660  }
662  (reducing_inner_dims || ReducingInnerMostDims)) {
663  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
664  const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
665  if (!data) {
666  if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
667  data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
668  m_result = data;
669  }
670  else {
671  return true;
672  }
673  }
674  Op reducer(m_reducer);
675  // For SYCL this if always return false
676  if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
677  if (m_result) {
678  m_device.deallocate_temp(m_result);
679  m_result = NULL;
680  }
681  return true;
682  } else {
683  return (m_result != NULL);
684  }
685  }
686 
687  bool preserving_inner_dims = true;
688  for (int i = 0; i < NumReducedDims; ++i) {
689  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
690  preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
691  } else {
692  preserving_inner_dims &= m_reduced[i];
693  }
694  }
696  preserving_inner_dims) {
697  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
698  const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
699  if (!data) {
700  if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
701  data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
702  m_result = data;
703  }
704  else {
705  return true;
706  }
707  }
708  Op reducer(m_reducer);
709  // For SYCL this if always return false
710  if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
711  if (m_result) {
712  m_device.deallocate_temp(m_result);
713  m_result = NULL;
714  }
715  return true;
716  } else {
717  return (m_result != NULL);
718  }
719  }
720  #if defined(EIGEN_USE_SYCL)
721  // If there is no Optimised version for SYCL, the reduction expression
722  // must break into two subexpression and use the SYCL generic Reducer on the device.
723  if(RunningOnSycl) {
724  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
725  const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
726  if (!data) {
727  data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
728  m_result = data;
729  }
730  Op reducer(m_reducer);
731  internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
732  return (m_result != NULL);
733  }
734  #endif
735  }
736  return true;
737  }
738 
739 #ifdef EIGEN_USE_THREADS
740  template <typename EvalSubExprsCallback>
742  void
743  evalSubExprsIfNeededAsync(EvaluatorPointerType data,
744  EvalSubExprsCallback done) {
745  m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
746  done(evalSubExprsIfNeededCommon(data));
747  });
748  }
749 #endif
750 
752  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
753  m_impl.evalSubExprsIfNeeded(NULL);
754  return evalSubExprsIfNeededCommon(data);
755  }
756 
758  m_impl.cleanup();
759  if (m_result) {
760  m_device.deallocate_temp(m_result);
761  m_result = NULL;
762  }
763  }
764 
765  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
766  {
767  if (( RunningFullReduction || RunningOnGPU) && m_result ) {
768  return *(m_result + index);
769  }
770  Op reducer(m_reducer);
771  if (ReducingInnerMostDims || RunningFullReduction) {
772  const Index num_values_to_reduce =
773  (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
774  return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
775  num_values_to_reduce, reducer);
776  } else {
777  typename Self::CoeffReturnType accum = reducer.initialize();
778  internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
779  return reducer.finalize(accum);
780  }
781  }
782 
783  // TODO(bsteiner): provide a more efficient implementation.
784  template<int LoadMode>
785  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
786  {
787  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
788  eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
789 
790  if (RunningOnGPU && m_result) {
791  return internal::pload<PacketReturnType>(m_result + index);
792  }
793 
795  if (ReducingInnerMostDims) {
796  const Index num_values_to_reduce =
797  (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
798  const Index firstIndex = firstInput(index);
799  for (Index i = 0; i < PacketSize; ++i) {
800  Op reducer(m_reducer);
801  values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
802  num_values_to_reduce, reducer);
803  }
804  } else if (PreservingInnerMostDims) {
805  const Index firstIndex = firstInput(index);
806  const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
807  // TBD: extend this the the n innermost dimensions that we preserve.
808  if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
809  Op reducer(m_reducer);
810  typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
812  return reducer.finalizePacket(accum);
813  } else {
814  for (int i = 0; i < PacketSize; ++i) {
815  values[i] = coeff(index + i);
816  }
817  }
818  } else {
819  for (int i = 0; i < PacketSize; ++i) {
820  values[i] = coeff(index + i);
821  }
822  }
823  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
824  return rslt;
825  }
826 
827  // Must be called after evalSubExprsIfNeeded().
829  if (RunningFullReduction && m_result) {
830  return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
831  } else {
832  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
833  const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
834  return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
835  TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
836  }
837  }
838 
839  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
840  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
841  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
842 #ifdef EIGEN_USE_SYCL
843  // binding placeholder accessors to a command group handler for SYCL
844  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
845  m_impl.bind(cgh);
846  m_result.bind(cgh);
847  }
848 #endif
849 
850  private:
851  template <int, typename, typename> friend struct internal::GenericDimReducer;
852  template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
853  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
854  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
855 #ifdef EIGEN_USE_THREADS
856  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
857 #endif
858 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
859  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
860 #if defined(EIGEN_HAS_GPU_FP16)
861  template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
862  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
863  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
864 #endif
865  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
866 
867  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
868 #endif
869 
870 #if defined(EIGEN_USE_SYCL)
871  template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
872  // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer
873  template <typename, typename, typename> friend struct internal::GenericReducer;
874 #endif
875 
876 
877  template <typename S, typename O, typename D> friend struct internal::InnerReducer;
878 
879  struct BlockIteratorState {
880  Index input_dim;
881  Index output_size;
883  };
884 
885  // Returns the Index in the input tensor of the first value that needs to be
886  // used to compute the reduction at output index "index".
888  if (ReducingInnerMostDims) {
889  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
890  return index * m_preservedStrides[0];
891  } else {
892  return index * m_preservedStrides[NumPreservedStrides - 1];
893  }
894  }
895  // TBD: optimize the case where we preserve the innermost dimensions.
896  Index startInput = 0;
897  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
898  for (int i = NumOutputDims - 1; i > 0; --i) {
899  // This is index_i in the output tensor.
900  const Index idx = index / m_outputStrides[i];
901  startInput += idx * m_preservedStrides[i];
902  index -= idx * m_outputStrides[i];
903  }
904  if (PreservingInnerMostDims) {
905  eigen_assert(m_preservedStrides[0] == 1);
906  startInput += index;
907  } else {
908  startInput += index * m_preservedStrides[0];
909  }
910  } else {
911  for (int i = 0; i < NumOutputDims - 1; ++i) {
912  // This is index_i in the output tensor.
913  const Index idx = index / m_outputStrides[i];
914  startInput += idx * m_preservedStrides[i];
915  index -= idx * m_outputStrides[i];
916  }
917  if (PreservingInnerMostDims) {
918  eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
919  startInput += index;
920  } else {
921  startInput += index * m_preservedStrides[NumPreservedStrides - 1];
922  }
923  }
924  return startInput;
925  }
926 
927  // Bitmap indicating if an input dimension is reduced or not.
929  // Dimensions of the output of the operation.
930  Dimensions m_dimensions;
931  // Precomputed strides for the output tensor.
935  // Map from output to input dimension index.
937  // How many values go into each reduction
939 
940  // Subset of strides of the input tensor for the reduced dimensions.
941  // Indexed by reduced dimensions.
943  // Size of the input dimensions that are reduced.
944  // Indexed by reduced dimensions.
946 
947  // Evaluator for the input expression.
949 
950  // Operation to apply for computing the reduction.
952 
953  // For full reductions
954 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
955  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
956  static const bool RunningOnSycl = false;
957 #elif defined(EIGEN_USE_SYCL)
958 static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
959 static const bool RunningOnGPU = false;
960 #else
961  static const bool RunningOnGPU = false;
962  static const bool RunningOnSycl = false;
963 #endif
964  EvaluatorPointerType m_result;
965 
967 };
968 
969 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
970 struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
971 : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
973  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
974 };
975 
976 
977 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
978 struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
979 : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
980 
982  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
983  // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
984  //Therefore the coeff function should be overridden by for SYCL kernel
985  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
986  return *(this->data() + index);
987  }
988  // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
989  //Therefore the packet function should be overridden by for SYCL kernel
990  template<int LoadMode>
991  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
992  return internal::pload<typename Base::PacketReturnType>(this->data() + index);
993  }
994 };
995 
996 } // end namespace Eigen
997 
998 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
static EIGEN_DEVICE_FUNC void run(const Self &self, Op &reducer, const Device &, typename Self::EvaluatorPointerType output)
SCALAR Scalar
Definition: bench_gemm.cpp:46
#define EIGEN_HIP_LAUNCH_BOUNDS_1024
Definition: Macros.h:510
std::vector< Eigen::Index > Dims
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self::CoeffReturnType reduce(const Self &self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op &reducer)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost &cost_per_coeff, int max_threads)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes< Indices... > &)
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
#define KERNEL_FRIEND
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType & expression() const
Eigen::internal::traits< TensorReductionOp >::Scalar Scalar
TensorReductionEvaluatorBase< const TensorReductionOp< Op, Dims, ArgType, MakePointer_ >, Device > Base
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Base::PacketReturnType packet(typename Base::Index index) const
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &self, typename Self::Index firstIndex, Op &reducer, typename Self::CoeffReturnType *accum)
EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType &op, const Eigen::SyclDevice &device)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType &expr, const Dims &dims)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &, typename Self::Index, Op &, typename Self::PacketReturnType *)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Base::CoeffReturnType coeff(typename Base::Index index) const
Rot2 R(Rot2::fromAngle(0.1))
static EIGEN_DEVICE_FUNC bool run(const Self &, Op &, const Device &, typename Self::CoeffReturnType *, typename Self::Index, typename Self::Index)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self::CoeffReturnType reduce(const Self &self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op &reducer)
leaf::MyValues values
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions & dimensions() const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType &expr, const Dims &dims, const Op &reducer)
Namespace containing all symbols from the Eigen library.
Definition: jet.h:637
internal::ReductionReturnType< Op, typename XprType::CoeffReturnType >::type CoeffReturnType
A cost model used to limit the number of threads used for evaluating tensor expression.
Eigen::NumTraits< Scalar >::Real RealScalar
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T & back()
Definition: EmulateArray.h:39
#define EIGEN_STATIC_ASSERT(CONDITION, MSG)
Definition: StaticAssert.h:127
internal::conditional< NumOutputDims==0, Sizes<>, DSizes< Index, NumOutputDims > >::type Dimensions
#define EIGEN_ALIGN_MAX
static EIGEN_DEVICE_FUNC void run(const InputDims &input_dims, const array< bool, internal::array_size< InputDims >::value > &reduced, OutputDims *output_dims, ReducedDims *reduced_dims)
DiscreteKey S(1, 2)
TensorReductionEvaluatorBase< const TensorReductionOp< Op, Dims, ArgType, MakePointer_ >, Device > Self
void split(const G &g, const PredecessorMap< KEY > &tree, G &Ab1, G &Ab2)
Definition: graph-inl.h:245
remove_const< CoeffReturnType >::type type
Generic expression where a coefficient-wise binary operator is applied to two expressions.
Definition: CwiseBinaryOp.h:77
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self::CoeffReturnType reduce(const Self &self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op &reducer)
internal::remove_const< typename XprType::CoeffReturnType >::type CoeffReturnType
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
#define eigen_assert(x)
Definition: Macros.h:1037
int data[]
static EIGEN_DEVICE_FUNC bool run(const Self &, Op &, const Device &, typename Self::CoeffReturnType *, typename Self::Index, typename Self::Index)
#define NULL
Definition: ccolamd.c:609
EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType &op, const Device &device)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
void Wait()
Definition: Barrier.h:40
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &self, typename Self::Index firstIndex, Op &reducer, typename Self::PacketReturnType *accum)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &self, typename Self::Index index, Op &reducer, typename Self::CoeffReturnType *accum)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims & dims() const
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &, typename Self::Index, Op &, typename Self::PacketReturnType *)
The tensor base class.
Definition: TensorBase.h:973
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
CwiseBinaryOp< internal::scalar_sum_op< double, double >, const CpyMatrixXd, const CpyMatrixXd > XprType
Definition: nestbyvalue.cpp:15
Eigen::internal::traits< TensorReductionOp >::StorageKind StorageKind
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op & reducer() const
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &self, typename Self::Index firstIndex, Op &reducer, typename Self::PacketReturnType *accum)
static EIGEN_DEVICE_FUNC void run(const InputDims &input_dims, const array< bool, Rank > &, Sizes<> *, array< Index, Rank > *reduced_dims)
Eigen::internal::traits< TensorReductionOp >::Index Index
#define EIGEN_DEVICE_REF
Definition: TensorMacros.h:50
static const int kLeafSize
Generic expression where a coefficient-wise unary operator is applied to an expression.
Definition: CwiseUnaryOp.h:55
The MaxSizeVector class.
Definition: MaxSizeVector.h:31
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &self, typename Self::Index firstIndex, Op &reducer, typename Self::CoeffReturnType *accum)
const std::vector< size_t > dimensions
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const
Values initialize(const NonlinearFactorGraph &graph, bool useOdometricPath)
Definition: lago.cpp:375
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T divup(const X x, const Y y)
Definition: TensorMeta.h:30
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self::CoeffReturnType reduce(const Self &self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op &reducer)
std::ptrdiff_t j
Definition: pytypes.h:1370
TensorReductionEvaluatorBase< const TensorReductionOp< Op, Dims, ArgType, MakePointer_ >, Eigen::SyclDevice > Base
Eigen::internal::nested< TensorReductionOp >::type Nested


gtsam
Author(s):
autogenerated on Tue Jul 4 2023 02:37:23