TensorReduction.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

namespace Eigen {

namespace internal {
template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
 : traits<XprType>
{
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::Scalar Scalar;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
  static const int Layout = XprTraits::Layout;

  template <class T> struct MakePointer {
    // Intermediate typedef to workaround MSVC issue.
    typedef MakePointer_<T> MakePointerT;
    typedef typename MakePointerT::Type Type;
  };
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
{
  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
{
  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
};


template <typename OutputDims> struct DimInitializer {
  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims,
                  const array<bool, internal::array_size<InputDims>::value>& reduced,
                  OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};

template <> struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims, const array<bool, Rank>&,
                  Sizes<>*, array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};
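
// Illustration: for an input of dimensions (2, 3, 4) with dimension 1 reduced,
// DimInitializer::run splits the sizes into output_dims = (2, 4) and
// reduced_dims = (3). For a full reduction the output type is Sizes<> and the
// DimInitializer<Sizes<> > specialization above is used instead; it simply
// copies every input size into reduced_dims.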


template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};

#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;

};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;

};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
#endif
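
// Note (a sketch of how these traits are typically triggered): they can only
// evaluate to true when the reduced indices are known at compile time, e.g.
// when the reduction dimensions are passed as a static index list such as
//   Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1> >
// For a column-major rank-4 tensor this list names the two innermost
// dimensions, so are_inner_most_dims<...>::value becomes true and the
// evaluator can take the faster inner-dimension path. With a runtime
// Eigen::array<int, N> of indices both traits remain false.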


template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};
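
// Note: GenericDimReducer unrolls one nested loop per reduced dimension at
// compile time. For example, with two reduced dimensions of sizes
// m_reducedDims = {3, 4}, the entry point GenericDimReducer<1, ...> iterates
// 4 times using m_reducedStrides[1] and delegates each step to
// GenericDimReducer<0, ...>, which feeds the remaining 3 coefficients to the
// reducer. The <-1> specialization appears to cover the degenerate case where
// no dimension is reduced and a single coefficient is accumulated.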

template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
    }
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalizeBoth(accum, p);
  }
};

template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};

template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

// Default full reducer
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducer {
  static const bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
  }
};


#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                                                        typename Self::Index numValuesToReduce, Op& reducer,
                                                        typename Self::CoeffReturnType* output) {
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
        self, firstIndex, numValuesToReduce, reducer);
  }
};

// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;
  static const int PacketSize =
      unpacket_traits<typename Self::PacketReturnType>::size;

  // Launch one reducer per thread and accumulate the result.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    if (num_coeffs == 0) {
      *output = reducer.finalize(reducer.initialize());
      return;
    }
    const TensorOpCost cost =
        self.m_impl.costPerCoeff(Vectorizable) +
        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                     PacketSize);
    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
        num_coeffs, cost, device.numThreads());
    if (num_threads == 1) {
      *output =
          InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
      return;
    }
    const Index blocksize =
        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
                                  self, i * blocksize, blocksize, reducer,
                                  &shards[i]);
    }
    typename Self::CoeffReturnType finalShard;
    if (numblocks * blocksize < num_coeffs) {
      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
          self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
          reducer);
    } else {
      finalShard = reducer.initialize();
    }
    barrier.Wait();

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i], &finalShard);
    }
    *output = reducer.finalize(finalShard);
  }
};

#endif


// Default inner reducer
template <typename Self, typename Op, typename Device>
struct InnerReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

// Default outer reducer
template <typename Self, typename Op, typename Device>
struct OuterReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};


#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);


#ifdef EIGEN_HAS_CUDA_FP16
template <typename S, typename R, typename I>
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);

#endif

template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);

template <int NPT, typename S, typename R, typename I>
__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif

} // end namespace internal


template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
  public:
    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
    { }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
    { }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const XprType& expression() const { return m_expr; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Dims& dims() const { return m_dims; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Op& reducer() const { return m_reducer; }

  protected:
    typename XprType::Nested m_expr;
    const Dims m_dims;
    const Op m_reducer;
};
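
// Usage sketch (assumes the user-facing API from unsupported/Eigen/CXX11/Tensor):
// TensorReductionOp expressions are normally created through the reduction
// methods of TensorBase such as sum() or maximum(), rather than constructed
// directly, e.g.
//
//   Eigen::Tensor<float, 3> t(2, 3, 4);
//   t.setRandom();
//   Eigen::array<int, 1> dims{{1}};
//   Eigen::Tensor<float, 2> partial = t.sum(dims);  // reduce dim 1 -> shape (2, 4)
//   Eigen::Tensor<float, 0> total = t.maximum();    // full reduction -> rank-0 tensor
//
// The dimension list and the reducer functor end up in m_dims and m_reducer,
// and the evaluator below performs the actual reduction.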


// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
{
  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
  typedef typename XprType::Index Index;
  typedef ArgType ChildType;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool RunningFullReduction = (NumOutputDims==0);

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
  {
    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Build the bitmap indicating if an input dimension is reduced or not.
    for (int i = 0; i < NumInputDims; ++i) {
      m_reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      m_reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);

    // Precompute output strides.
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        }
      } else {
        m_outputStrides.back() = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        }
      }
    }

    // Precompute input strides.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i-1] * input_dims[i-1];
        }
      } else {
        input_strides.back() = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (m_reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
    m_impl.evalSubExprsIfNeeded(NULL);

    // Use the FullReducer if possible.
    if ((RunningFullReduction && RunningOnSycl) || (RunningFullReduction &&
        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
         !RunningOnGPU))) {
      bool need_assign = false;
      if (!data) {
        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
        data = m_result;
        need_assign = true;
      }
      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }
    else if (RunningOnSycl) {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
      if (!data) {
        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
        m_result = data;
      }
      Op reducer(m_reducer);
      internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
      return (m_result != NULL);
    }

    // Attempt to use an optimized reduction.
    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
      bool reducing_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          reducing_inner_dims &= m_reduced[i];
        } else {
          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
        }
      }
      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
          (reducing_inner_dims || ReducingInnerMostDims)) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }

      bool preserving_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
        } else {
          preserving_inner_dims &= m_reduced[i];
        }
      }
      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
          preserving_inner_dims) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }
    }
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate(m_result);
      m_result = NULL;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
      return *(m_result + index);
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce =
          (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
                                                             num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }

  // TODO(bsteiner): provide a more efficient implementation.
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));

    if (RunningOnGPU && m_result) {
      return internal::pload<PacketReturnType>(m_result + index);
    }

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
          (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < PacketSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // TBD: extend this to the n innermost dimensions that we preserve.
      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  // Must be called after evalSubExprsIfNeeded().
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    if (RunningFullReduction && m_result) {
      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    } else {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
             TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    }
  }

  EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
  /// required by sycl in order to extract the accessor
  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
  /// added for sycl in order to construct the buffer from the sycl device
  const Device& device() const { return m_device; }
  /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel
  const Dims& xprDims() const { return m_xpr_dims; }


  private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
#endif
  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);

  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif

  template <typename S, typename O, typename D> friend struct internal::InnerReducer;

  // Returns the Index in the input tensor of the first value that needs to be
  // used to compute the reduction at output index "index".
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    // TBD: optimize the case where we preserve the innermost dimensions.
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }

  // Bitmap indicating if an input dimension is reduced or not.
  array<bool, NumInputDims> m_reduced;
  // Dimensions of the output of the operation.
  Dimensions m_dimensions;
  // Precomputed strides for the output tensor.
  array<Index, NumOutputDims> m_outputStrides;
  // Subset of strides of the input tensor for the non-reduced dimensions.
  // Indexed by output dimensions.
  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
  array<Index, NumPreservedStrides> m_preservedStrides;

  // Subset of strides of the input tensor for the reduced dimensions.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  // Size of the input dimensions that are reduced.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedDims;

  // Evaluator for the input expression.
  TensorEvaluator<ArgType, Device> m_impl;

  // Operation to apply for computing the reduction.
  Op m_reducer;

  // For full reductions
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
  static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
  static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
  static const bool RunningOnGPU = false;
#else
  static const bool RunningOnGPU = false;
  static const bool RunningOnSycl = false;
#endif
  typename MakePointer_<CoeffReturnType>::Type m_result;

  const Device& m_device;
  const Dims& m_xpr_dims;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H