TensorScan.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
              typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};
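
// Illustrative usage sketch (not part of the original header): a TensorScanOp is
// normally obtained through the scan helpers declared on TensorBase, such as
// cumsum() and cumprod(). For example, assuming a rank-2 float Tensor:
//
//   Eigen::Tensor<float, 2> t(3, 4);
//   t.setRandom();
//   Eigen::Tensor<float, 2> inclusive = t.cumsum(1);        // running sum along axis 1
//   Eigen::Tensor<float, 2> exclusive = t.cumsum(1, true);  // exclusive variant
//
// With an inclusive sum scan, the input line (1, 2, 3) becomes (1, 3, 6); with the
// exclusive flag set it becomes (0, 1, 3), i.e. each output excludes its own input.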

template <typename Self, typename Reducer, typename Device>
struct ScanLauncher;

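// Note (illustrative, not from the original source): the Reducer type used by
// ScanLauncher is expected to expose the initialize()/reduce()/finalize() interface
// that the launchers below rely on, e.g. a sum-like reducer shaped roughly as:
//
//   struct SumLikeReducer {  // hypothetical name, for illustration only
//     EIGEN_DEVICE_FUNC float initialize() const { return 0.0f; }
//     EIGEN_DEVICE_FUNC void reduce(const float& t, float* accum) const { *accum += t; }
//     EIGEN_DEVICE_FUNC float finalize(const float& accum) const { return accum; }
//   };
//
// Eigen's own internal::SumReducer and internal::ProdReducer follow this pattern.
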
// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;

  enum {
    IsAligned = false,
    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                         const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
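    // (Illustrative example, not in the original source: for a ColMajor tensor of
    // dimensions (2, 3, 4) scanned along axis 1 the stride is 2, and along axis 2
    // it is 2 * 3 = 6.)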
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      for (int i = NumDims - 1; i > op.axis(); --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }

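  // Note (explanatory, not in the original source): following the TensorEvaluator
  // protocol, this returns false when the scan has already been written into the
  // caller-provided buffer `data`, and true when it was evaluated into an internal
  // buffer that the caller should read back via coeff()/packet()/data().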
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
    launcher(*this, m_output);
    return true;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
  {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_output != NULL) {
      m_device.deallocate(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device& m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  CoeffReturnType* m_output;
};

// CPU implementation of scan
// TODO(ibab) This single-threaded implementation should be parallelized,
// at least by running multiple scans at the same time.
template <typename Self, typename Reducer, typename Device>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType *data) {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a
    // scan per remaining entry. The iteration is split into two nested
    // loops to avoid an integer division by keeping track of each idx1 and idx2.
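    // (Illustrative example, not in the original source: for a ColMajor 2x3 tensor
    // scanned along axis 1, stride() == 2 and size() == 3, so the two scan lines
    // start at offsets 0 and 1 and visit elements {0, 2, 4} and {1, 3, 5}.)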
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
        // Calculate the starting offset for the scan
        Index offset = idx1 + idx2;

        // Compute the scan along the axis, starting at the calculated offset
        typename Self::CoeffReturnType accum = self.accumulator().initialize();
        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
          Index curr = offset + idx3 * self.stride();

          if (self.exclusive()) {
            data[curr] = self.accumulator().finalize(accum);
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
          } else {
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
            data[curr] = self.accumulator().finalize(accum);
          }
        }
      }
    }
  }
};

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();
}

template <typename Self, typename Reducer>
struct ScanLauncher<Self, Reducer, GpuDevice> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
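    // Explanatory note (not in the original source): each CUDA thread processes one
    // full scan line; total_size / size() is the number of scan lines, covered here
    // with 64-thread blocks.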
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;
    LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
#endif  // EIGEN_USE_GPU && __CUDACC__

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H