TensorScan.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
              typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};

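// Usage sketch (added comment, not part of the original header): TensorScanOp
// is normally obtained through the scan helpers on TensorBase rather than
// constructed directly. Assuming the Tensor module's cumsum()/cumprod()
// helpers, which build a TensorScanOp with a sum or product reducer:
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 2> t(3, 4);
//   t.setRandom();
//   Eigen::Tensor<float, 2> running_sum  = t.cumsum(0);        // inclusive scan along axis 0
//   Eigen::Tensor<float, 2> shifted_prod = t.cumprod(1, true);  // exclusive scan along axis 1
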
template <typename Self, typename Reducer, typename Device>
struct ScanLauncher;

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;

  enum {
    IsAligned = false,
    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

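  // Added note: the whole scan result is computed up front in
  // evalSubExprsIfNeeded(), either into the caller's buffer or into m_output,
  // which is why RawAccess is true and coeff()/packet() below simply read
  // back from m_output.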
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                        const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      for (int i = NumDims - 1; i > op.axis(); --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }
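
  // Example of the stride computed above (added comment): for a ColMajor
  // tensor of dimensions (d0, d1, d2) scanned along axis 1, m_stride == d0,
  // so consecutive coefficients along the scan axis are d0 entries apart in
  // the flat index space.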

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }


  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
    launcher(*this, m_output);
    return true;
  }
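  // Added note: following the usual TensorEvaluator convention, returning
  // false above means the scan was written directly into the caller-provided
  // buffer, while returning true means the evaluator allocated m_output and
  // results must be read back through coeff()/packet().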

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
  {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_output != NULL) {
      m_device.deallocate(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device& m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  CoeffReturnType* m_output;
};

// CPU implementation of scan
// TODO(ibab) This single-threaded implementation should be parallelized,
// at least by running multiple scans at the same time.
template <typename Self, typename Reducer, typename Device>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType *data) {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a
    // scan per remaining entry. The iteration is split into two nested
    // loops to avoid an integer division by keeping track of each idx1 and idx2.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
        // Calculate the starting offset for the scan
        Index offset = idx1 + idx2;

        // Compute the scan along the axis, starting at the calculated offset
        typename Self::CoeffReturnType accum = self.accumulator().initialize();
        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
          Index curr = offset + idx3 * self.stride();

          if (self.exclusive()) {
            data[curr] = self.accumulator().finalize(accum);
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
          } else {
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
            data[curr] = self.accumulator().finalize(accum);
          }
        }
      }
    }
  }
};
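
// Worked example of the loop structure above (added comment): for a ColMajor
// tensor with dimensions (2, 3) scanned along axis 1, self.stride() == 2 and
// self.size() == 3, so idx1 only takes the value 0, idx2 takes the values 0
// and 1, and each scan visits the flat offsets idx2, idx2 + 2 and idx2 + 4,
// i.e. one independent running reduction per row.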

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();

}
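
// Added note on the launch configuration below: the kernel maps one scan to
// one CUDA thread (indexed by val above), so roughly total_size / self.size()
// threads are needed in total; with a block size of 64 this gives
// (total_size / self.size() + 63) / 64 blocks. Threads whose starting offset
// would run past total_size skip the loop entirely.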

template <typename Self, typename Reducer>
struct ScanLauncher<Self, Reducer, GpuDevice> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;
    LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
#endif  // EIGEN_USE_GPU && __CUDACC__

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H