TensorScan.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
              typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};
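
// Usage sketch: a TensorScanOp is normally obtained through the cumulative
// ops on TensorBase rather than constructed directly; cumsum()/cumprod()
// pair it with a sum/product reducer. A minimal example, assuming the
// unsupported Tensor module header is available:
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 2> t(3, 4);
//   t.setRandom();
//   Eigen::Tensor<float, 2> inclusive = t.cumsum(/*axis=*/0);
//   Eigen::Tensor<float, 2> exclusive = t.cumsum(/*axis=*/0, /*exclusive=*/true);
//   Eigen::Tensor<float, 2> products  = t.cumprod(/*axis=*/1);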


namespace internal {

template <typename Self>
EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  // Compute the scan along the axis, starting at the given offset
  typename Self::CoeffReturnType accum = self.accumulator().initialize();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
}
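
// Worked example of the two orderings above, for a sum reducer whose
// initialize() returns 0, scanning the sequence [1, 2, 3, 4]:
//   inclusive: data = [1, 3, 6, 10]   (reduce first, then store)
//   exclusive: data = [0, 1, 3, 6]    (store first, then reduce)
// The exclusive variant shifts the result by one element and starts from the
// reducer's identity.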

template <typename Self>
EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  using Scalar = typename Self::CoeffReturnType;
  using Packet = typename Self::PacketReturnType;
  // Compute the scan along the axis, starting at the calculated offset
  Packet accum = self.accumulator().template initializePacket<Packet>();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  }
}
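
// Note on the packet path: ReducePacket is only invoked by the vectorized
// ReduceBlock specializations below, which advance the offset in steps of
// PacketSize across the stride dimension. A packet loaded at `curr` therefore
// covers PacketSize neighbouring scan lines, and each SIMD lane carries an
// independent scan as successive iterations step along the axis by stride().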

template <typename Self, bool Vectorize, bool Parallel>
struct ReduceBlock {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};

// Specialization for vectorized reduction.
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index idx2 = 0;
    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
      // Calculate the starting offset for the packet scan
      Index offset = idx1 + idx2;
      ReducePacket(self, offset, data);
    }
    for (; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};

// Single-threaded CPU implementation of scan
template <typename Self, typename Reducer, typename Device,
          bool Vectorize =
              (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
               internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a
    // scan per remaining entry. The iteration is split into two nested
    // loops to avoid an integer division by keeping track of each idx1 and
    // idx2.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
      block_reducer(self, idx1, data);
    }
  }
};
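
// Worked example of the decomposition above: for a ColMajor tensor of shape
// (4, 3, 5) scanned along axis 1, stride() == 4, size() == 3 and
// total_size == 60. The outer loop advances idx1 in steps of
// stride() * size() == 12 (one step per index of the trailing dimension),
// and ReduceBlock then runs one scan per idx2 in [0, 4), each touching the
// elements offset, offset + 4 and offset + 8.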

#ifdef EIGEN_USE_THREADS

// Adjust block_size to avoid false sharing of cachelines among
// threads. Currently set to twice the cache line size on Intel and ARM
// processors.
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
  const Index items_per_cacheline =
      numext::maxi<Index>(1, kBlockAlignment / item_size);
  return items_per_cacheline * divup(block_size, items_per_cacheline);
}
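
// Worked example: for float items (item_size == 4), items_per_cacheline is
// 128 / 4 == 32, so a requested block_size of 50 is rounded up to
// 32 * divup(50, 32) == 64 items, keeping every shard a multiple of the
// 128-byte alignment unit.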

template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index num_scalars = self.stride();
    Index num_packets = 0;
    if (self.stride() >= PacketSize) {
      num_packets = self.stride() / PacketSize;
      self.device().parallelFor(
          num_packets,
          TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
                       16 * PacketSize * self.size(), true, PacketSize),
          // Make the shard size large enough that two neighboring threads
          // won't write to the same cacheline of `data`.
          [=](Index blk_size) {
            return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index packet = first; packet < last; ++packet) {
              const Index idx2 = packet * PacketSize;
              ReducePacket(self, idx1 + idx2, data);
            }
          });
      num_scalars -= num_packets * PacketSize;
    }
    self.device().parallelFor(
        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index scalar = first; scalar < last; ++scalar) {
            const Index idx2 = num_packets * PacketSize + scalar;
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};

template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    self.device().parallelFor(
        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index idx2 = first; idx2 < last; ++idx2) {
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};

// Specialization for multi-threaded execution.
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    const Index total_size = internal::array_prod(self.dimensions());
    const Index inner_block_size = self.stride() * self.size();
    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));

    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
      launcher(self, data);
      return;
    }

    if (parallelize_by_outer_blocks) {
      // Parallelize over outer blocks.
      const Index num_outer_blocks = total_size / inner_block_size;
      self.device().parallelFor(
          num_outer_blocks,
          TensorOpCost(inner_block_size, inner_block_size,
                       16 * PacketSize * inner_block_size, Vectorize,
                       PacketSize),
          [=](Index blk_size) {
            return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index idx1 = first; idx1 < last; ++idx1) {
              ReduceBlock<Self, Vectorize, /*Parallelize=*/false> block_reducer;
              block_reducer(self, idx1 * inner_block_size, data);
            }
          });
    } else {
      // Parallelize over the inner packets/scalars dimension when the
      // reduction axis is not an inner dimension.
      ReduceBlock<Self, Vectorize, /*Parallelize=*/true> block_reducer;
      for (Index idx1 = 0; idx1 < total_size;
           idx1 += self.stride() * self.size()) {
        block_reducer(self, idx1, data);
      }
    }
  }
};
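
// Worked example of the heuristic above: for a ColMajor tensor of shape
// (4, 3, 5) scanned along axis 1, total_size == 60, stride() == 4 and
// inner_block_size == 12, so parallelize_by_outer_blocks is true, but
// total_size <= 4096 hands the work back to the single-threaded launcher.
// For a (256, 256) tensor scanned along axis 1 (stride() == 256,
// inner_block_size == 65536), total_size == 65536 is smaller than
// stride() * inner_block_size, so with a typical packet size the launcher
// instead parallelizes across the 256 scan lines of the single outer block.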
#endif // EIGEN_USE_THREADS

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();

}

template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;

    LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
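
// Worked example of the launch configuration above: each GPU thread handles
// one complete scan line. A tensor with total_size == 6000 and size() == 3
// has 2000 lines, giving num_blocks == (2000 + 63) / 64 == 32 blocks of 64
// threads; the bounds check in ScanKernel leaves the surplus 48 threads idle.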
#endif // EIGEN_USE_GPU && (EIGEN_GPUCC)

} // namespace internal

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef const ArgType ChildTypeNoConst;
  typedef const ArgType ChildType;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1), m_consume_dim(op.axis()),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      // dims can only be indexed through unsigned integers,
      // so let's use an unsigned type to let the compiler know.
      // This prevents spurious warnings such as:
      // "'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function"
      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
      for (unsigned int i = NumDims - 1; i > axis; --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
    return m_consume_dim;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    internal::ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
    launcher(*this, m_output);
    return true;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
  {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_STRONG_INLINE void cleanup() {
    if (m_output) {
      m_device.deallocate_temp(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
    m_output.bind(cgh);
  }
#endif
protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device EIGEN_DEVICE_REF m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  const Index m_consume_dim;
  EvaluatorPointerType m_output;
};
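
// Usage sketch: this evaluator runs whenever a scan expression is assigned to
// a tensor, and the Device template argument picks the matching ScanLauncher.
// Assuming EIGEN_USE_THREADS and the CXX11 ThreadPool support are available:
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice dev(&pool, /*num_cores=*/4);
//   Eigen::Tensor<float, 2> in(512, 512), out(512, 512);
//   in.setRandom();
//   out.device(dev) = in.cumsum(/*axis=*/1);  // multi-threaded scan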

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H