#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> > : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};
template <typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense> {
  typedef const TensorScanOp<Op, XprType>& type;
};

template <typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
              typename eval<TensorScanOp<Op, XprType> >::type> {
  typedef TensorScanOp<Op, XprType> type;
};

}  // end namespace internal
/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};
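// Illustrative usage sketch (not part of the original file): a TensorScanOp is
// normally created through the TensorBase scan helpers rather than directly.
// The cumsum()/cumprod() entry points below are assumed from Eigen's Tensor
// module API, not defined in this header:
//
//   Eigen::Tensor<float, 2> t(3, 4);
//   t.setRandom();
//   Eigen::Tensor<float, 2> running_sum  = t.cumsum(0);         // inclusive scan along dim 0
//   Eigen::Tensor<float, 2> shifted_prod = t.cumprod(1, true);  // exclusive scan along dim 1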
namespace internal {

template <typename Self>
EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  // Compute the scan along the axis, starting at the given offset.
  typename Self::CoeffReturnType accum = self.accumulator().initialize();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
}
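// Worked example (illustrative, not part of the original file): with a sum
// reducer and a unit-stride scan line holding {1, 2, 3, 4}, the inclusive
// branches above write {1, 3, 6, 10}, while the exclusive branches store the
// accumulator before reducing the current element and write {0, 1, 3, 6}.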
template <typename Self>
EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  using Scalar = typename Self::CoeffReturnType;
  using Packet = typename Self::PacketReturnType;
  // Compute the scan along the axis, starting at the given offset.
  Packet accum = self.accumulator().template initializePacket<Packet>();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  }
}
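// Note (illustrative, not part of the original file): the packet path scans
// PacketSize neighboring scan lines in lockstep. With PacketSize == 4 and
// stride() == 8, each unaligned packet load at `curr` gathers one element from
// four adjacent lines, so `accum` carries four independent running values.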
// Generic (scalar, sequential) block reducer: scan every line that starts in
// the block beginning at idx1.
template <typename Self, bool Vectorize, bool Parallel>
struct ReduceBlock {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan.
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};
// Specialization for vectorized reduction.
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index idx2 = 0;
    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
      // Calculate the starting offset for the packet scan.
      Index offset = idx1 + idx2;
      ReducePacket(self, offset, data);
    }
    for (; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan.
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};
// Single-threaded CPU implementation of scan.
template <typename Self, typename Reducer, typename Device,
          bool Vectorize =
              (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
               internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a scan per
    // remaining entry. The iteration is split into two nested loops to avoid
    // an integer division by keeping track of each idx1 and idx2.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
      block_reducer(self, idx1, data);
    }
  }
};
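// Worked example (illustrative, not part of the original file): for a
// column-major 2x3x4 tensor scanned along axis 1, stride() == 2 and
// size() == 3, so idx1 visits {0, 6, 12, 18} and idx2 visits {0, 1}; each
// ReduceScalar call then touches the linear indices offset, offset + 2 and
// offset + 4, i.e. exactly one line along the scan axis.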
#ifdef EIGEN_USE_THREADS

// Adjust block_size to avoid false sharing of cachelines between
// multiple threads.
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
  const Index items_per_cacheline =
      numext::maxi<Index>(1, kBlockAlignment / item_size);
  return items_per_cacheline * divup(block_size, items_per_cacheline);
}
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index num_scalars = self.stride();
    Index num_packets = 0;
    if (self.stride() >= PacketSize) {
      num_packets = self.stride() / PacketSize;
      self.device().parallelFor(
          num_packets,
          TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
                       16 * PacketSize * self.size(), true, PacketSize),
          // Make the shard size large enough that two neighboring threads
          // won't write to the same cacheline of `data`.
          [=](Index blk_size) {
            return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index packet = first; packet < last; ++packet) {
              const Index idx2 = packet * PacketSize;
              ReducePacket(self, idx1 + idx2, data);
            }
          });

      num_scalars -= num_packets * PacketSize;
    }
    self.device().parallelFor(
        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index scalar = first; scalar < last; ++scalar) {
            const Index idx2 = num_packets * PacketSize + scalar;
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    self.device().parallelFor(
        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index idx2 = first; idx2 < last; ++idx2) {
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};
// Specialization for multi-threaded execution.
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    const Index total_size = internal::array_prod(self.dimensions());
    const Index inner_block_size = self.stride() * self.size();
    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));

    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
      launcher(self, data);
      return;
    }

    if (parallelize_by_outer_blocks) {
      // Parallelize over outer blocks.
      const Index num_outer_blocks = total_size / inner_block_size;
      self.device().parallelFor(
          num_outer_blocks,
          TensorOpCost(inner_block_size, inner_block_size,
                       16 * PacketSize * inner_block_size, Vectorize,
                       PacketSize),
          [=](Index blk_size) {
            return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index idx1 = first; idx1 < last; ++idx1) {
              ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
              block_reducer(self, idx1 * inner_block_size, data);
            }
          });
    } else {
      // Parallelize over the inner packets/scalars of each block.
      ReduceBlock<Self, Vectorize, /*Parallel=*/true> block_reducer;
      for (Index idx1 = 0; idx1 < total_size;
           idx1 += self.stride() * self.size()) {
        block_reducer(self, idx1, data);
      }
    }
  }
};
#endif  // EIGEN_USE_THREADS

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))

// GPU implementation of scan: each thread performs one full (sequential) scan
// along the scan axis.
template <typename Self, typename Reducer>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(
    Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute the offset of the scan line handled by this thread.
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset.
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
}
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;

    LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0,
                      self.device(), self, total_size, data);
  }
};
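// Note (illustrative, not part of the original file): this launcher runs one
// GPU thread per scan line. For a 128 x 1000 tensor scanned along the axis of
// size 1000 there are total_size / size() == 128 independent lines, so
// num_blocks == (128 + 63) / 64 == 2 blocks of 64 threads each.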
#endif  // EIGEN_USE_GPU && (EIGEN_GPUCC)

}  // end namespace internal

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef const ArgType ChildTypeNoConst;
  static const int NumDims =
      internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  typedef internal::TensorBlockNotImplemented TensorBlock;

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1), m_consume_dim(op.axis()),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute the stride of the scan axis.
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      // dims can only be indexed through unsigned integers, so use an
      // unsigned index type to avoid spurious compiler warnings.
      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
      for (unsigned int i = NumDims - 1; i > axis; --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { return m_stride; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
    return m_consume_dim;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { return m_size; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    internal::ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<EvaluatorPointerType>(
        m_device.get((Scalar*)m_device.allocate_temp(total_size * sizeof(Scalar))));
    launcher(*this, m_output);
    return true;
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }
  EIGEN_STRONG_INLINE void cleanup() {
    if (m_output) {
      m_device.deallocate_temp(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

#ifdef EIGEN_USE_SYCL
  // Bind the placeholder accessors to a SYCL command group handler.
  EIGEN_STRONG_INLINE void bind(cl::sycl::handler& cgh) const {
    m_impl.bind(cgh);
    m_output.bind(cgh);
  }
#endif

protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device EIGEN_DEVICE_REF m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  Index m_consume_dim;
  EvaluatorPointerType m_output;
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H