#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {
namespace internal {

// Detects whether an expression tree contains a TensorBroadcastingOp.
// Broadcasting is expensive to evaluate on the packet path, so tiled
// evaluation of such expressions is not vectorized.
template <typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};

template <typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};

template <typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};

template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
                 ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};

template <typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};
// Default strategy: evaluate the expression sequentially on a single CPU
// thread, without vectorization and without block (tiled) evaluation.
template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Instantiating this template with a non-default device means the header
  // was included without defining the macro that selects that device.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(
      const Expression& expr, const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
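// Illustrative usage sketch (not part of this header): assigning a tensor
// expression is what ultimately routes through TensorExecutor::run. The
// tensors `a`, `b`, `c` below are hypothetical.
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
//   a.setRandom();
//   b.setRandom();
//   // Evaluated by the default (single thread) executor unless a device
//   // is selected explicitly via c.device(...).
//   c = a + b * 0.5f;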
// The default asynchronous execution strategy is not implemented: async
// evaluation is currently only available for ThreadPoolDevice (see below).
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};
// Process all the data with a single CPU thread, using vectorized (packet)
// instructions.
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;

      // Give the compiler a strong hint to unroll the loop, without insisting
      // on it: for expensive expressions unrolling is not worth losing inlining.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
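// Worked example of the three-phase split above (assuming PacketSize == 4,
// e.g. floats with SSE): for size == 71,
//   UnrolledSize   = (71 / 16) * 16 = 64  -> indices [0, 64)  via 4x-unrolled packets
//   VectorizedSize = (71 /  4) *  4 = 68  -> indices [64, 68) via single packets
//   scalar tail                           -> indices [68, 71) via evalScalar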
// Process all the data with a single CPU thread, using blocks of data: sizing
// a block to fit the L1 cache gives better cache behaviour.
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;
  // Block mapper / descriptor / scratch typedefs elided in this excerpt.

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      // Query the expression tree for the desired block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share the scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};
// Multicore strategy: the index space is partitioned and each partition is
// executed on a single core.
#ifdef EIGEN_USE_THREADS

template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost,
                              size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};
// Computes block evaluation parameters and the aligned scratch-buffer size
// used by the tiled ThreadPoolDevice executors below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query the expression for its preferred block shape, then resize blocks so
  // that one block roughly matches the cost model's task size.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();
  const double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  // Align the per-block scratch buffer size to EIGEN_MAX_ALIGN_BYTES.
  const size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable=*/true> {
  static const int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong hint to unroll the loop, without forcing it.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align the block size to the packet size and account for the 4x
    // unrolling in run() above, unless that would grow a small block too much.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
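// Worked example for alignBlockSize (assuming PacketSize == 8, e.g. floats
// with AVX): a requested block size of 1000 is >= 16 * 8, so it is rounded up
// to a multiple of 4 * PacketSize = 32:
//   (1000 + 31) & ~31 = 1024
// A small request of 20 (< 128) is only rounded up to a multiple of 8:
//   (20 + 7) & ~7 = 24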
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                         });
    }
    evaluator.cleanup();
  }
};
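// Illustrative usage sketch (not part of this header): selecting a
// ThreadPoolDevice routes evaluation through the executor above. The pool
// size (4) and tensor names are hypothetical.
//
//   #define EIGEN_USE_THREADS
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, 4);
//   Eigen::Tensor<float, 2> a(1024, 1024), b(1024, 1024), c(1024, 1024);
//   a.setRandom();
//   b.setRandom();
//   c.device(device) = a + b;  // parallelFor partitions the index space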
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
  // BlockMapper / TensorBlockDesc / TensorBlockScratch typedefs elided.

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};
// Non-blocking executor: work is submitted to the ThreadPoolDevice managed
// thread pool and `done` is invoked once all tasks have finished.
template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  // The context owns the evaluator and the done callback; deleting it (once
  // evaluation completes) runs cleanup and then invokes the callback.
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
  // BlockMapper / TensorBlockDesc / TensorBlockScratch typedefs elided.

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};
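// Illustrative usage sketch (not part of this header, and assuming the
// asynchronous device() overload available in recent Eigen versions): passing
// a done-callback to device() requests evaluation via TensorAsyncExecutor.
// The pool size and tensor names are hypothetical.
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, 4);
//   Eigen::Tensor<float, 2> a(1024, 1024), c(1024, 1024);
//   a.setRandom();
//   Eigen::Barrier done(1);
//   c.device(device, [&done]() { done.Notify(); }) = a * 2.0f;  // returns immediately
//   done.Wait();  // block until the async executor has finished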
#endif  // EIGEN_USE_THREADS

// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
           StorageIndex step_size) {
    // Grid-stride loop over all coefficients.
    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, /*Vectorizable=*/true> {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
           StorageIndex step_size) {
    const StorageIndex PacketSize =
        unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the packet (vector) path for the bulk of the range...
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    // ...and finish the remainder with scalar evaluation.
    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx;
         i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {
  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(
      eval, first_index, size, step_size);
}
/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void
TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Launch at least one block, even for empty tensors.
    const int num_blocks = numext::maxi<int>(
        numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
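// Worked example of the launch configuration above (hypothetical device
// values): with maxGpuThreadsPerBlock() == 1024, 80 multiprocessors and
// maxGpuThreadsPerMultiProcessor() == 2048,
//   block_size = 1024
//   max_blocks = 80 * 2048 / 1024 = 160
// For a tensor of 1,000,000 coefficients, divup(1000000, 1024) = 977, so
//   num_blocks = max(min(160, 977), 1) = 160
// and each of the 160 * 1024 threads walks the index space with a
// grid-stride of 163,840.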
#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

// SYCL executor policy.
#ifdef EIGEN_USE_SYCL

template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;

  template <typename Scratch>
  EIGEN_DEVICE_FUNC ExecExprFunctorKernel(const Scratch, Evaluator evaluator_,
                                          const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }

  // Scalar path: each work item strides over the whole range.
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }
  // Packet path: evaluate full packets first, then a scalar remainder.
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};
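// Worked example of the packet-path indexing above (hypothetical values):
// with range == 1003, Evaluator::PacketSize == 4 and a global range of 128
// work items,
//   vectorizedRange = (1003 / 4) * 4 = 1000
//   step            = 4 * 128       = 512
// Work item gId starts at index 4 * gId and evaluates packets at
// 4*gId, 4*gId + 512, ... below 1000. The scalar tail then covers indices
// 1000 + gId, 1000 + gId + 128, ... so items 0..2 pick up 1000, 1001, 1002.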
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;
  typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
          typename Evaluator::CoeffReturnType,
          ExecExprFunctorKernel<Evaluator> >(
          evaluator,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                cl::sycl::range<1>(tileSize)),
          Index(1), range);
    }
    evaluator.cleanup();
  }
};
#endif  // EIGEN_USE_SYCL

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H