#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {
namespace internal {
// Detects whether an expression tree contains a TensorBroadcastingOp.
// Evaluating broadcasting through the coefficient/packet path is expensive, so
// expressions containing a broadcast prefer block (tiled) evaluation.
template <typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};

template <typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};

template <typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};

template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};

template <typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};
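// For example (illustrative expression types, not defined in this file):
//   ExpressionHasTensorBroadcastingOp<
//       const TensorAssignOp<Lhs, const TensorBroadcastingOp<B, Rhs> > >::value
// is true, because the recursion on the assign op inspects its right-hand side
// and finds the broadcasting node.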
// Default strategy: the expression is evaluated sequentially with a single cpu
// thread, without vectorization and without block (tiled) evaluation.
template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Instantiating this template with a non-default device means the header was
  // included without selecting a device backend first.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
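// Usage sketch (illustrative only; user code normally never names the executor
// directly, tensor assignment dispatches to it):
//
//   Eigen::Tensor<float, 1> a(128), b(128);
//   a.setRandom();
//   b = a * 2.0f;  // evaluated by TensorExecutor<..., DefaultDevice, ...>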
// Default async execution strategy is not implemented; asynchronous evaluation
// is currently only available for ThreadPoolDevice (see below).
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};

// Process all the data with a single cpu thread, using vectorized instructions.
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;

      // Process four packets per iteration to give the compiler a strong
      // opportunity to unroll the loop, without insisting on it.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
// Process all the data with a single cpu thread, using blocks of data. Sizing
// a block to fit in L1 cache improves cache locality.
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index StorageIndex;
  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  static const int NumDims = traits<Expression>::NumDimensions;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;
    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      // Query the expression tree for the desired block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();
      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share the scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};
#ifdef EIGEN_USE_THREADS
template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost,
                              size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};
// Computes block evaluation parameters (block mapper, per-block cost and the
// aligned scratch buffer size) shared by the tiled ThreadPoolDevice executors.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query the expression tree for the desired block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update the target block size based on the cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}
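// For example (assuming EIGEN_MAX_ALIGN_BYTES == 64 and float coefficients):
// a block of 1000 coefficients occupies 4000 bytes, and the aligned scratch
// size becomes 64 * divup(4000, 64) = 64 * 63 = 4032 bytes.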
// Evaluates a range [firstIdx, lastIdx) of expression coefficients on a single
// thread. Used as the per-partition body of ThreadPoolDevice::parallelFor.
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable=*/true> {
  static const int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong opportunity to unroll the loop, but don't
      // insist on it: if the packet evaluation is expensive the compiler should
      // not unroll at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align the block size to the packet size and account for the unrolling in
    // run() above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase the block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
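// For example, with PacketSize == 4: alignBlockSize(100) rounds up to the next
// multiple of 4 * PacketSize, i.e. (100 + 15) & ~15 == 112, while a small
// request such as alignBlockSize(20) is only rounded to a packet multiple,
// (20 + 3) & ~3 == 20.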
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(
          size, evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&evaluator, firstIdx, lastIdx);
          });
    }
    evaluator.cleanup();
  }
};
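// Usage sketch (illustrative only): the multi-threaded executor is selected by
// assigning through a ThreadPoolDevice:
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
//   Eigen::Tensor<float, 1> a(1 << 20), b(1 << 20);
//   a.setRandom();
//   b.device(device) = a * 2.0f;  // runs via device.parallelFor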
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
      TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};
/**
 * Non-blocking multicore strategy: runAsync() submits work to the
 * ThreadPoolDevice managed thread pool and returns immediately; the 'done'
 * callback is invoked once all tasks have finished.
 */
template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};
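// Usage sketch (illustrative only; assumes the asynchronous assignment
// overload TensorBase::device(device, done) and Eigen::Barrier from the
// ThreadPool module are available in the Eigen version in use):
//
//   Eigen::Barrier barrier(1);
//   b.device(device, [&barrier]() { barrier.Notify(); }) = a * 2.0f;
//   // ... do other work while the expression is evaluated ...
//   barrier.Wait();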
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};
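// Note on lifetime: the heap-allocated TensorAsyncExecutorContext owns the
// evaluator, the tiling context and the user callback. It is deleted exactly
// once, by whichever task finishes last (or by the early-exit paths above);
// its destructor then calls evaluator.cleanup() followed by on_done().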
#endif  // EIGEN_USE_THREADS

#if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE void run(Evaluator& eval,
                                                 StorageIndex firstIdx,
                                                 StorageIndex lastIdx,
                                                 StorageIndex step_size) {
    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, /*Vectorizable=*/true> {
  static __device__ EIGEN_ALWAYS_INLINE void run(Evaluator& eval,
                                                 StorageIndex firstIdx,
                                                 StorageIndex lastIdx,
                                                 StorageIndex step_size) {
    const StorageIndex PacketSize =
        unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the packet path first, then finish the tail with scalar evaluation.
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx;
         i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
__global__ void __launch_bounds__(1024) EigenMetaKernel(Evaluator eval,
                                                        StorageIndex size) {
  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(
      eval, first_index, size, step_size);
}
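// The kernel above uses a grid-stride loop: every thread starts at its global
// index and advances by blockDim.x * gridDim.x, so any launch configuration
// covers the full [0, size) range regardless of the number of blocks.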
/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void
TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block so that tensors of size 0 do not crash.
    const int num_blocks = numext::maxi<int>(
        numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
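// Usage sketch (illustrative only; requires EIGEN_USE_GPU and a GPU-enabled
// compiler; Eigen::GpuStreamDevice is called CudaStreamDevice in older
// releases):
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   b.device(gpu_device) = a * 2.0f;  // launches EigenMetaKernel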
#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

#ifdef EIGEN_USE_SYCL
template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;

  template <typename Scratch>
  EIGEN_STRONG_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_,
                                            const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }

  // Scalar path: each work-item strides over the index space.
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_STRONG_INLINE typename std::enable_if<!is_vec, void>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }

  // Vectorized path: evaluate packets first, then the scalar tail.
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_STRONG_INLINE typename std::enable_if<is_vec, void>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
             typename Evaluator::CoeffReturnType,
             ExecExprFunctorKernel<Evaluator> >(
          evaluator,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                cl::sycl::range<1>(tileSize)),
          Index(1), range);
    }
    evaluator.cleanup();
  }
};
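// Usage sketch (illustrative only; requires EIGEN_USE_SYCL and a SYCL
// compiler):
//
//   Eigen::QueueInterface queue(cl::sycl::default_selector{});
//   Eigen::SyclDevice sycl_device(&queue);
//   b.device(sycl_device) = a * 2.0f;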
#endif  // EIGEN_USE_SYCL

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H