#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {
namespace internal {
// Detects whether an expression tree contains a TensorBroadcastingOp.
// Evaluating broadcasting through the coefficient/packet path is expensive, so
// expressions containing a broadcast prefer block (tiled) evaluation.
template <typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};

template <typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};

template <typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};

template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};

template <typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};
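// For example (illustrative expression types, not defined in this file):
//   ExpressionHasTensorBroadcastingOp<
//       const TensorAssignOp<Lhs, const TensorBroadcastingOp<B, Rhs> > >::value
// is true, because the recursion on the assign op inspects its right-hand side
// and finds the broadcasting node.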
// Default strategy: the expression is evaluated sequentially with a single cpu
// thread, without vectorization and without block (tiled) evaluation.
template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Instantiating this template with a non-default device means the header was
  // included without selecting a device backend first.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
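// Usage sketch (illustrative only; user code normally never names the executor
// directly, tensor assignment dispatches to it):
//
//   Eigen::Tensor<float, 1> a(128), b(128);
//   a.setRandom();
//   b = a * 2.0f;  // evaluated by TensorExecutor<..., DefaultDevice, ...>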
// Default async execution strategy is not implemented; asynchronous evaluation
// is currently only available for ThreadPoolDevice (see below).
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};

// Process all the data with a single cpu thread, using vectorized instructions.
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;

      // Process four packets per iteration to give the compiler a strong
      // opportunity to unroll the loop, without insisting on it.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
// Process all the data with a single cpu thread, using blocks of data. Sizing
// a block to fit in L1 cache improves cache locality.
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index StorageIndex;
  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  static const int NumDims = traits<Expression>::NumDimensions;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;
    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      // Query the expression tree for the desired block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();
      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share the scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};
#ifdef EIGEN_USE_THREADS
template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost,
                              size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};
// Computes block evaluation parameters (block mapper, per-block cost and the
// aligned scratch buffer size) shared by the tiled ThreadPoolDevice executors.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query the expression tree for the desired block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update the target block size based on the cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}
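// For example (assuming EIGEN_MAX_ALIGN_BYTES == 64 and float coefficients):
// a block of 1000 coefficients occupies 4000 bytes, and the aligned scratch
// size becomes 64 * divup(4000, 64) = 64 * 63 = 4032 bytes.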
// Evaluates a range [firstIdx, lastIdx) of expression coefficients on a single
// thread. Used as the per-partition body of ThreadPoolDevice::parallelFor.
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable=*/true> {
  static const int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong opportunity to unroll the loop, but don't
      // insist on it: if the packet evaluation is expensive the compiler should
      // not unroll at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align the block size to the packet size and account for the unrolling in
    // run() above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase the block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
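// For example, with PacketSize == 4: alignBlockSize(100) rounds up to the next
// multiple of 4 * PacketSize, i.e. (100 + 15) & ~15 == 112, while a small
// request such as alignBlockSize(20) is only rounded to a packet multiple,
// (20 + 3) & ~3 == 20.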
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(
          size, evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&evaluator, firstIdx, lastIdx);
          });
    }
    evaluator.cleanup();
  }
};
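// Usage sketch (illustrative only): the multi-threaded executor is selected by
// assigning through a ThreadPoolDevice:
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
//   Eigen::Tensor<float, 1> a(1 << 20), b(1 << 20);
//   a.setRandom();
//   b.device(device) = a * 2.0f;  // runs via device.parallelFor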
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
      TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};
/**
 * Non-blocking multicore strategy: runAsync() submits work to the
 * ThreadPoolDevice managed thread pool and returns immediately; the 'done'
 * callback is invoked once all tasks have finished.
 */
template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};
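// Usage sketch (illustrative only; assumes the asynchronous assignment
// overload TensorBase::device(device, done) and Eigen::Barrier from the
// ThreadPool module are available in the Eigen version in use):
//
//   Eigen::Barrier barrier(1);
//   b.device(device, [&barrier]() { barrier.Notify(); }) = a * 2.0f;
//   // ... do other work while the expression is evaluated ...
//   barrier.Wait();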
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};
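// Note on lifetime: the heap-allocated TensorAsyncExecutorContext owns the
// evaluator, the tiling context and the user callback. It is deleted exactly
// once, by whichever task finishes last (or by the early-exit paths above);
// its destructor then calls evaluator.cleanup() followed by on_done().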
#endif  // EIGEN_USE_THREADS

#if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE void run(Evaluator& eval,
                                                 StorageIndex firstIdx,
                                                 StorageIndex lastIdx,
                                                 StorageIndex step_size) {
    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, /*Vectorizable=*/true> {
  static __device__ EIGEN_ALWAYS_INLINE void run(Evaluator& eval,
                                                 StorageIndex firstIdx,
                                                 StorageIndex lastIdx,
                                                 StorageIndex step_size) {
    const StorageIndex PacketSize =
        unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the packet path first, then finish the tail with scalar evaluation.
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx;
         i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
__global__ void __launch_bounds__(1024) EigenMetaKernel(Evaluator eval,
                                                        StorageIndex size) {
  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(
      eval, first_index, size, step_size);
}
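// The kernel above uses a grid-stride loop: every thread starts at its global
// index and advances by blockDim.x * gridDim.x, so any launch configuration
// covers the full [0, size) range regardless of the number of blocks.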
/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void
TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block so that tensors of size 0 do not crash.
    const int num_blocks = numext::maxi<int>(
        numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
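// Usage sketch (illustrative only; requires EIGEN_USE_GPU and a GPU-enabled
// compiler; Eigen::GpuStreamDevice is called CudaStreamDevice in older
// releases):
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   b.device(gpu_device) = a * 2.0f;  // launches EigenMetaKernel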
#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

#ifdef EIGEN_USE_SYCL
template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;

  template <typename Scratch>
  EIGEN_STRONG_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_,
                                            const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }

  // Scalar path: each work-item strides over the index space.
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_STRONG_INLINE typename std::enable_if<!is_vec, void>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }

  // Vectorized path: evaluate packets first, then the scalar tail.
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_STRONG_INLINE typename std::enable_if<is_vec, void>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
             typename Evaluator::CoeffReturnType,
             ExecExprFunctorKernel<Evaluator> >(
          evaluator,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                cl::sycl::range<1>(tileSize)),
          Index(1), range);
    }
    evaluator.cleanup();
  }
};
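// Usage sketch (illustrative only; requires EIGEN_USE_SYCL and a SYCL
// compiler):
//
//   Eigen::QueueInterface queue(cl::sycl::default_selector{});
//   Eigen::SyclDevice sycl_device(&queue);
//   b.device(sycl_device) = a * 2.0f;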
#endif  // EIGEN_USE_SYCL

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H