#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {
namespace internal {

// TensorExecutor launches the evaluation of a tensor expression on the
// specified computing device.
//
// Default strategy: the expression is evaluated coefficient by coefficient
// with a single cpu thread.
template <typename Expression, typename Device, bool Vectorizable>
class TensorExecutor {
 public:
  typedef typename Expression::Index Index;

  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
      for (Index i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
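// Note: TensorExecutor is not meant to be called directly by user code. It is
// typically reached through the evaluation of an assignment expression, e.g.
// (illustrative sketch, not part of this header):
//
//   // Eigen::Tensor<float, 2> C(m, n), A(m, n), B(m, n);
//   // C = A + B;  // builds an assignment expression and hands it to
//   //             // internal::TensorExecutor<...>::run(expr, device)
//
// The primary template above is the scalar fallback used when the expression
// cannot be vectorized on the given device.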
// Vectorized evaluation on the default (single threaded) cpu device.
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, true> {
 public:
  typedef typename Expression::Index Index;

  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;

      // Give the compiler a strong hint to unroll the packet loop by 4, but
      // don't insist on it at the expense of inlining.
      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
      for (Index i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const Index VectorizedSize = (size / PacketSize) * PacketSize;
      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
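// Worked example of the loop bounds above (illustrative): with size == 1000
// and PacketSize == 4, UnrolledSize == (1000 / 16) * 16 == 992 and
// VectorizedSize == (1000 / 4) * 4 == 1000, so coefficients [0, 992) are
// evaluated in groups of four packets, [992, 1000) packet by packet, and the
// trailing scalar loop has nothing left to do. Only when size is not a
// multiple of PacketSize does the final scalar loop handle the remainder.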
// Multicore strategy: the index space is partitioned and each partition is
// executed on a single core.
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    return size;
  }
};
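// This non-vectorized EvalRange is the fallback used by the ThreadPoolDevice
// executor below when Vectorizable is false; alignBlockSize is the identity
// because scalar evaluation imposes no alignment constraint on block
// boundaries.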
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    Index i = first;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index last_chunk_offset = last - 4 * PacketSize;
      // Manually unroll the packet loop by a factor of 4.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = last - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    // Align block sizes to packet boundaries, accounting for the 4x unrolling above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase the block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
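// Worked example of alignBlockSize (illustrative): with PacketSize == 8, a
// requested block size of 200 (>= 16 * PacketSize == 128) is rounded up to
// the next multiple of 4 * PacketSize, (200 + 31) & ~31 == 224, matching the
// 4x-unrolled packet loop in run(). A requested size of 100 is below that
// threshold and is only rounded to the next multiple of PacketSize,
// (100 + 7) & ~7 == 104.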
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;

  static inline void run(const Expression& expr, const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
                         [&evaluator](Index first, Index last) {
                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
                         });
#else
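      // Fallback used when EIGEN_USE_SIMPLE_THREAD_POOL is defined: the index
      // space is split into packet-aligned blocks that are enqueued by hand
      // and synchronized with a Barrier, instead of relying on the richer
      // device.parallelFor interface above.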
      size_t num_threads = device.numThreads();
      if (num_threads > 1) {
        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
            size, evaluator.costPerCoeff(Vectorizable), num_threads);
      }
      if (num_threads == 1) {
        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
      } else {
        const Index PacketSize =
            Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
        // Split the work into packet-aligned blocks of roughly size / num_threads.
        Index blocksz = std::ceil<Index>(static_cast<float>(size) / num_threads) + PacketSize - 1;
        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
        const Index numblocks = size / blocksize;

        Barrier barrier(numblocks);
        for (int i = 0; i < numblocks; ++i) {
          device.enqueue_with_barrier(
              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
              &evaluator, i * blocksize, (i + 1) * blocksize);
        }
        // Evaluate any remainder that didn't fit into a full block on this thread.
        if (numblocks * blocksize < size) {
          EvalRange<Evaluator, Index, Vectorizable>::run(
              &evaluator, numblocks * blocksize, size);
        }
        barrier.Wait();
      }
#endif  // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
    }
    evaluator.cleanup();
  }
};
#endif  // EIGEN_USE_THREADS

// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
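// The GpuDevice specialization is only declared here; its run() definition
// further below launches a CUDA kernel and is therefore guarded by
// __CUDACC__, so it is compiled only when this header is processed by nvcc.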
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, GpuDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(__CUDACC__)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    for (Index i = first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};
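// The scalar kernel body above is a standard grid-stride loop: each CUDA
// thread starts at its global thread id and advances by the total number of
// threads in the grid until the whole index space has been covered.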
template <typename Evaluator, typename Index>
struct EigenMetaKernelEval<Evaluator, Index, true> {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const Index vectorized_size = (last / PacketSize) * PacketSize;
    const Index vectorized_step_size = step_size * PacketSize;

    // Use the vector path while full packets remain.
    for (Index i = first * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    // Evaluate the remaining coefficients one by one.
    for (Index i = vectorized_size + first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};
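// Worked example of the vectorized kernel indexing (illustrative): with
// PacketSize == 4, last == 103 and a grid of step_size threads, the thread
// with id `first` evaluates packets at 4 * first, 4 * (first + step_size), ...
// while the index stays below vectorized_size == 100, then switches to the
// scalars 100 + first, 100 + first + step_size, ... up to 102. Note that
// `first` and `step_size` count packets in the first loop and individual
// coefficients in the second.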
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, Index size) {
  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}
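// Evaluator::PacketAccess and Evaluator::IsAligned are compile-time constants,
// so `vectorizable` is a constant expression and the appropriate
// EigenMetaKernelEval specialization is selected at compile time; there is no
// runtime branch inside the kernel.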
/*static*/
template <typename Expression, bool Vectorizable>
inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign) {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Launch at least one block so we don't crash on tensors of size 0.
    const int num_blocks = numext::maxi<int>(
        numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_CUDA_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
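// Worked example of the launch configuration (illustrative numbers): on a
// device with 8 multiprocessors, 2048 resident threads per multiprocessor and
// block_size == 1024, max_blocks == 8 * 2048 / 1024 == 16. For a tensor of
// size 1 << 20, divup(size, block_size) == 1024, so num_blocks ==
// min(16, 1024) == 16 and each of the 16 * 1024 threads walks roughly 64
// coefficients through the grid-stride loop in EigenMetaKernel.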
#endif  // __CUDACC__
#endif  // EIGEN_USE_GPU

// SYCL Executor policy
#ifdef EIGEN_USE_SYCL
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, SyclDevice, Vectorizable> {
 public:
  static inline void run(const Expression &expr, const SyclDevice &device) {
    // Forward the evaluation to the TensorSYCL module.
    TensorSycl::run(expr, device);
  }
};
#endif  // EIGEN_USE_SYCL

} // end namespace internal
} // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H