10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
// Generic TensorExecutor: evaluates a tensor expression on the given Device.
// NOTE(review): this chunk is fragmented — the class header, braces, the
// evaluator construction, and the cleanup path are elided here; only the
// template parameter list, the run() signature and the needs_assign check
// are visible. Presumably evalSubExprsIfNeeded(NULL) returns whether the
// expression still needs a coefficient-by-coefficient assignment loop —
// TODO confirm against the full file.
26 template<
typename Expression,
typename Device,
bool Vectorizable>
// Entry point: evaluate `expr` on `device` (defaults to a default-constructed
// Device).
32 static inline void run(
const Expression& expr,
const Device& device = Device())
// If sub-expressions were not fully materialized, the (elided) assignment
// loop below runs; NULL means no externally provided destination buffer.
35 const bool needs_assign =
evaluator.evalSubExprsIfNeeded(NULL);
// Vectorized specialization (single-threaded default device).
// NOTE(review): struct header, evaluator construction, loop bodies and
// closing braces are elided in this chunk; the loop structure below shows
// a three-phase evaluation:
//   1. a 4x-unrolled packet loop over [0, UnrolledSize),
//   2. a plain packet loop over [UnrolledSize, VectorizedSize),
//   3. a scalar tail loop over [VectorizedSize, size).
48 template<
typename Expression>
57 const bool needs_assign =
evaluator.evalSubExprsIfNeeded(NULL);
// Largest multiple of 4*PacketSize not exceeding `size`: bound for the
// unrolled packet loop.
65 const Index UnrolledSize = (
size / (4 * PacketSize)) * 4 * PacketSize;
66 for (
Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
// Inner unroll factor of 4 packets per iteration (body elided).
67 for (
Index j = 0; j < 4; j++) {
// Largest multiple of PacketSize not exceeding `size`: bound for the
// non-unrolled packet loop.
71 const Index VectorizedSize = (
size / PacketSize) * PacketSize;
72 for (
Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
// Scalar tail: remaining coefficients that don't fill a packet.
75 for (
Index i = VectorizedSize; i <
size; ++i) {
86 #ifdef EIGEN_USE_THREADS
// EvalRange (generic, non-vectorized path): evaluates coefficients
// [first, last) one scalar at a time. Used as the per-task body by the
// thread-pool executor below.
// NOTE(review): the enclosing struct declaration and closing braces are
// elided in this chunk.
87 template <
typename Evaluator,
typename Index,
bool Vectorizable>
89 static void run(Evaluator* evaluator_in,
const Index first,
const Index last) {
// Copy the evaluator by value so each task works on its own instance
// (cheap copy is assumed here — TODO confirm Evaluator copy semantics).
90 Evaluator evaluator = *evaluator_in;
92 for (
Index i = first; i < last; ++i) {
93 evaluator.evalScalar(i);
// EvalRange specialization for Vectorizable == true: evaluates [first, last)
// using packets where possible, with a 4x-unrolled packet loop, a plain
// packet loop, and a scalar tail — mirroring the single-threaded vectorized
// executor above.
// NOTE(review): the declaration of the running index `i` and several braces
// are elided in this chunk.
102 template <
typename Evaluator,
typename Index>
103 struct EvalRange<Evaluator,
Index, true> {
106 static void run(Evaluator* evaluator_in,
const Index first,
const Index last) {
107 Evaluator evaluator = *evaluator_in;
// Only bother with packet evaluation if the range holds at least one packet.
110 if (last - first >= PacketSize) {
// Last offset at which a full group of 4 packets still fits.
112 Index last_chunk_offset = last - 4 * PacketSize;
116 for (; i <= last_chunk_offset; i += 4*PacketSize) {
117 for (
Index j = 0; j < 4; j++) {
118 evaluator.evalPacket(i + j * PacketSize);
// Last offset at which a single full packet still fits.
121 last_chunk_offset = last - PacketSize;
122 for (; i <= last_chunk_offset; i += PacketSize) {
123 evaluator.evalPacket(i);
// Scalar tail for the remaining coefficients.
126 for (; i < last; ++i) {
127 evaluator.evalScalar(i);
// alignBlockSize: rounds a requested block size up to a packet-friendly
// multiple — 4*PacketSize granularity for sizes of at least 16 packets,
// otherwise single-packet granularity. Relies on PacketSize being a power
// of two for the mask arithmetic (presumably guaranteed elsewhere — TODO
// confirm).
133 if (
size >= 16 * PacketSize) {
134 return (
size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
137 return (
size + PacketSize - 1) & ~(PacketSize - 1);
// Multi-threaded executor: partitions the coefficient range across the
// thread pool and dispatches EvalRange tasks.
// Two code paths, selected at compile time:
//   - non-simple pool: device.parallelFor with a cost model and
//     EvalRange::alignBlockSize as the block-size hint;
//   - EIGEN_USE_SIMPLE_THREAD_POOL: manual blocking + Barrier + enqueue.
// NOTE(review): several statements (Index/size declarations, the evaluator
// cleanup, closing braces) are elided in this chunk.
141 template <
typename Expression,
bool Vectorizable>
142 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
145 static inline void run(
const Expression& expr,
const ThreadPoolDevice& device)
147 typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
148 Evaluator evaluator(expr, device);
149 const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
153 #if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
// parallelFor decides the actual blocking from the per-coefficient cost;
// EvalRange::run is invoked for each [first, last) chunk.
154 device.parallelFor(
size, evaluator.costPerCoeff(Vectorizable),
155 EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
157 EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
// Simple-thread-pool fallback: compute thread count and block size by hand.
160 size_t num_threads = device.numThreads();
161 if (num_threads > 1) {
// Presumably refines num_threads from the cost model (callee elided) —
// TODO confirm against the full file.
163 size, evaluator.costPerCoeff(Vectorizable), num_threads);
165 if (num_threads == 1) {
// Block size: ceil(size / num_threads), then rounded so each block is a
// multiple of PacketSize (and at least one packet).
169 Index blocksz = std::ceil<Index>(
static_cast<float>(
size)/num_threads) + PacketSize - 1;
170 const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
171 const Index numblocks =
size / blocksize;
// Barrier waits for all enqueued blocks; the remainder (if any) is handled
// after the loop.
173 Barrier barrier(numblocks);
174 for (
int i = 0; i < numblocks; ++i) {
175 device.enqueue_with_barrier(
177 &evaluator, i * blocksize, (i + 1) * blocksize);
// Tail block: coefficients beyond numblocks * blocksize, evaluated on the
// calling thread (enqueue call partially elided here).
179 if (numblocks * blocksize <
size) {
181 &evaluator, numblocks * blocksize,
size);
185 #endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
190 #endif // EIGEN_USE_THREADS
194 #if defined(EIGEN_USE_GPU)
// GPU executor: declaration only here; the definition (CUDA kernel launch)
// appears further down, guarded by __CUDACC__ so host-only translation
// units can still see the declaration.
196 template <
typename Expression,
bool Vectorizable>
200 static void run(
const Expression& expr,
const GpuDevice& device);
204 #if defined(__CUDACC__)
// Device-side evaluation helper, scalar (non-vectorized) variant: each GPU
// thread strides through [first, last) by step_size evaluating one
// coefficient per iteration.
// NOTE(review): the static member function signature and loop body are
// elided in this chunk.
205 template <
typename Evaluator,
typename Index,
bool Vectorizable>
206 struct EigenMetaKernelEval {
209 for (
Index i = first; i < last; i += step_size) {
// Device-side evaluation helper, vectorized variant: threads evaluate whole
// packets over the packet-aligned prefix, then scalars over the tail.
// NOTE(review): function signature and loop bodies are elided in this chunk.
215 template <
typename Evaluator,
typename Index>
216 struct EigenMetaKernelEval<Evaluator,
Index, true> {
// Packet-aligned portion of the range and the per-thread stride expressed
// in coefficients (step_size packets per step).
220 const Index vectorized_size = (last / PacketSize) * PacketSize;
221 const Index vectorized_step_size = step_size * PacketSize;
// Packet loop: thread `first` starts at coefficient first * PacketSize.
224 for (
Index i = first * PacketSize; i < vectorized_size;
225 i += vectorized_step_size) {
// Scalar tail loop over the coefficients past the last full packet.
228 for (
Index i = vectorized_size + first; i < last; i += step_size) {
// CUDA kernel entry point: computes this thread's starting index and stride
// from the launch configuration, then dispatches to EigenMetaKernelEval.
// __launch_bounds__(1024) caps the block size the kernel may be launched
// with, letting the compiler tune register usage.
// NOTE(review): the __global__ qualifier and the dispatch call are elided
// in this chunk.
234 template <
typename Evaluator,
typename Index>
236 __launch_bounds__(1024)
237 EigenMetaKernel(Evaluator eval,
Index size) {
239 const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
240 const Index step_size = blockDim.x * gridDim.x;
// Vectorized path requires both packet access and aligned data.
242 const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
// Definition of the GPU executor's run(): sizes the launch from the device's
// occupancy numbers and launches EigenMetaKernel over all coefficients.
// NOTE(review): the needs_assign branch, the LAUNCH_CUDA_KERNEL macro line,
// and evaluator cleanup are elided in this chunk.
247 template <
typename Expression,
bool Vectorizable>
249 const Expression& expr,
const GpuDevice& device) {
250 TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
251 const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
// One block of the maximum supported size; grid sized to fill the device
// (at least one block, at most one block per resident-thread slot).
253 const int block_size = device.maxCudaThreadsPerBlock();
254 const int max_blocks = device.getNumCudaMultiProcessors() *
255 device.maxCudaThreadsPerMultiProcessor() / block_size;
258 const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(
size, block_size)), 1);
// Kernel launch: passes the evaluator by value and the total size.
261 (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>,
Index>),
262 num_blocks, block_size, 0, device, evaluator,
size);
268 #endif // EIGEN_USE_GPU
271 #ifdef EIGEN_USE_SYCL
// SYCL executor: only the template head and run() signature are visible in
// this chunk; the body (presumably a SYCL kernel submission) is elided.
273 template <
typename Expression,
bool Vectorizable>
276 static inline void run(
const Expression &expr,
const SyclDevice &device) {
288 #endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H