#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H

namespace Eigen {
namespace internal {

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// Full reducers for GPU, don't vectorize for now

// Reducer function that enables multiple cuda threads to safely accumulate at the same
// output address. It basically reads the current value of the output variable, and
// attempts to update it with the new value. If in the meantime another cuda thread
// updated the content of the output address it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
#endif // EIGEN_HAS_CUDA_FP16
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if __CUDA_ARCH__ >= 300
  atomicAdd(output, accum);
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
template <typename CoeffType, typename Index>
__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if __CUDA_ARCH__ >= 300
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  }
  else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      }
      else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached.
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        } while (val < 2u);
      }
    }
  }

  __syncthreads();

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#ifdef EIGEN_HAS_CUDA_FP16

template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
  if (num_coeffs % 2 != 0) {
    half last = input.m_impl.coeff(num_coeffs-1);
    *scratch = __halves2half2(last, reducer.initialize());
  } else {
    *scratch = reducer.template initializePacket<half2>();
  }
}
template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  const Index num_packets = num_coeffs / 2;
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    ((half2*)output)[i] = reducer.template initializePacket<half2>();
  }

  if (thread_id == 0 && num_coeffs % 2 != 0) {
    output[num_coeffs-1] = reducer.initialize();
  }
}
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                             half* output, half2* scratch) {
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;

  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
  if (gridDim.x == 1 && first_index == 0) {
    if (num_coeffs % 2 != 0) {
      half last = input.m_impl.coeff(num_coeffs-1);
      *scratch = __halves2half2(last, reducer.initialize());
    } else {
      *scratch = reducer.template initializePacket<half2>();
    }
  }
  __syncthreads();

  half2 accum = reducer.template initializePacket<half2>();
  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + 2*i;
    half2 val = input.m_impl.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(scratch, accum, reducer);
  }

  __syncthreads();

  if (gridDim.x == 1 && first_index == 0) {
    half tmp = __low2half(*scratch);
    reducer.reduce(__high2half(*scratch), &tmp);
    *output = tmp;
  }
}
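// In the half-precision path each thread accumulates two fp16 values at a time in a
// half2 register. Odd-length inputs are padded with the reducer's identity value
// (reducer.initialize()), the per-block results are merged into *scratch, and the
// final half2 is collapsed into a single half by reducing its low and high halves,
// either above (single-block case) or in the cleanup kernel below.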
template <typename Op>
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
  half tmp = __low2half(*scratch);
  reducer.reduce(__high2half(*scratch), &tmp);
  *output = tmp;
}

#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    assert(false && "Should only be called on doubles, floats and half floats");
  }
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
  Self, Op, OutputType, PacketAccess,
  typename internal::enable_if<
    internal::is_same<float, OutputType>::value ||
    internal::is_same<double, OutputType>::value,
  void>::type> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    typedef typename Self::CoeffReturnType Scalar;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer,
                       self, num_coeffs, output, semaphore);
  }
};
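// Launch configuration: each block of FullReductionKernel covers
// block_size * num_per_thread = 256 * 128 = 32768 input coefficients, so for example
// a 1,000,000 element reduction runs in divup(1000000, 32768) = 31 blocks, and the
// semaphore protocol above merges the per-block partial results.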
#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
  }
};
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    half2* scratch = static_cast<half2*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we
      // can't be sure that there won't be a race condition between multiple thread blocks.
      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer,
                         self, num_coeffs, scratch);
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer,
                       self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                         1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't support well exotic types such as complex,
  // so reduce the scope of the optimized version of the code to the simple case
  // of floats, doubles and half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
  }
};
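// Usage sketch (illustrative only, not part of this header): a full reduction of a
// float tensor on a GpuDevice is what routes through FullReducer above. The device
// pointers d_in/d_out below are hypothetical.
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 128, 1024);
//   Eigen::TensorMap<Eigen::Tensor<float, 0> > out(d_out);
//   out.device(gpu_device) = in.sum();   // launches FullReductionKernel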
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
#if __CUDA_ARCH__ >= 300
  typedef typename Self::CoeffReturnType Type;

  const int unroll_times = 16;

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#ifdef EIGEN_HAS_CUDA_FP16

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                              half* output) {
  const int unroll_times = 16;

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = 2*thread_id;
    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
      half* loc = output + i;
      *((half2*)loc) = reducer.template initializePacket<half2>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      half2 reduced_val1 = reducer.template initializePacket<half2>();
      half2 reduced_val2 = reducer.template initializePacket<half2>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            // Peel the last, odd column: pad it with the reducer's identity value.
            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            const half2 val1 = __halves2half2(last1, reducer.initialize());
            reducer.reducePacket(val1, &reduced_val1);
            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
            const half2 val2 = __halves2half2(last2, reducer.initialize());
            reducer.reducePacket(val2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * 2;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col), &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
      }

      half val1 = __low2half(reduced_val1);
      reducer.reduce(__high2half(reduced_val1), &val1);
      half val2 = __low2half(reduced_val2);
      reducer.reduce(__high2half(reduced_val2), &val2);
      half2 val = __halves2half2(val1, val2);

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce((half2*)loc, val, reducer);
      }
    }
  }
}

#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
  Self, Op, OutputType, PacketAccess,
  typename internal::enable_if<
    internal::is_same<float, OutputType>::value ||
    internal::is_same<double, OutputType>::value,
  void>::type> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer,
                       self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
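// When the reduction runs in more than one block, the outputs are pre-initialized by a
// separate ReductionInitKernel launch; with a single block the kernel can safely
// initialize them itself (the gridDim.x == 1 branch in the kernel), so the extra launch
// is skipped.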
#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 128;
    const int num_per_thread = 64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer,
                         self, num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer,
                       self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support well exotic types such as complex,
  // so reduce the scope of the optimized version of the code to the simple case
  // of floats, doubles and half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code path if the number of coefficients to reduce is small.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
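// Returning true from run() signals that the optimized kernel declined the job (empty
// input, or 128 or fewer values to reduce per output) and that the generic,
// non-GPU-specialized evaluation path should be used instead.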
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }
    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
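// OuterReductionKernel reduces along the outer dimension: consecutive threads handle
// consecutive preserved columns, so the loads of
// input.m_impl.coeff(j * num_preserved_coeffs + input_col) are coalesced across a warp;
// each thread accumulates NumPerThread rows locally and then merges into
// output[input_col] with atomicReduce.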
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support well exotic types such as complex,
  // so reduce the scope of the optimized version of the code to the simple case
  // of floats.
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;

  template <typename Device, typename OutputType>
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    // It's faster to use the usual code path if the number of coefficients to reduce is small.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer,
                       self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif // defined(EIGEN_USE_GPU) && defined(__CUDACC__)

} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H