#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

namespace Eigen {
namespace internal {

#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
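// Full reducers for GPU, don't vectorize for now.

// Reducer function that enables multiple gpu threads to safely accumulate at the same
// output address. It reads the current value of the output variable, and attempts to
// update it with the new value. If in the meantime another gpu thread updated the content
// of the output address it will try again.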
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else {
    gpu_assert(0 && "Wordsize not supported");
  }
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
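// We extend atomicExch to support extra data types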
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}
template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
#ifdef EIGEN_HAS_GPU_FP16
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
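// The wide Packet4h2 accumulator is reduced lane by lane as four atomic half2 reductions,
// so the reducer is expected to be associative.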
template <typename R>
__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
  half2* houtput = reinterpret_cast<half2*>(output);
  half2* haccum = reinterpret_cast<half2*>(&accum);
  for (int i = 0; i < 4; ++i) {
    atomicReduce(houtput + i, *(haccum + i), reducer);
  }
}
#endif // EIGEN_HAS_GPU_FP16
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
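// Fills the output buffer with `val` using a grid-stride loop, so the reduction kernels
// below can assume every output slot starts from the reducer's initial value.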
template <typename CoeffType, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs,
                                                                  CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
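// Single-pass full reduction: each block reduces up to BlockSize * NumPerThread inputs,
// combines partial values within each warp via shuffles, and merges blocks into *output
// through atomicReduce. The semaphore serializes initialization of the shared output.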
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                  typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  } else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      } else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached.
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        } while (val < 2u);
      }
    }
  }

  __syncthreads();

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread * BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    // Cast before shuffling: hipcc otherwise reports the __shfl_down call as ambiguous for double.
    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
    } else {
      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
    }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore.
    atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
    __threadfence_system();
#endif
  }
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
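// Packs the trailing coefficients that do not fill a whole packet into the scratch slot,
// so the packet-wide full reduction can treat the remainder like a regular packet.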
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs,
    packet_traits<Eigen::half>::type* scratch) {
  typedef packet_traits<Eigen::half>::type packet_type;
  Index packet_remainder =
      num_coeffs % Index(unpacket_traits<packet_type>::size);
  if (packet_remainder != 0) {
    half2* h2scratch = reinterpret_cast<half2*>(scratch);
    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
      *h2scratch =
          __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
      h2scratch++;
    }
    if ((num_coeffs & 1) != 0) {
      half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
    }
  } else {
    *scratch = reducer.template initializePacket<packet_type>();
  }
}
template <typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input,
                                                                           Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  typedef typename packet_traits<Eigen::half>::type PacketType;

  const Index num_packets =
      num_coeffs / Index(unpacket_traits<PacketType>::size);
  PacketType* p_output = reinterpret_cast<PacketType*>(output);
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    p_output[i] = reducer.template initializePacket<PacketType>();
  }
  Index packet_remainder =
      num_coeffs % Index(unpacket_traits<PacketType>::size);
  if (thread_id < packet_remainder) {
    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
  }
}
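// Half-precision variant of the full reduction: accumulates into half packets, reduces
// within warps via shuffles, and folds the per-block results into the scratch packet.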
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs,
    half* output, packet_traits<Eigen::half>::type* scratch) {
  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const Index first_index =
      blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;

  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    if (first_index == 0) {
      int rem = num_coeffs % packet_width;
      if (rem != 0) {
        half2* p_scratch = reinterpret_cast<half2*>(scratch);
        *scratch = reducer.template initializePacket<PacketType>();
        for (int i = 0; i < rem / 2; i++) {
          *p_scratch = __halves2half2(
              input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
              input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
          p_scratch++;
        }
        if ((num_coeffs & 1) != 0) {
          half last = input.m_impl.coeff(num_coeffs - 1);
          *p_scratch = __halves2half2(last, reducer.initialize());
        }
      } else {
        *scratch = reducer.template initializePacket<PacketType>();
      }
    }
    __syncthreads();
  }

  PacketType accum = reducer.template initializePacket<PacketType>();
  const Index max_iter =
      numext::mini<Index>((num_coeffs - first_index) / packet_width,
                          NumPerThread * BlockSize / packet_width);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + packet_width * i;
    PacketType val = input.m_impl.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      // Workaround: route each half2 through an int for __shfl_down, which has no native half2 overload here.
      union { int i; half2 h; } wka_in, wka_out;
      wka_in.h = hacc[i];
      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
      hr[i] = wka_out.h;
    }
    reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down(hacc[i], offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(scratch, accum, reducer);
  }

  __syncthreads();
  half2* rv1 = reinterpret_cast<half2*>(scratch);
  if (packet_width > 2) {
    reducer.reducePacket(rv1[2], rv1);
    reducer.reducePacket(rv1[3], rv1 + 1);
    reducer.reducePacket(rv1[1], rv1);
  }
  if (gridDim.x == 1) {
    if (first_index == 0) {
      half tmp = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &tmp);
      *output = tmp;
    }
  }
}
template <typename Op>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output,
                                                                             packet_traits<Eigen::half>::type* scratch) {
  typedef packet_traits<Eigen::half>::type packet_type;
  half2* pscratch = reinterpret_cast<half2*>(scratch);
  half tmp = __float2half(0.f);
  for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
    reducer.reduce(__low2half(*pscratch), &tmp);
    reducer.reduce(__high2half(*pscratch), &tmp);
    pscratch++;
  }
  *output = tmp;
}

#endif // EIGEN_HAS_GPU_FP16
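// Dispatches full reductions to the kernels above. The unspecialized launcher rejects
// output types that have no optimized GPU path.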
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    gpu_assert(false && "Should only be called on doubles, floats and half floats");
  }
};
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
  }
};
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    typedef typename packet_traits<Eigen::half>::type PacketType;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    PacketType* scratch = static_cast<PacketType*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we can't be
      // sure that there won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                        1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                        1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif // EIGEN_HAS_GPU_FP16
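// FullReducer specialization for GpuDevice: only float, double and (with packet access)
// Eigen::half have an optimized GPU implementation.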
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
#ifdef EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
        self, reducer, device, output, num_coeffs);
  }
};
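// Reduces the innermost dimension: each block walks over chunks of blockDim.x * NumPerThread
// coefficients of a single output row, combines partial values within each warp through
// shuffles, and accumulates the per-warp results into output[row] with atomicReduce.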
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
                                                                   Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  typedef typename Self::CoeffReturnType Type;

  const int unroll_times = 16;

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        // Cast before shuffling: hipcc otherwise reports the __shfl_down call as ambiguous for double.
        if (std::is_floating_point<Type>::value) {
          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
        } else {
          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
        }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
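// Half-precision variant of the inner reduction: processes two output rows at a time,
// reading packet_width coefficients per step and falling back to half2 packing for the tail.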
#ifdef EIGEN_HAS_GPU_FP16

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
                                                                            Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                            half* output) {
  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const int unroll_times = 16 / packet_width;

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = packet_width * thread_id;
    for (; i + packet_width <= num_preserved_coeffs;
         i += packet_width * num_threads) {
      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
      *poutput = reducer.template initializePacket<PacketType>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin =
          packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col =
            col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + packet_width <= num_coeffs_to_reduce;
               col += blockDim.x) {
            const PacketType val1 = input.m_impl.template packet<Unaligned>(
                row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const PacketType val2 = input.m_impl.template packet<Unaligned>(
                (row + 1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            PacketType r1 = reducer.template initializePacket<PacketType>();
            PacketType r2 = reducer.template initializePacket<PacketType>();
            half2* hr1 = reinterpret_cast<half2*>(&r1);
            half2* hr2 = reinterpret_cast<half2*>(&r2);
            while (col + 1 < num_coeffs_to_reduce) {
              *hr1 = __halves2half2(
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
              *hr2 = __halves2half2(
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
              hr1++;
              hr2++;
              col += 2;
            }
            if (col < num_coeffs_to_reduce) {
              // Peel the last odd coefficient of each row.
              const half last1 =
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
              *hr1 = __halves2half2(last1, reducer.initialize());
              const half last2 =
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
              *hr2 = __halves2half2(last2, reducer.initialize());
            }
            reducer.reducePacket(r1, &reduced_val1);
            reducer.reducePacket(r2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
                                     row * num_coeffs_to_reduce + col),
                                 &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
                                     (row + 1) * num_coeffs_to_reduce + col),
                                 &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          // Workaround: route each half2 through an int for __shfl_down, which has no
          // native half2 overload here.
          union { int i; half2 h; } wka_in1, wka_out1;
          wka_in1.h = rv1[i];
          wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
          hr1[i] = wka_out1.h;

          union { int i; half2 h; } wka_in2, wka_out2;
          wka_in2.h = rv2[i];
          wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
          hr2[i] = wka_out2.h;
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#else
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] =
              __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
          hr2[i] =
              __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#endif
      }

      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
      half2 val;
      if (packet_width > 2) {
        reducer.reducePacket(rv1[2], rv1);
        reducer.reducePacket(rv1[3], rv1 + 1);
        reducer.reducePacket(rv1[1], rv1);
        reducer.reducePacket(rv2[2], rv2);
        reducer.reducePacket(rv2[3], rv2 + 1);
        reducer.reducePacket(rv2[1], rv2);
      }
      half val1 = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &val1);
      half val2 = __low2half(*rv2);
      reducer.reduce(__high2half(*rv2), &val2);
      val = __halves2half2(val1, val2);
      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce((half2*)loc, val, reducer);
      }
    }
  }
}

#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumGpuMultiProcessors() *
                             device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
                        num_blocks, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path.
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 128;
    const int num_per_thread = 64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                        1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif // EIGEN_HAS_GPU_FP16
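// InnerReducer specialization for GpuDevice: only float, double and (with packet access)
// Eigen::half have an optimized GPU implementation; other types take the generic path.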
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
#ifdef EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code when there isn't much to reduce.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
        self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
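// Reduces along the outer dimension: each thread accumulates up to NumPerThread rows of one
// preserved column, then folds its partial value into output[input_col] with atomicReduce.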
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input,
                                                                   Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }

    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
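// OuterReducer specialization for GpuDevice: the optimized run() overload takes float
// outputs; everything else falls through to the asserting fallback overload.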
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);

  template <typename Device, typename OutputType>
  static
#if !defined(EIGEN_HIPCC)
      // FIXME: marking this overload EIGEN_DEVICE_FUNC under hipcc triggers a runtime failure in the
      // reduction test, so the annotation is only applied on the CUDA path.
      EIGEN_DEVICE_FUNC
#endif
      bool run(const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code when there isn't much to reduce.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumGpuMultiProcessors() *
                             device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
                        num_blocks, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

}  // end namespace internal
}  // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H