10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H 11 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H 17 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) 24 template <
typename T,
typename R>
26 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) 29 unsigned int oldval = *
reinterpret_cast<unsigned int*
>(output);
30 unsigned int newval = oldval;
31 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
32 if (newval == oldval) {
35 unsigned int readback;
36 while ((readback = atomicCAS((
unsigned int*)output, oldval, newval)) != oldval) {
39 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
40 if (newval == oldval) {
45 else if (
sizeof(
T) == 8) {
46 unsigned long long oldval = *
reinterpret_cast<unsigned long long*
>(output);
47 unsigned long long newval = oldval;
48 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
49 if (newval == oldval) {
52 unsigned long long readback;
53 while ((readback = atomicCAS((
unsigned long long*)output, oldval, newval)) != oldval) {
56 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
57 if (newval == oldval) {
63 gpu_assert(0 &&
"Wordsize not supported");
65 #else // EIGEN_CUDA_ARCH >= 300 66 gpu_assert(0 &&
"Shouldn't be called on unsupported device");
67 #endif // EIGEN_CUDA_ARCH >= 300 71 template <
typename Type>
72 __device__
inline Type atomicExchCustom(
Type* address,
Type val) {
73 return atomicExch(address, val);
77 __device__
inline double atomicExchCustom(
double* address,
double val) {
78 unsigned long long int* address_as_ull =
reinterpret_cast<unsigned long long int*
>(address);
79 return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
82 #ifdef EIGEN_HAS_GPU_FP16 84 __device__
inline void atomicReduce(half2* output, half2 accum,
R& reducer) {
85 unsigned int oldval = *
reinterpret_cast<unsigned int*
>(output);
86 unsigned int newval = oldval;
87 reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
88 if (newval == oldval) {
91 unsigned int readback;
92 while ((readback = atomicCAS((
unsigned int*)output, oldval, newval)) != oldval) {
95 reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
96 if (newval == oldval) {
102 template <
typename R>
103 __device__
inline void atomicReduce(Packet4h2* output, Packet4h2 accum,
R& reducer) {
104 half2* houtput=
reinterpret_cast<half2*
>(output);
105 half2* haccum=
reinterpret_cast<half2*
>(&accum);
106 for(
int i=0;
i<4;++
i){
107 atomicReduce(houtput+
i,*(haccum+
i),reducer);
110 #endif // EIGEN_HAS_GPU_FP16 113 __device__
inline void atomicReduce(
float* output,
float accum, SumReducer<float>&) {
114 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) 115 atomicAdd(output, accum);
116 #else // EIGEN_CUDA_ARCH >= 300 117 gpu_assert(0 &&
"Shouldn't be called on unsupported device");
118 #endif // EIGEN_CUDA_ARCH >= 300 122 template <
typename CoeffType,
typename Index>
126 for (
Index i = thread_id;
i < num_preserved_coeffs;
i += num_threads) {
132 template <
int BlockSize,
int NumPerThread,
typename Self,
133 typename Reducer,
typename Index>
135 typename Self::CoeffReturnType* output,
unsigned int* semaphore) {
136 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) 139 if (gridDim.x == 1) {
140 if (first_index == 0) {
141 *output = reducer.initialize();
146 unsigned int block = atomicCAS(semaphore, 0u, 1u);
149 atomicExchCustom(output, reducer.initialize());
151 atomicExch(semaphore, 2u);
158 val = atomicCAS(semaphore, 2u, 2u);
169 typename Self::CoeffReturnType accum = reducer.initialize();
170 Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
171 for (
Index i = 0;
i < max_iter;
i+=BlockSize) {
172 const Index index = first_index +
i;
174 typename Self::CoeffReturnType val = input.m_impl.coeff(index);
175 reducer.reduce(val, &accum);
180 #if defined(EIGEN_HIPCC) 185 reducer.reduce(__shfl_down(static_cast<float>(accum),
offset, warpSize), &accum);
187 reducer.reduce(__shfl_down(static_cast<int>(accum),
offset, warpSize), &accum);
189 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 190 reducer.reduce(__shfl_down(accum,
offset, warpSize), &accum);
192 reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum,
offset, warpSize), &accum);
196 if ((
threadIdx.x & (warpSize - 1)) == 0) {
197 atomicReduce(output, accum, reducer);
202 atomicInc(semaphore, gridDim.x + 1);
203 #if defined(EIGEN_HIPCC) 204 __threadfence_system();
207 #else // EIGEN_CUDA_ARCH >= 300 208 gpu_assert(0 &&
"Shouldn't be called on unsupported device");
209 #endif // EIGEN_CUDA_ARCH >= 300 213 #ifdef EIGEN_HAS_GPU_FP16 214 template <
typename Self,
215 typename Reducer,
typename Index>
221 Index packet_remainder =
223 if (packet_remainder != 0) {
224 half2* h2scratch =
reinterpret_cast<half2*
>(scratch);
225 for (
Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
227 __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
230 if ((num_coeffs & 1) != 0) {
231 half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
232 *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
235 *scratch = reducer.template initializePacket<packet_type>();
239 template <
typename Self,
240 typename Reducer,
typename Index>
246 const Index num_packets =
248 PacketType* p_output =
reinterpret_cast<PacketType*
>(output);
249 for (
Index i = thread_id; i < num_packets; i += num_threads) {
250 p_output[
i] = reducer.template initializePacket<PacketType>();
252 Index packet_remainder =
254 if (thread_id < packet_remainder) {
255 output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
259 template <
int BlockSize,
int NumPerThread,
typename Self,
260 typename Reducer,
typename Index>
266 const Index first_index =
271 if (gridDim.x == 1) {
272 if (first_index == 0) {
273 int rem = num_coeffs % packet_width;
275 half2* p_scratch =
reinterpret_cast<half2*
>(scratch);
276 *scratch = reducer.template initializePacket<PacketType>();
277 for (
int i = 0; i < rem / 2; i++) {
278 *p_scratch = __halves2half2(
279 input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
280 input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
283 if ((num_coeffs & 1) != 0) {
284 half
last = input.m_impl.coeff(num_coeffs - 1);
285 *p_scratch = __halves2half2(last, reducer.initialize());
288 *scratch = reducer.template initializePacket<PacketType>();
294 PacketType accum = reducer.template initializePacket<PacketType>();
295 const Index max_iter =
296 numext::mini<Index>((num_coeffs - first_index) / packet_width,
297 NumPerThread * BlockSize / packet_width);
298 for (
Index i = 0; i < max_iter; i += BlockSize) {
299 const Index index = first_index + packet_width *
i;
301 PacketType val = input.m_impl.template packet<Unaligned>(index);
302 reducer.reducePacket(val, &accum);
307 #if defined(EIGEN_HIPCC) 309 half2* hr =
reinterpret_cast<half2*
>(&
r1);
310 half2* hacc =
reinterpret_cast<half2*
>(&accum);
311 for (
int i = 0; i < packet_width / 2; i++) {
313 union {
int i; half2
h; } wka_in, wka_out;
315 wka_out.i = __shfl_down(wka_in.i,
offset, warpSize);
318 reducer.reducePacket(r1, &accum);
319 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 321 half2* hr =
reinterpret_cast<half2*
>(&
r1);
322 half2* hacc =
reinterpret_cast<half2*
>(&accum);
323 for (
int i = 0; i < packet_width / 2; i++) {
324 hr[
i] = __shfl_down(hacc[i],
offset, warpSize);
326 reducer.reducePacket(r1, &accum);
329 half2* hr =
reinterpret_cast<half2*
>(&
r1);
330 half2* hacc =
reinterpret_cast<half2*
>(&accum);
331 for (
int i = 0; i < packet_width / 2; i++) {
332 hr[
i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (
unsigned)
offset, warpSize);
334 reducer.reducePacket(r1, &accum);
339 if ((
threadIdx.x & (warpSize - 1)) == 0) {
340 atomicReduce(scratch, accum, reducer);
344 half2* rv1 =
reinterpret_cast<half2*
>(scratch);
345 if (packet_width > 2) {
346 reducer.reducePacket(rv1[2], rv1);
347 reducer.reducePacket(rv1[3], rv1 + 1);
348 reducer.reducePacket(rv1[1], rv1);
350 if (gridDim.x == 1) {
351 if (first_index == 0) {
352 half tmp = __low2half(*rv1);
353 reducer.reduce(__high2half(*rv1), &tmp);
359 template <
typename Op>
362 half2* pscratch =
reinterpret_cast<half2*
>(scratch);
363 half tmp = __float2half(0.
f);
366 reducer.reduce(__low2half(*pscratch), &tmp);
367 reducer.reduce(__high2half(*pscratch), &tmp);
373 #endif // EIGEN_HAS_GPU_FP16 375 template <
typename Self,
typename Op,
typename OutputType,
bool PacketAccess,
typename Enabled =
void>
376 struct FullReductionLauncher {
377 static void run(
const Self&, Op&,
const GpuDevice&, OutputType*,
typename Self::Index) {
378 gpu_assert(
false &&
"Should only be called on doubles, floats and half floats");
383 template <
typename Self,
typename Op,
typename OutputType,
bool PacketAccess>
384 struct FullReductionLauncher<
385 Self, Op, OutputType, PacketAccess,
387 internal::is_same<float, OutputType>::value ||
388 internal::is_same<double, OutputType>::value,
390 static void run(
const Self&
self, Op& reducer,
const GpuDevice& device, OutputType* output,
typename Self::Index num_coeffs) {
393 const int block_size = 256;
394 const int num_per_thread = 128;
395 const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
397 unsigned int* semaphore =
NULL;
398 if (num_blocks > 1) {
399 semaphore = device.semaphore();
402 LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
403 num_blocks, block_size, 0, device, reducer,
self, num_coeffs, output, semaphore);
407 #ifdef EIGEN_HAS_GPU_FP16 408 template <
typename Self,
typename Op>
409 struct FullReductionLauncher<Self, Op,
Eigen::half, false> {
410 static void run(
const Self&, Op&,
const GpuDevice&, half*,
typename Self::Index) {
411 gpu_assert(
false &&
"Should not be called since there is no packet accessor");
415 template <
typename Self,
typename Op>
416 struct FullReductionLauncher<Self, Op,
Eigen::half, true> {
417 static void run(
const Self&
self, Op& reducer,
const GpuDevice& device, half* output,
typename Self::Index num_coeffs) {
421 const int block_size = 256;
422 const int num_per_thread = 128;
423 const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
424 PacketType* scratch =
static_cast<PacketType*
>(device.scratchpad());
427 if (num_blocks > 1) {
430 LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
431 1, 1, 0, device, reducer,
self, num_coeffs, scratch);
434 LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
435 num_blocks, block_size, 0, device, reducer,
self, num_coeffs, output, scratch);
437 if (num_blocks > 1) {
438 LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
439 1, 1, 0, device, reducer, output, scratch);
443 #endif // EIGEN_HAS_GPU_FP16 446 template <
typename Self,
typename Op,
bool Vectorizable>
447 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
451 #ifdef EIGEN_HAS_GPU_FP16 452 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
456 #else // EIGEN_HAS_GPU_FP16 457 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
460 #endif // EIGEN_HAS_GPU_FP16 462 template <
typename OutputType>
463 static void run(
const Self&
self, Op& reducer,
const GpuDevice& device, OutputType* output) {
464 gpu_assert(HasOptimizedImplementation &&
"Should only be called on doubles, floats or half floats");
465 const Index num_coeffs =
array_prod(
self.m_impl.dimensions());
467 if (num_coeffs == 0) {
471 FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>
::run(
self, reducer, device, output, num_coeffs);
476 template <
int NumPerThread,
typename Self,
477 typename Reducer,
typename Index>
478 __global__
EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer,
const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
479 typename Self::CoeffReturnType* output) {
480 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) 481 typedef typename Self::CoeffReturnType
Type;
487 const int unroll_times = 16;
490 const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce,
blockDim.x * NumPerThread);
491 const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
493 const Index num_threads =
blockDim.x * gridDim.x;
497 if (gridDim.x == 1) {
498 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
499 output[
i] = reducer.initialize();
504 for (Index i =
blockIdx.x; i < num_input_blocks; i += gridDim.x) {
505 const Index
row = i / input_col_blocks;
507 if (row < num_preserved_coeffs) {
508 const Index col_block = i % input_col_blocks;
511 Type reduced_val = reducer.initialize();
513 for (Index
j = 0;
j < NumPerThread;
j += unroll_times) {
514 const Index last_col = col_begin +
blockDim.x * (
j + unroll_times - 1);
515 if (last_col >= num_coeffs_to_reduce) {
517 const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce +
col);
518 reducer.reduce(val, &reduced_val);
524 for (
int k = 0; k < unroll_times; ++k) {
526 reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
533 #if defined(EIGEN_HIPCC) 538 reducer.reduce(__shfl_down(static_cast<float>(reduced_val),
offset), &reduced_val);
540 reducer.reduce(__shfl_down(static_cast<int>(reduced_val),
offset), &reduced_val);
542 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 543 reducer.reduce(__shfl_down(reduced_val,
offset), &reduced_val);
545 reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val,
offset), &reduced_val);
549 if ((
threadIdx.x & (warpSize - 1)) == 0) {
550 atomicReduce(&(output[row]), reduced_val, reducer);
554 #else // EIGEN_CUDA_ARCH >= 300 555 gpu_assert(0 &&
"Shouldn't be called on unsupported device");
556 #endif // EIGEN_CUDA_ARCH >= 300 559 #ifdef EIGEN_HAS_GPU_FP16 561 template <
int NumPerThread,
typename Self,
562 typename Reducer,
typename Index>
563 __global__
EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer,
const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
572 const int unroll_times = 16 / packet_width;
576 const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce,
blockDim.x * NumPerThread * 2);
577 const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
579 const Index num_threads =
blockDim.x * gridDim.x;
583 if (gridDim.x == 1) {
584 Index i = packet_width * thread_id;
585 for (; i + packet_width <= num_preserved_coeffs;
586 i += packet_width * num_threads) {
587 PacketType* poutput =
reinterpret_cast<PacketType*
>(output +
i);
588 *poutput = reducer.template initializePacket<PacketType>();
590 if (i < num_preserved_coeffs) {
591 output[
i] = reducer.initialize();
596 for (Index i =
blockIdx.x; i < num_input_blocks; i += gridDim.x) {
597 const Index row = 2 * (i / input_col_blocks);
599 if (row + 1 < num_preserved_coeffs) {
600 const Index col_block = i % input_col_blocks;
601 const Index col_begin =
604 PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
605 PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
607 for (Index
j = 0;
j < NumPerThread;
j += unroll_times) {
608 const Index last_col =
609 col_begin +
blockDim.x * (
j + unroll_times - 1) * packet_width;
610 if (last_col >= num_coeffs_to_reduce) {
612 for (; col + packet_width <= num_coeffs_to_reduce;
614 const PacketType val1 = input.m_impl.template packet<Unaligned>(
615 row * num_coeffs_to_reduce +
col);
616 reducer.reducePacket(val1, &reduced_val1);
617 const PacketType val2 = input.m_impl.template packet<Unaligned>(
618 (row + 1) * num_coeffs_to_reduce + col);
619 reducer.reducePacket(val2, &reduced_val2);
621 if (col < num_coeffs_to_reduce) {
622 PacketType r1 = reducer.template initializePacket<PacketType>();
623 PacketType
r2 = reducer.template initializePacket<PacketType>();
624 half2* hr1 =
reinterpret_cast<half2*
>(&
r1);
625 half2* hr2 =
reinterpret_cast<half2*
>(&
r2);
626 while (col + 1 < num_coeffs_to_reduce) {
627 *hr1 = __halves2half2(
628 input.m_impl.coeff(row * num_coeffs_to_reduce + col),
629 input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
630 *hr2 = __halves2half2(
631 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
632 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
638 if (col < num_coeffs_to_reduce) {
641 input.m_impl.coeff(row * num_coeffs_to_reduce + col);
642 *hr1 = __halves2half2(last1, reducer.initialize());
644 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
645 *hr2 = __halves2half2(last2, reducer.initialize());
647 reducer.reducePacket(r1, &reduced_val1);
648 reducer.reducePacket(r2, &reduced_val2);
654 for (
int k = 0; k < unroll_times; ++k) {
655 const Index col = col_begin +
blockDim.x * (
j + k) * packet_width;
656 reducer.reducePacket(input.m_impl.template packet<Unaligned>(
657 row * num_coeffs_to_reduce + col),
659 reducer.reducePacket(input.m_impl.template packet<Unaligned>(
660 (row + 1) * num_coeffs_to_reduce + col),
668 #if defined(EIGEN_HIPCC) 671 half2* hr1 =
reinterpret_cast<half2*
>(&
r1);
672 half2* hr2 =
reinterpret_cast<half2*
>(&
r2);
673 half2* rv1 =
reinterpret_cast<half2*
>(&reduced_val1);
674 half2* rv2 =
reinterpret_cast<half2*
>(&reduced_val2);
675 for (
int i = 0; i < packet_width / 2; i++) {
677 union {
int i; half2
h; } wka_in1, wka_out1;
679 wka_out1.i = __shfl_down(wka_in1.i,
offset, warpSize);
682 union {
int i; half2
h; } wka_in2, wka_out2;
684 wka_out2.i = __shfl_down(wka_in2.i,
offset, warpSize);
687 reducer.reducePacket(r1, &reduced_val1);
688 reducer.reducePacket(r2, &reduced_val2);
689 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 692 half2* hr1 =
reinterpret_cast<half2*
>(&
r1);
693 half2* hr2 =
reinterpret_cast<half2*
>(&
r2);
694 half2* rv1 =
reinterpret_cast<half2*
>(&reduced_val1);
695 half2* rv2 =
reinterpret_cast<half2*
>(&reduced_val2);
696 for (
int i = 0; i < packet_width / 2; i++) {
697 hr1[
i] = __shfl_down(rv1[i],
offset, warpSize);
698 hr2[
i] = __shfl_down(rv2[i],
offset, warpSize);
700 reducer.reducePacket(r1, &reduced_val1);
701 reducer.reducePacket(r2, &reduced_val2);
705 half2* hr1 =
reinterpret_cast<half2*
>(&
r1);
706 half2* hr2 =
reinterpret_cast<half2*
>(&
r2);
707 half2* rr1 =
reinterpret_cast<half2*
>(&reduced_val1);
708 half2* rr2 =
reinterpret_cast<half2*
>(&reduced_val2);
709 for (
int i = 0; i < packet_width / 2; i++) {
711 __shfl_down_sync(0xFFFFFFFF, rr1[i], (
unsigned)
offset, warpSize);
713 __shfl_down_sync(0xFFFFFFFF, rr2[i], (
unsigned)
offset, warpSize);
715 reducer.reducePacket(r1, &reduced_val1);
716 reducer.reducePacket(r2, &reduced_val2);
720 half2* rv1 =
reinterpret_cast<half2*
>(&reduced_val1);
721 half2* rv2 =
reinterpret_cast<half2*
>(&reduced_val2);
723 if (packet_width > 2) {
724 reducer.reducePacket(rv1[2], rv1);
725 reducer.reducePacket(rv1[3], rv1 + 1);
726 reducer.reducePacket(rv1[1], rv1);
727 reducer.reducePacket(rv2[2], rv2);
728 reducer.reducePacket(rv2[3], rv2 + 1);
729 reducer.reducePacket(rv2[1], rv2);
731 half val1 = __low2half(*rv1);
732 reducer.reduce(__high2half(*rv1), &val1);
733 half val2 = __low2half(*rv2);
734 reducer.reduce(__high2half(*rv2), &val2);
735 val = __halves2half2(val1, val2);
736 if ((
threadIdx.x & (warpSize - 1)) == 0) {
738 atomicReduce((half2*)loc, val, reducer);
744 #endif // EIGEN_HAS_GPU_FP16 746 template <
typename Self,
typename Op,
typename OutputType,
bool PacketAccess,
typename Enabled =
void>
747 struct InnerReductionLauncher {
749 gpu_assert(
false &&
"Should only be called to reduce doubles, floats and half floats on a gpu device");
755 template <
typename Self,
typename Op,
typename OutputType,
bool PacketAccess>
756 struct InnerReductionLauncher<
757 Self, Op, OutputType, PacketAccess,
759 internal::is_same<float, OutputType>::value ||
760 internal::is_same<double, OutputType>::value,
762 static bool run(
const Self&
self, Op& reducer,
const GpuDevice& device, OutputType* output,
typename Self::Index num_coeffs_to_reduce,
typename Self::Index num_preserved_vals) {
765 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
766 const int block_size = 256;
767 const int num_per_thread = 128;
768 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
769 const int max_blocks = device.getNumGpuMultiProcessors() *
770 device.maxGpuThreadsPerMultiProcessor() / block_size;
771 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
773 if (num_blocks > 1) {
776 const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
777 const int max_blocks = device.getNumGpuMultiProcessors() *
778 device.maxGpuThreadsPerMultiProcessor() / 1024;
779 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
780 LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
781 num_blocks, 1024, 0, device, reducer.initialize(),
782 num_preserved_vals, output);
785 LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
786 num_blocks, block_size, 0, device, reducer,
self, num_coeffs_to_reduce, num_preserved_vals, output);
792 #ifdef EIGEN_HAS_GPU_FP16 793 template <
typename Self,
typename Op>
794 struct InnerReductionLauncher<Self, Op,
Eigen::half, false> {
796 gpu_assert(
false &&
"Should not be called since there is no packet accessor");
801 template <
typename Self,
typename Op>
802 struct InnerReductionLauncher<Self, Op,
Eigen::half, true> {
803 static bool run(
const Self&
self, Op& reducer,
const GpuDevice& device, half* output,
typename Self::Index num_coeffs_to_reduce,
typename Self::Index num_preserved_vals) {
806 if (num_preserved_vals % 2 != 0) {
811 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
812 const int block_size = 128;
813 const int num_per_thread = 64;
814 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
815 const int max_blocks = device.getNumGpuMultiProcessors() *
816 device.maxGpuThreadsPerMultiProcessor() / block_size;
817 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
819 if (num_blocks > 1) {
822 LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
823 1, 1, 0, device, reducer,
self, num_preserved_vals, output);
826 LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
827 num_blocks, block_size, 0, device, reducer,
self, num_coeffs_to_reduce, num_preserved_vals, output);
832 #endif // EIGEN_HAS_GPU_FP16 835 template <
typename Self,
typename Op>
836 struct InnerReducer<Self, Op, GpuDevice> {
840 #ifdef EIGEN_HAS_GPU_FP16 841 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
845 #else // EIGEN_HAS_GPU_FP16 846 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
849 #endif // EIGEN_HAS_GPU_FP16 851 template <
typename OutputType>
852 static bool run(
const Self&
self, Op& reducer,
const GpuDevice& device, OutputType* output,
typename Self::Index num_coeffs_to_reduce,
typename Self::Index num_preserved_vals) {
853 gpu_assert(HasOptimizedImplementation &&
"Should only be called on doubles, floats or half floats");
854 const Index num_coeffs =
array_prod(
self.m_impl.dimensions());
856 if (num_coeffs == 0) {
860 if (num_coeffs_to_reduce <= 128) {
864 return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>
::run(
self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
868 template <
int NumPerThread,
typename Self,
869 typename Reducer,
typename Index>
870 __global__
EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer,
const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
871 typename Self::CoeffReturnType* output) {
872 const Index num_threads =
blockDim.x * gridDim.x;
875 if (gridDim.x == 1) {
876 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
877 output[
i] = reducer.initialize();
883 const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
884 for (Index i = thread_id; i < max_iter; i += num_threads) {
885 const Index input_col = i % num_preserved_coeffs;
886 const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
887 typename Self::CoeffReturnType reduced_val = reducer.initialize();
888 const Index max_row =
numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
889 for (Index
j = input_row;
j < max_row;
j++) {
890 typename Self::CoeffReturnType val = input.m_impl.coeff(
j * num_preserved_coeffs + input_col);
891 reducer.reduce(val, &reduced_val);
893 atomicReduce(&(output[input_col]), reduced_val, reducer);
898 template <
typename Self,
typename Op>
899 struct OuterReducer<Self, Op, GpuDevice> {
903 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
906 template <
typename Device,
typename OutputType>
908 #if !defined(EIGEN_HIPCC) 921 gpu_assert(
false &&
"Should only be called to reduce doubles or floats on a gpu device");
925 static bool run(
const Self&
self, Op& reducer,
const GpuDevice& device,
float* output,
typename Self::Index num_coeffs_to_reduce,
typename Self::Index num_preserved_vals) {
929 if (num_coeffs_to_reduce <= 32) {
933 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
934 const int block_size = 256;
935 const int num_per_thread = 16;
936 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
937 const int max_blocks = device.getNumGpuMultiProcessors() *
938 device.maxGpuThreadsPerMultiProcessor() / block_size;
939 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
941 if (num_blocks > 1) {
944 const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
945 const int max_blocks = device.getNumGpuMultiProcessors() *
946 device.maxGpuThreadsPerMultiProcessor() / 1024;
947 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
948 LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
949 num_blocks, 1024, 0, device, reducer.initialize(),
950 num_preserved_vals, output);
953 LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
954 num_blocks, block_size, 0, device, reducer,
self, num_coeffs_to_reduce, num_preserved_vals, output);
960 #endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) 966 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H #define EIGEN_ALWAYS_INLINE
#define EIGEN_HIP_LAUNCH_BOUNDS_1024
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes< Indices... > &)
m m block(1, 0, 2, 2)<< 4
Rot2 R(Rot2::fromAngle(0.1))
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Namespace containing all symbols from the Eigen library.
static const symbolic::SymbolExpr< internal::symbolic_last_tag > last
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
#define EIGEN_DEVICE_FUNC