10 #ifndef EIGEN_PACKET_MATH_GPU_H
11 #define EIGEN_PACKET_MATH_GPU_H
18 #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
19 #define EIGEN_GPU_HAS_LDG 1
23 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
24 #define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
27 #if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
28 #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
34 #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
36 template<>
struct is_arithmetic<float4> {
enum {
value =
true }; };
37 template<>
struct is_arithmetic<
double2> {
enum {
value =
true }; };
39 template<>
struct packet_traits<
float> : default_packet_traits
75 template<>
struct packet_traits<double> : default_packet_traits
114 return make_float4(from, from, from, from);
117 return make_double2(from, from);
123 #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
128 return __int_as_float(__float_as_int(
a) & __float_as_int(
b));
132 return __longlong_as_double(__double_as_longlong(
a) &
133 __double_as_longlong(
b));
138 return __int_as_float(__float_as_int(
a) | __float_as_int(
b));
142 return __longlong_as_double(__double_as_longlong(
a) |
143 __double_as_longlong(
b));
148 return __int_as_float(__float_as_int(
a) ^ __float_as_int(
b));
152 return __longlong_as_double(__double_as_longlong(
a) ^
153 __double_as_longlong(
b));
158 return __int_as_float(__float_as_int(
a) & ~__float_as_int(
b));
162 return __longlong_as_double(__double_as_longlong(
a) &
163 ~__double_as_longlong(
b));
167 return __int_as_float(
a ==
b ? 0xffffffffu : 0u);
171 return __longlong_as_double(
a ==
b ? 0xffffffffffffffffull : 0ull);
176 return __int_as_float(
a <
b ? 0xffffffffu : 0u);
180 return __longlong_as_double(
a <
b ? 0xffffffffffffffffull : 0ull);
188 return make_float4(bitwise_and(
a.x,
b.x), bitwise_and(
a.y,
b.y),
189 bitwise_and(
a.z,
b.z), bitwise_and(
a.w,
b.w));
194 return make_double2(bitwise_and(
a.x,
b.x), bitwise_and(
a.y,
b.y));
200 return make_float4(bitwise_or(
a.x,
b.x), bitwise_or(
a.y,
b.y),
201 bitwise_or(
a.z,
b.z), bitwise_or(
a.w,
b.w));
206 return make_double2(bitwise_or(
a.x,
b.x), bitwise_or(
a.y,
b.y));
212 return make_float4(bitwise_xor(
a.x,
b.x), bitwise_xor(
a.y,
b.y),
213 bitwise_xor(
a.z,
b.z), bitwise_xor(
a.w,
b.w));
218 return make_double2(bitwise_xor(
a.x,
b.x), bitwise_xor(
a.y,
b.y));
224 return make_float4(bitwise_andnot(
a.x,
b.x), bitwise_andnot(
a.y,
b.y),
225 bitwise_andnot(
a.z,
b.z), bitwise_andnot(
a.w,
b.w));
230 return make_double2(bitwise_andnot(
a.x,
b.x), bitwise_andnot(
a.y,
b.y));
236 return make_float4(eq_mask(
a.x,
b.x), eq_mask(
a.y,
b.y), eq_mask(
a.z,
b.z),
242 return make_float4(lt_mask(
a.x,
b.x), lt_mask(
a.y,
b.y), lt_mask(
a.z,
b.z),
248 return make_double2(eq_mask(
a.x,
b.x), eq_mask(
a.y,
b.y));
253 return make_double2(lt_mask(
a.x,
b.x), lt_mask(
a.y,
b.y));
255 #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
258 return make_float4(
a,
a+1,
a+2,
a+3);
261 return make_double2(
a,
a+1);
265 return make_float4(
a.x+
b.x,
a.y+
b.y,
a.z+
b.z,
a.w+
b.w);
268 return make_double2(
a.x+
b.x,
a.y+
b.y);
272 return make_float4(
a.x-
b.x,
a.y-
b.y,
a.z-
b.z,
a.w-
b.w);
275 return make_double2(
a.x-
b.x,
a.y-
b.y);
279 return make_float4(-
a.x, -
a.y, -
a.z, -
a.w);
282 return make_double2(-
a.x, -
a.y);
289 return make_float4(
a.x*
b.x,
a.y*
b.y,
a.z*
b.z,
a.w*
b.w);
292 return make_double2(
a.x*
b.x,
a.y*
b.y);
296 return make_float4(
a.x/
b.x,
a.y/
b.y,
a.z/
b.z,
a.w/
b.w);
299 return make_double2(
a.x/
b.x,
a.y/
b.y);
303 return make_float4(fminf(
a.x,
b.x), fminf(
a.y,
b.y), fminf(
a.z,
b.z), fminf(
a.w,
b.w));
310 return make_float4(fmaxf(
a.x,
b.x), fmaxf(
a.y,
b.y), fmaxf(
a.z,
b.z), fmaxf(
a.w,
b.w));
317 return *
reinterpret_cast<const float4*
>(from);
321 return *
reinterpret_cast<const double2*
>(from);
325 return make_float4(from[0], from[1], from[2], from[3]);
328 return make_double2(from[0], from[1]);
332 return make_float4(from[0], from[0], from[1], from[1]);
335 return make_double2(from[0], from[0]);
339 *
reinterpret_cast<float4*
>(to) = from;
343 *
reinterpret_cast<double2*
>(to) = from;
360 #if defined(EIGEN_GPU_HAS_LDG)
361 return __ldg((
const float4*)from);
363 return make_float4(from[0], from[1], from[2], from[3]);
368 #if defined(EIGEN_GPU_HAS_LDG)
369 return __ldg((
const double2*)from);
371 return make_double2(from[0], from[1]);
377 #if defined(EIGEN_GPU_HAS_LDG)
378 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
380 return make_float4(from[0], from[1], from[2], from[3]);
385 #if defined(EIGEN_GPU_HAS_LDG)
386 return make_double2(__ldg(from+0), __ldg(from+1));
388 return make_double2(from[0], from[1]);
393 return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
397 return make_double2(from[0*stride], from[1*stride]);
400 template<>
EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(
float* to,
const float4& from,
Index stride) {
401 to[stride*0] = from.x;
402 to[stride*1] = from.y;
403 to[stride*2] = from.z;
404 to[stride*3] = from.w;
407 to[stride*0] = from.
x;
408 to[stride*1] = from.y;
419 return a.x +
a.y +
a.z +
a.w;
426 return fmaxf(fmaxf(
a.x,
a.y), fmaxf(
a.z,
a.w));
433 return fminf(fminf(
a.x,
a.y), fminf(
a.z,
a.w));
440 return a.x *
a.y *
a.z *
a.w;
447 return make_float4(fabsf(
a.x), fabsf(
a.y), fabsf(
a.z), fabsf(
a.w));
454 return make_float4(floorf(
a.x), floorf(
a.y), floorf(
a.z), floorf(
a.w));
462 float tmp = kernel.packet[0].y;
463 kernel.packet[0].y = kernel.packet[1].x;
464 kernel.packet[1].x = tmp;
466 tmp = kernel.packet[0].z;
467 kernel.packet[0].z = kernel.packet[2].x;
468 kernel.packet[2].x = tmp;
470 tmp = kernel.packet[0].w;
471 kernel.packet[0].w = kernel.packet[3].x;
472 kernel.packet[3].x = tmp;
474 tmp = kernel.packet[1].z;
475 kernel.packet[1].z = kernel.packet[2].y;
476 kernel.packet[2].y = tmp;
478 tmp = kernel.packet[1].w;
479 kernel.packet[1].w = kernel.packet[3].y;
480 kernel.packet[3].y = tmp;
482 tmp = kernel.packet[2].w;
483 kernel.packet[2].w = kernel.packet[3].z;
484 kernel.packet[3].z = tmp;
489 double tmp = kernel.packet[0].y;
490 kernel.packet[0].y = kernel.packet[1].x;
491 kernel.packet[1].x = tmp;
494 #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
498 #if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
500 typedef ulonglong2 Packet4h2;
502 template<>
struct is_arithmetic<Packet4h2> {
enum {
value =
true }; };
505 template<>
struct is_arithmetic<half2> {
enum {
value =
true }; };
507 template<>
struct packet_traits<
Eigen::half> : default_packet_traits
509 typedef Packet4h2
type;
510 typedef Packet4h2
half;
532 #if defined(EIGEN_GPU_COMPILE_PHASE)
533 return __halves2half2(
a,
b);
536 return __floats2half2_rn(__half2float(
a), __half2float(
b));
541 #if defined(EIGEN_GPU_COMPILE_PHASE)
542 return __low2half(
a);
544 return __float2half(__low2float(
a));
549 #if defined(EIGEN_GPU_COMPILE_PHASE)
550 return __high2half(
a);
552 return __float2half(__high2float(
a));
559 #if defined(EIGEN_GPU_COMPILE_PHASE)
560 return __half2half2(from);
562 const float f = __half2float(from);
563 return __floats2half2_rn(
f,
f);
571 half2* p_alias =
reinterpret_cast<half2*
>(&r);
572 p_alias[0] = pset1<half2>(from);
573 p_alias[1] = pset1<half2>(from);
574 p_alias[2] = pset1<half2>(from);
575 p_alias[3] = pset1<half2>(from);
584 return *
reinterpret_cast<const half2*
>(from);
588 return combine_half(from[0], from[1]);
592 return combine_half(from[0], from[0]);
597 *
reinterpret_cast<half2*
>(to) = from;
602 to[0] = get_half2_low(from);
603 to[1] = get_half2_high(from);
609 #if defined(EIGEN_GPU_HAS_LDG)
611 return __ldg(
reinterpret_cast<const half2*
>(from));
613 return combine_half(*(from+0), *(from+1));
619 #if defined(EIGEN_GPU_HAS_LDG)
620 return __halves2half2(__ldg(from+0), __ldg(from+1));
622 return combine_half(*(from+0), *(from+1));
628 return combine_half(from[0*stride], from[1*stride]);
633 to[stride*0] = get_half2_low(from);
634 to[stride*1] = get_half2_high(from);
638 return get_half2_low(
a);
642 half
a1 = get_half2_low(
a);
643 half
a2 = get_half2_high(
a);
646 return combine_half(result1, result2);
651 return pset1<half2>(true_half);
656 return pset1<half2>(false_half);
661 __half
a1 = get_half2_low(kernel.packet[0]);
662 __half
a2 = get_half2_high(kernel.packet[0]);
663 __half
b1 = get_half2_low(kernel.packet[1]);
664 __half
b2 = get_half2_high(kernel.packet[1]);
665 kernel.packet[0] = combine_half(
a1,
b1);
666 kernel.packet[1] = combine_half(
a2,
b2);
670 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
671 return __halves2half2(
a, __hadd(
a, __float2half(1.0
f)));
673 float f = __half2float(
a) + 1.0f;
674 return combine_half(
a, __float2half(
f));
681 half mask_low = get_half2_low(mask);
682 half mask_high = get_half2_high(mask);
683 half result_low = mask_low == half(0) ? get_half2_low(
b) : get_half2_low(
a);
684 half result_high = mask_high == half(0) ? get_half2_high(
b) : get_half2_high(
a);
685 return combine_half(result_low, result_high);
692 half
a1 = get_half2_low(
a);
693 half
a2 = get_half2_high(
a);
694 half
b1 = get_half2_low(
b);
695 half
b2 = get_half2_high(
b);
696 half eq1 = __half2float(
a1) == __half2float(
b1) ? true_half : false_half;
697 half eq2 = __half2float(
a2) == __half2float(
b2) ? true_half : false_half;
698 return combine_half(eq1, eq2);
705 half
a1 = get_half2_low(
a);
706 half
a2 = get_half2_high(
a);
707 half
b1 = get_half2_low(
b);
708 half
b2 = get_half2_high(
b);
709 half eq1 = __half2float(
a1) < __half2float(
b1) ? true_half : false_half;
710 half eq2 = __half2float(
a2) < __half2float(
b2) ? true_half : false_half;
711 return combine_half(eq1, eq2);
716 half
a1 = get_half2_low(
a);
717 half
a2 = get_half2_high(
a);
718 half
b1 = get_half2_low(
b);
719 half
b2 = get_half2_high(
b);
722 return combine_half(result1, result2);
727 half
a1 = get_half2_low(
a);
728 half
a2 = get_half2_high(
a);
729 half
b1 = get_half2_low(
b);
730 half
b2 = get_half2_high(
b);
733 return combine_half(result1, result2);
738 half
a1 = get_half2_low(
a);
739 half
a2 = get_half2_high(
a);
740 half
b1 = get_half2_low(
b);
741 half
b2 = get_half2_high(
b);
744 return combine_half(result1, result2);
749 half
a1 = get_half2_low(
a);
750 half
a2 = get_half2_high(
a);
751 half
b1 = get_half2_low(
b);
752 half
b2 = get_half2_high(
b);
755 return combine_half(result1, result2);
760 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
761 return __hadd2(
a,
b);
763 float a1 = __low2float(
a);
764 float a2 = __high2float(
a);
765 float b1 = __low2float(
b);
766 float b2 = __high2float(
b);
769 return __floats2half2_rn(
r1,
r2);
775 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
776 return __hsub2(
a,
b);
778 float a1 = __low2float(
a);
779 float a2 = __high2float(
a);
780 float b1 = __low2float(
b);
781 float b2 = __high2float(
b);
784 return __floats2half2_rn(
r1,
r2);
789 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
792 float a1 = __low2float(
a);
793 float a2 = __high2float(
a);
794 return __floats2half2_rn(-
a1, -
a2);
802 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
803 return __hmul2(
a,
b);
805 float a1 = __low2float(
a);
806 float a2 = __high2float(
a);
807 float b1 = __low2float(
b);
808 float b2 = __high2float(
b);
811 return __floats2half2_rn(
r1,
r2);
818 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
819 return __hfma2(
a,
b,
c);
821 float a1 = __low2float(
a);
822 float a2 = __high2float(
a);
823 float b1 = __low2float(
b);
824 float b2 = __high2float(
b);
825 float c1 = __low2float(
c);
826 float c2 = __high2float(
c);
829 return __floats2half2_rn(
r1,
r2);
835 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
836 return __h2div(
a,
b);
838 float a1 = __low2float(
a);
839 float a2 = __high2float(
a);
840 float b1 = __low2float(
b);
841 float b2 = __high2float(
b);
844 return __floats2half2_rn(
r1,
r2);
850 float a1 = __low2float(
a);
851 float a2 = __high2float(
a);
852 float b1 = __low2float(
b);
853 float b2 = __high2float(
b);
854 __half
r1 =
a1 <
b1 ? get_half2_low(
a) : get_half2_low(
b);
855 __half
r2 =
a2 <
b2 ? get_half2_high(
a) : get_half2_high(
b);
856 return combine_half(
r1,
r2);
861 float a1 = __low2float(
a);
862 float a2 = __high2float(
a);
863 float b1 = __low2float(
b);
864 float b2 = __high2float(
b);
865 __half
r1 =
a1 >
b1 ? get_half2_low(
a) : get_half2_low(
b);
866 __half
r2 =
a2 >
b2 ? get_half2_high(
a) : get_half2_high(
b);
867 return combine_half(
r1,
r2);
871 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
872 return __hadd(__low2half(
a), __high2half(
a));
874 float a1 = __low2float(
a);
875 float a2 = __high2float(
a);
881 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
882 __half
first = __low2half(
a);
883 __half
second = __high2half(
a);
886 float a1 = __low2float(
a);
887 float a2 = __high2float(
a);
888 return a1 >
a2 ? get_half2_low(
a) : get_half2_high(
a);
893 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
894 __half
first = __low2half(
a);
895 __half
second = __high2half(
a);
898 float a1 = __low2float(
a);
899 float a2 = __high2float(
a);
900 return a1 <
a2 ? get_half2_low(
a) : get_half2_high(
a);
905 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
906 return __hmul(__low2half(
a), __high2half(
a));
908 float a1 = __low2float(
a);
909 float a2 = __high2float(
a);
915 float a1 = __low2float(
a);
916 float a2 = __high2float(
a);
917 float r1 = log1pf(
a1);
918 float r2 = log1pf(
a2);
919 return __floats2half2_rn(
r1,
r2);
923 float a1 = __low2float(
a);
924 float a2 = __high2float(
a);
925 float r1 = expm1f(
a1);
926 float r2 = expm1f(
a2);
927 return __floats2half2_rn(
r1,
r2);
930 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
931 defined(EIGEN_HIP_DEVICE_COMPILE)
934 half2
plog(
const half2&
a) {
939 half2
pexp(
const half2&
a) {
944 half2
psqrt(
const half2&
a) {
956 float a1 = __low2float(
a);
957 float a2 = __high2float(
a);
960 return __floats2half2_rn(
r1,
r2);
964 float a1 = __low2float(
a);
965 float a2 = __high2float(
a);
968 return __floats2half2_rn(
r1,
r2);
972 float a1 = __low2float(
a);
973 float a2 = __high2float(
a);
974 float r1 = sqrtf(
a1);
975 float r2 = sqrtf(
a2);
976 return __floats2half2_rn(
r1,
r2);
980 float a1 = __low2float(
a);
981 float a2 = __high2float(
a);
982 float r1 = rsqrtf(
a1);
983 float r2 = rsqrtf(
a2);
984 return __floats2half2_rn(
r1,
r2);
992 return *
reinterpret_cast<const Packet4h2*
>(from);
1000 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1001 p_alias[0] =
ploadu(from + 0);
1002 p_alias[1] =
ploadu(from + 2);
1003 p_alias[2] =
ploadu(from + 4);
1004 p_alias[3] =
ploadu(from + 6);
1012 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1023 *
reinterpret_cast<Packet4h2*
>(to) = from;
1029 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1030 pstoreu(to + 0,from_alias[0]);
1031 pstoreu(to + 2,from_alias[1]);
1032 pstoreu(to + 4,from_alias[2]);
1033 pstoreu(to + 6,from_alias[3]);
1038 ploadt_ro<Packet4h2, Aligned>(
const Eigen::half* from) {
1039 #if defined(EIGEN_GPU_HAS_LDG)
1041 r = __ldg(
reinterpret_cast<const Packet4h2*
>(from));
1045 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1046 r_alias[0] = ploadt_ro_aligned(from + 0);
1047 r_alias[1] = ploadt_ro_aligned(from + 2);
1048 r_alias[2] = ploadt_ro_aligned(from + 4);
1049 r_alias[3] = ploadt_ro_aligned(from + 6);
1056 ploadt_ro<Packet4h2, Unaligned>(
const Eigen::half* from) {
1058 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1059 r_alias[0] = ploadt_ro_unaligned(from + 0);
1060 r_alias[1] = ploadt_ro_unaligned(from + 2);
1061 r_alias[2] = ploadt_ro_unaligned(from + 4);
1062 r_alias[3] = ploadt_ro_unaligned(from + 6);
1070 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1071 p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
1072 p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
1073 p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
1074 p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
1081 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1082 pscatter(to + stride * 0, from_alias[0], stride);
1083 pscatter(to + stride * 2, from_alias[1], stride);
1084 pscatter(to + stride * 4, from_alias[2], stride);
1085 pscatter(to + stride * 6, from_alias[3], stride);
1090 const Packet4h2&
a) {
1091 return pfirst(*(
reinterpret_cast<const half2*
>(&
a)));
1096 const Packet4h2&
a) {
1098 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1099 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1100 p_alias[0] =
pabs(a_alias[0]);
1101 p_alias[1] =
pabs(a_alias[1]);
1102 p_alias[2] =
pabs(a_alias[2]);
1103 p_alias[3] =
pabs(a_alias[3]);
1109 const Packet4h2& ) {
1111 return pset1<Packet4h2>(true_half);
1117 return pset1<Packet4h2>(false_half);
1121 double* d_row0,
double* d_row1,
double* d_row2,
double* d_row3,
1122 double* d_row4,
double* d_row5,
double* d_row6,
double* d_row7) {
1125 d_row0[1] = d_row4[0];
1129 d_row1[1] = d_row5[0];
1133 d_row2[1] = d_row6[0];
1137 d_row3[1] = d_row7[0];
1142 half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
1145 f_row0[1] = f_row2[0];
1149 f_row1[1] = f_row3[0];
1154 ptranspose_half(half2&
f0, half2&
f1) {
1155 __half
a1 = get_half2_low(
f0);
1156 __half
a2 = get_half2_high(
f0);
1157 __half
b1 = get_half2_low(
f1);
1158 __half
b2 = get_half2_high(
f1);
1159 f0 = combine_half(
a1,
b1);
1160 f1 = combine_half(
a2,
b2);
1164 ptranspose(PacketBlock<Packet4h2,8>& kernel) {
1165 double* d_row0 =
reinterpret_cast<double*
>(&kernel.packet[0]);
1166 double* d_row1 =
reinterpret_cast<double*
>(&kernel.packet[1]);
1167 double* d_row2 =
reinterpret_cast<double*
>(&kernel.packet[2]);
1168 double* d_row3 =
reinterpret_cast<double*
>(&kernel.packet[3]);
1169 double* d_row4 =
reinterpret_cast<double*
>(&kernel.packet[4]);
1170 double* d_row5 =
reinterpret_cast<double*
>(&kernel.packet[5]);
1171 double* d_row6 =
reinterpret_cast<double*
>(&kernel.packet[6]);
1172 double* d_row7 =
reinterpret_cast<double*
>(&kernel.packet[7]);
1173 ptranspose_double(d_row0, d_row1, d_row2, d_row3,
1174 d_row4, d_row5, d_row6, d_row7);
1177 half2* f_row0 =
reinterpret_cast<half2*
>(d_row0);
1178 half2* f_row1 =
reinterpret_cast<half2*
>(d_row1);
1179 half2* f_row2 =
reinterpret_cast<half2*
>(d_row2);
1180 half2* f_row3 =
reinterpret_cast<half2*
>(d_row3);
1181 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1182 ptranspose_half(f_row0[0], f_row1[0]);
1183 ptranspose_half(f_row0[1], f_row1[1]);
1184 ptranspose_half(f_row2[0], f_row3[0]);
1185 ptranspose_half(f_row2[1], f_row3[1]);
1187 f_row0 =
reinterpret_cast<half2*
>(d_row0 + 1);
1188 f_row1 =
reinterpret_cast<half2*
>(d_row1 + 1);
1189 f_row2 =
reinterpret_cast<half2*
>(d_row2 + 1);
1190 f_row3 =
reinterpret_cast<half2*
>(d_row3 + 1);
1191 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1192 ptranspose_half(f_row0[0], f_row1[0]);
1193 ptranspose_half(f_row0[1], f_row1[1]);
1194 ptranspose_half(f_row2[0], f_row3[0]);
1195 ptranspose_half(f_row2[1], f_row3[1]);
1197 f_row0 =
reinterpret_cast<half2*
>(d_row4);
1198 f_row1 =
reinterpret_cast<half2*
>(d_row5);
1199 f_row2 =
reinterpret_cast<half2*
>(d_row6);
1200 f_row3 =
reinterpret_cast<half2*
>(d_row7);
1201 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1202 ptranspose_half(f_row0[0], f_row1[0]);
1203 ptranspose_half(f_row0[1], f_row1[1]);
1204 ptranspose_half(f_row2[0], f_row3[0]);
1205 ptranspose_half(f_row2[1], f_row3[1]);
1207 f_row0 =
reinterpret_cast<half2*
>(d_row4 + 1);
1208 f_row1 =
reinterpret_cast<half2*
>(d_row5 + 1);
1209 f_row2 =
reinterpret_cast<half2*
>(d_row6 + 1);
1210 f_row3 =
reinterpret_cast<half2*
>(d_row7 + 1);
1211 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1212 ptranspose_half(f_row0[0], f_row1[0]);
1213 ptranspose_half(f_row0[1], f_row1[1]);
1214 ptranspose_half(f_row2[0], f_row3[0]);
1215 ptranspose_half(f_row2[1], f_row3[1]);
1222 #if defined(EIGEN_HIP_DEVICE_COMPILE)
1225 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1226 p_alias[0] = __halves2half2(
a, __hadd(
a, __float2half(1.0
f)));
1227 p_alias[1] = __halves2half2(__hadd(
a, __float2half(2.0
f)),
1228 __hadd(
a, __float2half(3.0
f)));
1229 p_alias[2] = __halves2half2(__hadd(
a, __float2half(4.0
f)),
1230 __hadd(
a, __float2half(5.0
f)));
1231 p_alias[3] = __halves2half2(__hadd(
a, __float2half(6.0
f)),
1232 __hadd(
a, __float2half(7.0
f)));
1234 #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1236 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1238 half2
b = pset1<half2>(
a);
1240 half2 half_offset0 = __halves2half2(__float2half(0.0
f),__float2half(2.0
f));
1241 half2 half_offset1 = __halves2half2(__float2half(4.0
f),__float2half(6.0
f));
1243 c = __hadd2(
b, half_offset0);
1244 r_alias[0] =
plset(__low2half(
c));
1245 r_alias[1] =
plset(__high2half(
c));
1247 c = __hadd2(
b, half_offset1);
1248 r_alias[2] =
plset(__low2half(
c));
1249 r_alias[3] =
plset(__high2half(
c));
1254 float f = __half2float(
a);
1256 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1257 p_alias[0] = combine_half(
a, __float2half(
f + 1.0
f));
1258 p_alias[1] = combine_half(__float2half(
f + 2.0
f), __float2half(
f + 3.0
f));
1259 p_alias[2] = combine_half(__float2half(
f + 4.0
f), __float2half(
f + 5.0
f));
1260 p_alias[3] = combine_half(__float2half(
f + 6.0
f), __float2half(
f + 7.0
f));
1267 pselect<Packet4h2>(
const Packet4h2& mask,
const Packet4h2&
a,
1268 const Packet4h2&
b) {
1270 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1271 const half2* mask_alias =
reinterpret_cast<const half2*
>(&mask);
1272 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1273 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1274 r_alias[0] =
pselect(mask_alias[0], a_alias[0], b_alias[0]);
1275 r_alias[1] =
pselect(mask_alias[1], a_alias[1], b_alias[1]);
1276 r_alias[2] =
pselect(mask_alias[2], a_alias[2], b_alias[2]);
1277 r_alias[3] =
pselect(mask_alias[3], a_alias[3], b_alias[3]);
1283 pcmp_eq<Packet4h2>(
const Packet4h2&
a,
const Packet4h2&
b) {
1285 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1286 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1287 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1288 r_alias[0] =
pcmp_eq(a_alias[0], b_alias[0]);
1289 r_alias[1] =
pcmp_eq(a_alias[1], b_alias[1]);
1290 r_alias[2] =
pcmp_eq(a_alias[2], b_alias[2]);
1291 r_alias[3] =
pcmp_eq(a_alias[3], b_alias[3]);
1297 const Packet4h2&
a,
const Packet4h2&
b) {
1299 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1300 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1301 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1302 r_alias[0] =
pand(a_alias[0], b_alias[0]);
1303 r_alias[1] =
pand(a_alias[1], b_alias[1]);
1304 r_alias[2] =
pand(a_alias[2], b_alias[2]);
1305 r_alias[3] =
pand(a_alias[3], b_alias[3]);
1311 const Packet4h2&
a,
const Packet4h2&
b) {
1313 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1314 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1315 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1316 r_alias[0] =
por(a_alias[0], b_alias[0]);
1317 r_alias[1] =
por(a_alias[1], b_alias[1]);
1318 r_alias[2] =
por(a_alias[2], b_alias[2]);
1319 r_alias[3] =
por(a_alias[3], b_alias[3]);
1325 const Packet4h2&
a,
const Packet4h2&
b) {
1327 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1328 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1329 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1330 r_alias[0] =
pxor(a_alias[0], b_alias[0]);
1331 r_alias[1] =
pxor(a_alias[1], b_alias[1]);
1332 r_alias[2] =
pxor(a_alias[2], b_alias[2]);
1333 r_alias[3] =
pxor(a_alias[3], b_alias[3]);
1339 pandnot<Packet4h2>(
const Packet4h2&
a,
const Packet4h2&
b) {
1341 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1342 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1343 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1344 r_alias[0] =
pandnot(a_alias[0], b_alias[0]);
1345 r_alias[1] =
pandnot(a_alias[1], b_alias[1]);
1346 r_alias[2] =
pandnot(a_alias[2], b_alias[2]);
1347 r_alias[3] =
pandnot(a_alias[3], b_alias[3]);
1353 const Packet4h2&
a,
const Packet4h2&
b) {
1355 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1356 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1357 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1358 r_alias[0] =
padd(a_alias[0], b_alias[0]);
1359 r_alias[1] =
padd(a_alias[1], b_alias[1]);
1360 r_alias[2] =
padd(a_alias[2], b_alias[2]);
1361 r_alias[3] =
padd(a_alias[3], b_alias[3]);
1367 const Packet4h2&
a,
const Packet4h2&
b) {
1369 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1370 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1371 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1372 r_alias[0] =
psub(a_alias[0], b_alias[0]);
1373 r_alias[1] =
psub(a_alias[1], b_alias[1]);
1374 r_alias[2] =
psub(a_alias[2], b_alias[2]);
1375 r_alias[3] =
psub(a_alias[3], b_alias[3]);
1382 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1383 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1384 r_alias[0] =
pnegate(a_alias[0]);
1385 r_alias[1] =
pnegate(a_alias[1]);
1386 r_alias[2] =
pnegate(a_alias[2]);
1387 r_alias[3] =
pnegate(a_alias[3]);
1398 const Packet4h2&
a,
const Packet4h2&
b) {
1400 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1401 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1402 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1403 r_alias[0] =
pmul(a_alias[0], b_alias[0]);
1404 r_alias[1] =
pmul(a_alias[1], b_alias[1]);
1405 r_alias[2] =
pmul(a_alias[2], b_alias[2]);
1406 r_alias[3] =
pmul(a_alias[3], b_alias[3]);
1412 const Packet4h2&
a,
const Packet4h2&
b,
const Packet4h2&
c) {
1414 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1415 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1416 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1417 const half2* c_alias =
reinterpret_cast<const half2*
>(&
c);
1418 r_alias[0] =
pmadd(a_alias[0], b_alias[0], c_alias[0]);
1419 r_alias[1] =
pmadd(a_alias[1], b_alias[1], c_alias[1]);
1420 r_alias[2] =
pmadd(a_alias[2], b_alias[2], c_alias[2]);
1421 r_alias[3] =
pmadd(a_alias[3], b_alias[3], c_alias[3]);
1427 const Packet4h2&
a,
const Packet4h2&
b) {
1429 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1430 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1431 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1432 r_alias[0] =
pdiv(a_alias[0], b_alias[0]);
1433 r_alias[1] =
pdiv(a_alias[1], b_alias[1]);
1434 r_alias[2] =
pdiv(a_alias[2], b_alias[2]);
1435 r_alias[3] =
pdiv(a_alias[3], b_alias[3]);
1441 const Packet4h2&
a,
const Packet4h2&
b) {
1443 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1444 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1445 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1446 r_alias[0] =
pmin(a_alias[0], b_alias[0]);
1447 r_alias[1] =
pmin(a_alias[1], b_alias[1]);
1448 r_alias[2] =
pmin(a_alias[2], b_alias[2]);
1449 r_alias[3] =
pmin(a_alias[3], b_alias[3]);
1455 const Packet4h2&
a,
const Packet4h2&
b) {
1457 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1458 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1459 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1460 r_alias[0] =
pmax(a_alias[0], b_alias[0]);
1461 r_alias[1] =
pmax(a_alias[1], b_alias[1]);
1462 r_alias[2] =
pmax(a_alias[2], b_alias[2]);
1463 r_alias[3] =
pmax(a_alias[3], b_alias[3]);
1469 const Packet4h2&
a) {
1470 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1478 const Packet4h2&
a) {
1479 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1486 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1487 return (__hgt(
first, second) ?
first : second);
1489 float ffirst = __half2float(
first);
1490 float fsecond = __half2float(second);
1497 const Packet4h2&
a) {
1498 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1505 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1506 return (__hlt(
first, second) ?
first : second);
1508 float ffirst = __half2float(
first);
1509 float fsecond = __half2float(second);
1517 const Packet4h2&
a) {
1518 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1520 pmul(a_alias[2], a_alias[3])));
1525 plog1p<Packet4h2>(
const Packet4h2&
a) {
1527 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1528 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1529 r_alias[0] =
plog1p(a_alias[0]);
1530 r_alias[1] =
plog1p(a_alias[1]);
1531 r_alias[2] =
plog1p(a_alias[2]);
1532 r_alias[3] =
plog1p(a_alias[3]);
1538 pexpm1<Packet4h2>(
const Packet4h2&
a) {
1540 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1541 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1542 r_alias[0] =
pexpm1(a_alias[0]);
1543 r_alias[1] =
pexpm1(a_alias[1]);
1544 r_alias[2] =
pexpm1(a_alias[2]);
1545 r_alias[3] =
pexpm1(a_alias[3]);
1552 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1553 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1554 r_alias[0] =
plog(a_alias[0]);
1555 r_alias[1] =
plog(a_alias[1]);
1556 r_alias[2] =
plog(a_alias[2]);
1557 r_alias[3] =
plog(a_alias[3]);
1564 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1565 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1566 r_alias[0] =
pexp(a_alias[0]);
1567 r_alias[1] =
pexp(a_alias[1]);
1568 r_alias[2] =
pexp(a_alias[2]);
1569 r_alias[3] =
pexp(a_alias[3]);
1576 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1577 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1578 r_alias[0] =
psqrt(a_alias[0]);
1579 r_alias[1] =
psqrt(a_alias[1]);
1580 r_alias[2] =
psqrt(a_alias[2]);
1581 r_alias[3] =
psqrt(a_alias[3]);
1587 prsqrt<Packet4h2>(
const Packet4h2&
a) {
1589 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1590 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1591 r_alias[0] =
prsqrt(a_alias[0]);
1592 r_alias[1] =
prsqrt(a_alias[1]);
1593 r_alias[2] =
prsqrt(a_alias[2]);
1594 r_alias[3] =
prsqrt(a_alias[3]);
1603 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1604 return __hadd2(
a,
b);
1606 float a1 = __low2float(
a);
1607 float a2 = __high2float(
a);
1608 float b1 = __low2float(
b);
1609 float b2 = __high2float(
b);
1612 return __floats2half2_rn(
r1,
r2);
1619 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1620 return __hmul2(
a,
b);
1622 float a1 = __low2float(
a);
1623 float a2 = __high2float(
a);
1624 float b1 = __low2float(
b);
1625 float b2 = __high2float(
b);
1628 return __floats2half2_rn(
r1,
r2);
1635 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1636 return __h2div(
a,
b);
1638 float a1 = __low2float(
a);
1639 float a2 = __high2float(
a);
1640 float b1 = __low2float(
b);
1641 float b2 = __high2float(
b);
1644 return __floats2half2_rn(
r1,
r2);
1651 float a1 = __low2float(
a);
1652 float a2 = __high2float(
a);
1653 float b1 = __low2float(
b);
1654 float b2 = __high2float(
b);
1655 __half
r1 =
a1 <
b1 ? get_half2_low(
a) : get_half2_low(
b);
1656 __half
r2 =
a2 <
b2 ? get_half2_high(
a) : get_half2_high(
b);
1657 return combine_half(
r1,
r2);
1663 float a1 = __low2float(
a);
1664 float a2 = __high2float(
a);
1665 float b1 = __low2float(
b);
1666 float b2 = __high2float(
b);
1667 __half
r1 =
a1 >
b1 ? get_half2_low(
a) : get_half2_low(
b);
1668 __half
r2 =
a2 >
b2 ? get_half2_high(
a) : get_half2_high(
b);
1669 return combine_half(
r1,
r2);
1674 #endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
1676 #undef EIGEN_GPU_HAS_LDG
1677 #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
1678 #undef EIGEN_GPU_HAS_FP16_ARITHMETIC
1685 #endif // EIGEN_PACKET_MATH_GPU_H