10 #ifndef EIGEN_PACKET_MATH_GPU_H 11 #define EIGEN_PACKET_MATH_GPU_H 18 #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) 19 #define EIGEN_GPU_HAS_LDG 1 23 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) 24 #define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1 27 #if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 28 #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1 34 #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) 36 template<>
struct is_arithmetic<float4> {
enum {
value =
true }; };
37 template<>
struct is_arithmetic<double2> {
enum {
value =
true }; };
39 template<>
struct packet_traits<
float> : default_packet_traits
66 HasGammaSampleDerAlpha = 1,
75 template<>
struct packet_traits<double> : default_packet_traits
100 HasGammaSampleDerAlpha = 1,
110 template<>
struct unpacket_traits<float4> {
typedef float type;
enum {
size=4, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef float4 half; };
111 template<>
struct unpacket_traits<double2> {
typedef double type;
enum {
size=2, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef double2 half; };
114 return make_float4(from, from, from, from);
117 return make_double2(from, from);
123 #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) 128 return __int_as_float(__float_as_int(a) & __float_as_int(b));
132 return __longlong_as_double(__double_as_longlong(a) &
133 __double_as_longlong(b));
138 return __int_as_float(__float_as_int(a) | __float_as_int(b));
142 return __longlong_as_double(__double_as_longlong(a) |
143 __double_as_longlong(b));
148 return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
152 return __longlong_as_double(__double_as_longlong(a) ^
153 __double_as_longlong(b));
158 return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
162 return __longlong_as_double(__double_as_longlong(a) &
163 ~__double_as_longlong(b));
167 return __int_as_float(a == b ? 0xffffffffu : 0u);
171 return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
176 return __int_as_float(a < b ? 0xffffffffu : 0u);
180 return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
188 return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
189 bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
194 return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
200 return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
201 bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
206 return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
212 return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
213 bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
218 return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
224 return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
225 bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
229 pandnot<double2>(
const double2&
a,
const double2&
b) {
230 return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
236 return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
242 return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
247 pcmp_eq<double2>(
const double2&
a,
const double2&
b) {
248 return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
252 pcmp_lt<double2>(
const double2&
a,
const double2&
b) {
253 return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
255 #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) 258 return make_float4(a, a+1, a+2, a+3);
261 return make_double2(a, a+1);
265 return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
268 return make_double2(a.x+b.x, a.y+b.y);
272 return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
275 return make_double2(a.x-b.x, a.y-b.y);
279 return make_float4(-a.x, -a.y, -a.z, -a.w);
282 return make_double2(-a.x, -a.y);
289 return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
292 return make_double2(a.x*b.x, a.y*b.y);
296 return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
299 return make_double2(a.x/b.x, a.y/b.y);
303 return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
306 return make_double2(
fmin(a.x, b.x),
fmin(a.y, b.y));
310 return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
313 return make_double2(
fmax(a.x, b.x),
fmax(a.y, b.y));
317 return *
reinterpret_cast<const float4*
>(from);
321 return *
reinterpret_cast<const double2*
>(from);
325 return make_float4(from[0], from[1], from[2], from[3]);
328 return make_double2(from[0], from[1]);
332 return make_float4(from[0], from[0], from[1], from[1]);
335 return make_double2(from[0], from[0]);
339 *
reinterpret_cast<float4*
>(to) = from;
343 *
reinterpret_cast<double2*
>(to) = from;
360 #if defined(EIGEN_GPU_HAS_LDG) 361 return __ldg((
const float4*)from);
363 return make_float4(from[0], from[1], from[2], from[3]);
368 #if defined(EIGEN_GPU_HAS_LDG) 369 return __ldg((
const double2*)from);
371 return make_double2(from[0], from[1]);
377 #if defined(EIGEN_GPU_HAS_LDG) 378 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
380 return make_float4(from[0], from[1], from[2], from[3]);
385 #if defined(EIGEN_GPU_HAS_LDG) 386 return make_double2(__ldg(from+0), __ldg(from+1));
388 return make_double2(from[0], from[1]);
393 return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
397 return make_double2(from[0*stride], from[1*stride]);
400 template<>
EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(
float* to,
const float4& from,
Index stride) {
401 to[stride*0] = from.x;
402 to[stride*1] = from.y;
403 to[stride*2] = from.z;
404 to[stride*3] = from.w;
406 template<>
EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(
double* to,
const double2& from,
Index stride) {
407 to[stride*0] = from.x;
408 to[stride*1] = from.y;
419 return a.x + a.y + a.z + a.w;
426 return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
429 return fmax(a.x, a.y);
433 return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
436 return fmin(a.x, a.y);
440 return a.x * a.y * a.z * a.w;
447 return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
450 return make_double2(
fabs(a.x),
fabs(a.y));
454 return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
462 float tmp = kernel.packet[0].y;
463 kernel.packet[0].y = kernel.packet[1].x;
464 kernel.packet[1].x = tmp;
466 tmp = kernel.packet[0].z;
467 kernel.packet[0].z = kernel.packet[2].x;
468 kernel.packet[2].x = tmp;
470 tmp = kernel.packet[0].w;
471 kernel.packet[0].w = kernel.packet[3].x;
472 kernel.packet[3].x = tmp;
474 tmp = kernel.packet[1].z;
475 kernel.packet[1].z = kernel.packet[2].y;
476 kernel.packet[2].y = tmp;
478 tmp = kernel.packet[1].w;
479 kernel.packet[1].w = kernel.packet[3].y;
480 kernel.packet[3].y = tmp;
482 tmp = kernel.packet[2].w;
483 kernel.packet[2].w = kernel.packet[3].z;
484 kernel.packet[3].z = tmp;
489 double tmp = kernel.packet[0].y;
490 kernel.packet[0].y = kernel.packet[1].x;
491 kernel.packet[1].x = tmp;
494 #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) 498 #if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) 500 typedef ulonglong2 Packet4h2;
501 template<>
struct unpacket_traits<Packet4h2> {
typedef Eigen::half type;
enum {
size=8, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef Packet4h2 half; };
502 template<>
struct is_arithmetic<Packet4h2> {
enum {
value =
true }; };
504 template<>
struct unpacket_traits<half2> {
typedef Eigen::half type;
enum {
size=2, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef half2 half; };
505 template<>
struct is_arithmetic<half2> {
enum {
value =
true }; };
507 template<>
struct packet_traits<
Eigen::half> : default_packet_traits
509 typedef Packet4h2
type;
510 typedef Packet4h2 half;
532 #if defined(EIGEN_GPU_COMPILE_PHASE) 533 return __halves2half2(a, b);
536 return __floats2half2_rn(__half2float(a), __half2float(b));
541 #if defined(EIGEN_GPU_COMPILE_PHASE) 542 return __low2half(a);
544 return __float2half(__low2float(a));
549 #if defined(EIGEN_GPU_COMPILE_PHASE) 550 return __high2half(a);
552 return __float2half(__high2float(a));
559 #if defined(EIGEN_GPU_COMPILE_PHASE) 560 return __half2half2(from);
562 const float f = __half2float(from);
563 return __floats2half2_rn(f, f);
571 half2* p_alias =
reinterpret_cast<half2*
>(&r);
572 p_alias[0] = pset1<half2>(from);
573 p_alias[1] = pset1<half2>(from);
574 p_alias[2] = pset1<half2>(from);
575 p_alias[3] = pset1<half2>(from);
584 return *
reinterpret_cast<const half2*
>(from);
588 return combine_half(from[0], from[1]);
592 return combine_half(from[0], from[0]);
597 *
reinterpret_cast<half2*
>(to) = from;
602 to[0] = get_half2_low(from);
603 to[1] = get_half2_high(from);
609 #if defined(EIGEN_GPU_HAS_LDG) 611 return __ldg(reinterpret_cast<const half2*>(from));
613 return combine_half(*(from+0), *(from+1));
619 #if defined(EIGEN_GPU_HAS_LDG) 620 return __halves2half2(__ldg(from+0), __ldg(from+1));
622 return combine_half(*(from+0), *(from+1));
628 return combine_half(from[0*stride], from[1*stride]);
633 to[stride*0] = get_half2_low(from);
634 to[stride*1] = get_half2_high(from);
638 return get_half2_low(a);
642 half
a1 = get_half2_low(a);
643 half
a2 = get_half2_high(a);
646 return combine_half(result1, result2);
651 return pset1<half2>(true_half);
656 return pset1<half2>(false_half);
661 __half
a1 = get_half2_low(kernel.packet[0]);
662 __half
a2 = get_half2_high(kernel.packet[0]);
663 __half
b1 = get_half2_low(kernel.packet[1]);
664 __half
b2 = get_half2_high(kernel.packet[1]);
665 kernel.packet[0] = combine_half(a1, b1);
666 kernel.packet[1] = combine_half(a2, b2);
670 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 671 return __halves2half2(a, __hadd(a, __float2half(1.0
f)));
673 float f = __half2float(a) + 1.0f;
674 return combine_half(a, __float2half(f));
681 half mask_low = get_half2_low(mask);
682 half mask_high = get_half2_high(mask);
683 half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
684 half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
685 return combine_half(result_low, result_high);
692 half
a1 = get_half2_low(a);
693 half
a2 = get_half2_high(a);
694 half
b1 = get_half2_low(b);
695 half
b2 = get_half2_high(b);
696 half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
697 half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
698 return combine_half(eq1, eq2);
705 half
a1 = get_half2_low(a);
706 half
a2 = get_half2_high(a);
707 half
b1 = get_half2_low(b);
708 half
b2 = get_half2_high(b);
709 half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
710 half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
711 return combine_half(eq1, eq2);
716 half
a1 = get_half2_low(a);
717 half
a2 = get_half2_high(a);
718 half
b1 = get_half2_low(b);
719 half
b2 = get_half2_high(b);
722 return combine_half(result1, result2);
727 half
a1 = get_half2_low(a);
728 half
a2 = get_half2_high(a);
729 half
b1 = get_half2_low(b);
730 half
b2 = get_half2_high(b);
733 return combine_half(result1, result2);
738 half
a1 = get_half2_low(a);
739 half
a2 = get_half2_high(a);
740 half
b1 = get_half2_low(b);
741 half
b2 = get_half2_high(b);
744 return combine_half(result1, result2);
749 half
a1 = get_half2_low(a);
750 half
a2 = get_half2_high(a);
751 half
b1 = get_half2_low(b);
752 half
b2 = get_half2_high(b);
755 return combine_half(result1, result2);
760 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 761 return __hadd2(a, b);
763 float a1 = __low2float(a);
764 float a2 = __high2float(a);
765 float b1 = __low2float(b);
766 float b2 = __high2float(b);
769 return __floats2half2_rn(r1, r2);
775 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 776 return __hsub2(a, b);
778 float a1 = __low2float(a);
779 float a2 = __high2float(a);
780 float b1 = __low2float(b);
781 float b2 = __high2float(b);
784 return __floats2half2_rn(r1, r2);
789 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 792 float a1 = __low2float(a);
793 float a2 = __high2float(a);
794 return __floats2half2_rn(-a1, -a2);
802 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 803 return __hmul2(a, b);
805 float a1 = __low2float(a);
806 float a2 = __high2float(a);
807 float b1 = __low2float(b);
808 float b2 = __high2float(b);
811 return __floats2half2_rn(r1, r2);
818 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 819 return __hfma2(a, b, c);
821 float a1 = __low2float(a);
822 float a2 = __high2float(a);
823 float b1 = __low2float(b);
824 float b2 = __high2float(b);
825 float c1 = __low2float(c);
826 float c2 = __high2float(c);
827 float r1 = a1 * b1 + c1;
828 float r2 = a2 * b2 + c2;
829 return __floats2half2_rn(r1, r2);
835 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 836 return __h2div(a, b);
838 float a1 = __low2float(a);
839 float a2 = __high2float(a);
840 float b1 = __low2float(b);
841 float b2 = __high2float(b);
844 return __floats2half2_rn(r1, r2);
850 float a1 = __low2float(a);
851 float a2 = __high2float(a);
852 float b1 = __low2float(b);
853 float b2 = __high2float(b);
854 __half
r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
855 __half
r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
856 return combine_half(r1, r2);
861 float a1 = __low2float(a);
862 float a2 = __high2float(a);
863 float b1 = __low2float(b);
864 float b2 = __high2float(b);
865 __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
866 __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
867 return combine_half(r1, r2);
871 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 872 return __hadd(__low2half(a), __high2half(a));
874 float a1 = __low2float(a);
875 float a2 = __high2float(a);
881 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 882 __half
first = __low2half(a);
883 __half
second = __high2half(a);
884 return __hgt(first, second) ?
first : second;
886 float a1 = __low2float(a);
887 float a2 = __high2float(a);
888 return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
893 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 894 __half first = __low2half(a);
895 __half second = __high2half(a);
896 return __hlt(first, second) ?
first : second;
898 float a1 = __low2float(a);
899 float a2 = __high2float(a);
900 return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
905 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 906 return __hmul(__low2half(a), __high2half(a));
908 float a1 = __low2float(a);
909 float a2 = __high2float(a);
915 float a1 = __low2float(a);
916 float a2 = __high2float(a);
917 float r1 = log1pf(a1);
918 float r2 = log1pf(a2);
919 return __floats2half2_rn(r1, r2);
923 float a1 = __low2float(a);
924 float a2 = __high2float(a);
925 float r1 = expm1f(a1);
926 float r2 = expm1f(a2);
927 return __floats2half2_rn(r1, r2);
930 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \ 931 defined(EIGEN_HIP_DEVICE_COMPILE) 934 half2
plog(
const half2& a) {
939 half2
pexp(
const half2& a) {
944 half2
psqrt(
const half2& a) {
949 half2
prsqrt(
const half2& a) {
956 float a1 = __low2float(a);
957 float a2 = __high2float(a);
960 return __floats2half2_rn(r1, r2);
964 float a1 = __low2float(a);
965 float a2 = __high2float(a);
968 return __floats2half2_rn(r1, r2);
972 float a1 = __low2float(a);
973 float a2 = __high2float(a);
974 float r1 = sqrtf(a1);
975 float r2 = sqrtf(a2);
976 return __floats2half2_rn(r1, r2);
980 float a1 = __low2float(a);
981 float a2 = __high2float(a);
982 float r1 = rsqrtf(a1);
983 float r2 = rsqrtf(a2);
984 return __floats2half2_rn(r1, r2);
992 return *
reinterpret_cast<const Packet4h2*
>(from);
1000 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1001 p_alias[0] =
ploadu(from + 0);
1002 p_alias[1] =
ploadu(from + 2);
1003 p_alias[2] =
ploadu(from + 4);
1004 p_alias[3] =
ploadu(from + 6);
1012 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1023 *
reinterpret_cast<Packet4h2*
>(to) = from;
1029 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1030 pstoreu(to + 0,from_alias[0]);
1031 pstoreu(to + 2,from_alias[1]);
1032 pstoreu(to + 4,from_alias[2]);
1033 pstoreu(to + 6,from_alias[3]);
1038 ploadt_ro<Packet4h2, Aligned>(
const Eigen::half* from) {
1039 #if defined(EIGEN_GPU_HAS_LDG) 1041 r = __ldg(reinterpret_cast<const Packet4h2*>(from));
1045 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1046 r_alias[0] = ploadt_ro_aligned(from + 0);
1047 r_alias[1] = ploadt_ro_aligned(from + 2);
1048 r_alias[2] = ploadt_ro_aligned(from + 4);
1049 r_alias[3] = ploadt_ro_aligned(from + 6);
1056 ploadt_ro<Packet4h2, Unaligned>(
const Eigen::half* from) {
1058 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1059 r_alias[0] = ploadt_ro_unaligned(from + 0);
1060 r_alias[1] = ploadt_ro_unaligned(from + 2);
1061 r_alias[2] = ploadt_ro_unaligned(from + 4);
1062 r_alias[3] = ploadt_ro_unaligned(from + 6);
1070 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1071 p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
1072 p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
1073 p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
1074 p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
1081 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1082 pscatter(to + stride * 0, from_alias[0], stride);
1083 pscatter(to + stride * 2, from_alias[1], stride);
1084 pscatter(to + stride * 4, from_alias[2], stride);
1085 pscatter(to + stride * 6, from_alias[3], stride);
1090 const Packet4h2&
a) {
1091 return pfirst(*(reinterpret_cast<const half2*>(&a)));
1096 const Packet4h2&
a) {
1098 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1099 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1100 p_alias[0] =
pabs(a_alias[0]);
1101 p_alias[1] =
pabs(a_alias[1]);
1102 p_alias[2] =
pabs(a_alias[2]);
1103 p_alias[3] =
pabs(a_alias[3]);
1109 const Packet4h2& ) {
1111 return pset1<Packet4h2>(true_half);
1117 return pset1<Packet4h2>(false_half);
1121 double* d_row0,
double* d_row1,
double* d_row2,
double* d_row3,
1122 double* d_row4,
double* d_row5,
double* d_row6,
double* d_row7) {
1125 d_row0[1] = d_row4[0];
1129 d_row1[1] = d_row5[0];
1133 d_row2[1] = d_row6[0];
1137 d_row3[1] = d_row7[0];
1142 half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
1145 f_row0[1] = f_row2[0];
1149 f_row1[1] = f_row3[0];
1154 ptranspose_half(half2& f0, half2&
f1) {
1155 __half a1 = get_half2_low(f0);
1156 __half a2 = get_half2_high(f0);
1157 __half b1 = get_half2_low(f1);
1158 __half b2 = get_half2_high(f1);
1159 f0 = combine_half(a1, b1);
1160 f1 = combine_half(a2, b2);
1164 ptranspose(PacketBlock<Packet4h2,8>& kernel) {
1165 double* d_row0 =
reinterpret_cast<double*
>(&kernel.packet[0]);
1166 double* d_row1 =
reinterpret_cast<double*
>(&kernel.packet[1]);
1167 double* d_row2 =
reinterpret_cast<double*
>(&kernel.packet[2]);
1168 double* d_row3 =
reinterpret_cast<double*
>(&kernel.packet[3]);
1169 double* d_row4 =
reinterpret_cast<double*
>(&kernel.packet[4]);
1170 double* d_row5 =
reinterpret_cast<double*
>(&kernel.packet[5]);
1171 double* d_row6 =
reinterpret_cast<double*
>(&kernel.packet[6]);
1172 double* d_row7 =
reinterpret_cast<double*
>(&kernel.packet[7]);
1173 ptranspose_double(d_row0, d_row1, d_row2, d_row3,
1174 d_row4, d_row5, d_row6, d_row7);
1177 half2* f_row0 =
reinterpret_cast<half2*
>(d_row0);
1178 half2* f_row1 =
reinterpret_cast<half2*
>(d_row1);
1179 half2* f_row2 =
reinterpret_cast<half2*
>(d_row2);
1180 half2* f_row3 =
reinterpret_cast<half2*
>(d_row3);
1181 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1182 ptranspose_half(f_row0[0], f_row1[0]);
1183 ptranspose_half(f_row0[1], f_row1[1]);
1184 ptranspose_half(f_row2[0], f_row3[0]);
1185 ptranspose_half(f_row2[1], f_row3[1]);
1187 f_row0 =
reinterpret_cast<half2*
>(d_row0 + 1);
1188 f_row1 =
reinterpret_cast<half2*
>(d_row1 + 1);
1189 f_row2 =
reinterpret_cast<half2*
>(d_row2 + 1);
1190 f_row3 =
reinterpret_cast<half2*
>(d_row3 + 1);
1191 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1192 ptranspose_half(f_row0[0], f_row1[0]);
1193 ptranspose_half(f_row0[1], f_row1[1]);
1194 ptranspose_half(f_row2[0], f_row3[0]);
1195 ptranspose_half(f_row2[1], f_row3[1]);
1197 f_row0 =
reinterpret_cast<half2*
>(d_row4);
1198 f_row1 =
reinterpret_cast<half2*
>(d_row5);
1199 f_row2 =
reinterpret_cast<half2*
>(d_row6);
1200 f_row3 =
reinterpret_cast<half2*
>(d_row7);
1201 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1202 ptranspose_half(f_row0[0], f_row1[0]);
1203 ptranspose_half(f_row0[1], f_row1[1]);
1204 ptranspose_half(f_row2[0], f_row3[0]);
1205 ptranspose_half(f_row2[1], f_row3[1]);
1207 f_row0 =
reinterpret_cast<half2*
>(d_row4 + 1);
1208 f_row1 =
reinterpret_cast<half2*
>(d_row5 + 1);
1209 f_row2 =
reinterpret_cast<half2*
>(d_row6 + 1);
1210 f_row3 =
reinterpret_cast<half2*
>(d_row7 + 1);
1211 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1212 ptranspose_half(f_row0[0], f_row1[0]);
1213 ptranspose_half(f_row0[1], f_row1[1]);
1214 ptranspose_half(f_row2[0], f_row3[0]);
1215 ptranspose_half(f_row2[1], f_row3[1]);
1222 #if defined(EIGEN_HIP_DEVICE_COMPILE) 1225 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1226 p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
1227 p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
1228 __hadd(a, __float2half(3.0f)));
1229 p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
1230 __hadd(a, __float2half(5.0f)));
1231 p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
1232 __hadd(a, __float2half(7.0f)));
1234 #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 1236 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1238 half2 b = pset1<half2>(
a);
1240 half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
1241 half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
1243 c = __hadd2(b, half_offset0);
1244 r_alias[0] =
plset(__low2half(c));
1245 r_alias[1] =
plset(__high2half(c));
1247 c = __hadd2(b, half_offset1);
1248 r_alias[2] =
plset(__low2half(c));
1249 r_alias[3] =
plset(__high2half(c));
1254 float f = __half2float(a);
1256 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1257 p_alias[0] = combine_half(a, __float2half(f + 1.0f));
1258 p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
1259 p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
1260 p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
1267 pselect<Packet4h2>(
const Packet4h2& mask,
const Packet4h2&
a,
1268 const Packet4h2&
b) {
1270 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1271 const half2* mask_alias =
reinterpret_cast<const half2*
>(&mask);
1272 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1273 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1274 r_alias[0] =
pselect(mask_alias[0], a_alias[0], b_alias[0]);
1275 r_alias[1] =
pselect(mask_alias[1], a_alias[1], b_alias[1]);
1276 r_alias[2] =
pselect(mask_alias[2], a_alias[2], b_alias[2]);
1277 r_alias[3] =
pselect(mask_alias[3], a_alias[3], b_alias[3]);
1283 pcmp_eq<Packet4h2>(
const Packet4h2&
a,
const Packet4h2&
b) {
1285 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1286 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1287 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1288 r_alias[0] =
pcmp_eq(a_alias[0], b_alias[0]);
1289 r_alias[1] =
pcmp_eq(a_alias[1], b_alias[1]);
1290 r_alias[2] =
pcmp_eq(a_alias[2], b_alias[2]);
1291 r_alias[3] =
pcmp_eq(a_alias[3], b_alias[3]);
1297 const Packet4h2&
a,
const Packet4h2&
b) {
1299 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1300 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1301 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1302 r_alias[0] =
pand(a_alias[0], b_alias[0]);
1303 r_alias[1] =
pand(a_alias[1], b_alias[1]);
1304 r_alias[2] =
pand(a_alias[2], b_alias[2]);
1305 r_alias[3] =
pand(a_alias[3], b_alias[3]);
1311 const Packet4h2&
a,
const Packet4h2&
b) {
1313 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1314 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1315 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1316 r_alias[0] =
por(a_alias[0], b_alias[0]);
1317 r_alias[1] =
por(a_alias[1], b_alias[1]);
1318 r_alias[2] =
por(a_alias[2], b_alias[2]);
1319 r_alias[3] =
por(a_alias[3], b_alias[3]);
1325 const Packet4h2&
a,
const Packet4h2&
b) {
1327 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1328 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1329 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1330 r_alias[0] =
pxor(a_alias[0], b_alias[0]);
1331 r_alias[1] =
pxor(a_alias[1], b_alias[1]);
1332 r_alias[2] =
pxor(a_alias[2], b_alias[2]);
1333 r_alias[3] =
pxor(a_alias[3], b_alias[3]);
1339 pandnot<Packet4h2>(
const Packet4h2&
a,
const Packet4h2&
b) {
1341 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1342 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1343 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1344 r_alias[0] =
pandnot(a_alias[0], b_alias[0]);
1345 r_alias[1] =
pandnot(a_alias[1], b_alias[1]);
1346 r_alias[2] =
pandnot(a_alias[2], b_alias[2]);
1347 r_alias[3] =
pandnot(a_alias[3], b_alias[3]);
1353 const Packet4h2&
a,
const Packet4h2&
b) {
1355 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1356 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1357 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1358 r_alias[0] =
padd(a_alias[0], b_alias[0]);
1359 r_alias[1] =
padd(a_alias[1], b_alias[1]);
1360 r_alias[2] =
padd(a_alias[2], b_alias[2]);
1361 r_alias[3] =
padd(a_alias[3], b_alias[3]);
1367 const Packet4h2&
a,
const Packet4h2&
b) {
1369 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1370 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1371 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1372 r_alias[0] =
psub(a_alias[0], b_alias[0]);
1373 r_alias[1] =
psub(a_alias[1], b_alias[1]);
1374 r_alias[2] =
psub(a_alias[2], b_alias[2]);
1375 r_alias[3] =
psub(a_alias[3], b_alias[3]);
1382 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1383 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1384 r_alias[0] =
pnegate(a_alias[0]);
1385 r_alias[1] =
pnegate(a_alias[1]);
1386 r_alias[2] =
pnegate(a_alias[2]);
1387 r_alias[3] =
pnegate(a_alias[3]);
1398 const Packet4h2&
a,
const Packet4h2&
b) {
1400 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1401 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1402 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1403 r_alias[0] =
pmul(a_alias[0], b_alias[0]);
1404 r_alias[1] =
pmul(a_alias[1], b_alias[1]);
1405 r_alias[2] =
pmul(a_alias[2], b_alias[2]);
1406 r_alias[3] =
pmul(a_alias[3], b_alias[3]);
1412 const Packet4h2&
a,
const Packet4h2&
b,
const Packet4h2&
c) {
1414 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1415 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1416 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1417 const half2* c_alias =
reinterpret_cast<const half2*
>(&
c);
1418 r_alias[0] =
pmadd(a_alias[0], b_alias[0], c_alias[0]);
1419 r_alias[1] =
pmadd(a_alias[1], b_alias[1], c_alias[1]);
1420 r_alias[2] =
pmadd(a_alias[2], b_alias[2], c_alias[2]);
1421 r_alias[3] =
pmadd(a_alias[3], b_alias[3], c_alias[3]);
1427 const Packet4h2&
a,
const Packet4h2&
b) {
1429 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1430 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1431 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1432 r_alias[0] =
pdiv(a_alias[0], b_alias[0]);
1433 r_alias[1] =
pdiv(a_alias[1], b_alias[1]);
1434 r_alias[2] =
pdiv(a_alias[2], b_alias[2]);
1435 r_alias[3] =
pdiv(a_alias[3], b_alias[3]);
1441 const Packet4h2&
a,
const Packet4h2&
b) {
1443 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1444 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1445 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1446 r_alias[0] =
pmin(a_alias[0], b_alias[0]);
1447 r_alias[1] =
pmin(a_alias[1], b_alias[1]);
1448 r_alias[2] =
pmin(a_alias[2], b_alias[2]);
1449 r_alias[3] =
pmin(a_alias[3], b_alias[3]);
1455 const Packet4h2&
a,
const Packet4h2&
b) {
1457 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1458 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1459 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1460 r_alias[0] =
pmax(a_alias[0], b_alias[0]);
1461 r_alias[1] =
pmax(a_alias[1], b_alias[1]);
1462 r_alias[2] =
pmax(a_alias[2], b_alias[2]);
1463 r_alias[3] =
pmax(a_alias[3], b_alias[3]);
1469 const Packet4h2&
a) {
1470 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1478 const Packet4h2&
a) {
1479 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1480 half2 m0 = combine_half(
predux_max(a_alias[0]),
1486 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 1487 return (__hgt(first, second) ? first : second);
1489 float ffirst = __half2float(first);
1490 float fsecond = __half2float(second);
1491 return (ffirst > fsecond)?
first: second;
1497 const Packet4h2&
a) {
1498 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1499 half2 m0 = combine_half(
predux_min(a_alias[0]),
1501 half2 m1 = combine_half(
predux_min(a_alias[2]),
1505 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 1506 return (__hlt(first, second) ? first : second);
1508 float ffirst = __half2float(first);
1509 float fsecond = __half2float(second);
1510 return (ffirst < fsecond)?
first: second;
1517 const Packet4h2&
a) {
1518 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1520 pmul(a_alias[2], a_alias[3])));
1525 plog1p<Packet4h2>(
const Packet4h2&
a) {
1527 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1528 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1529 r_alias[0] =
plog1p(a_alias[0]);
1530 r_alias[1] =
plog1p(a_alias[1]);
1531 r_alias[2] =
plog1p(a_alias[2]);
1532 r_alias[3] =
plog1p(a_alias[3]);
1538 pexpm1<Packet4h2>(
const Packet4h2&
a) {
1540 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1541 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1542 r_alias[0] =
pexpm1(a_alias[0]);
1543 r_alias[1] =
pexpm1(a_alias[1]);
1544 r_alias[2] =
pexpm1(a_alias[2]);
1545 r_alias[3] =
pexpm1(a_alias[3]);
1552 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1553 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1554 r_alias[0] =
plog(a_alias[0]);
1555 r_alias[1] =
plog(a_alias[1]);
1556 r_alias[2] =
plog(a_alias[2]);
1557 r_alias[3] =
plog(a_alias[3]);
1564 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1565 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1566 r_alias[0] =
pexp(a_alias[0]);
1567 r_alias[1] =
pexp(a_alias[1]);
1568 r_alias[2] =
pexp(a_alias[2]);
1569 r_alias[3] =
pexp(a_alias[3]);
1576 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1577 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1578 r_alias[0] =
psqrt(a_alias[0]);
1579 r_alias[1] =
psqrt(a_alias[1]);
1580 r_alias[2] =
psqrt(a_alias[2]);
1581 r_alias[3] =
psqrt(a_alias[3]);
1587 prsqrt<Packet4h2>(
const Packet4h2&
a) {
1589 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1590 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1591 r_alias[0] =
prsqrt(a_alias[0]);
1592 r_alias[1] =
prsqrt(a_alias[1]);
1593 r_alias[2] =
prsqrt(a_alias[2]);
1594 r_alias[3] =
prsqrt(a_alias[3]);
1603 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 1604 return __hadd2(a, b);
1606 float a1 = __low2float(a);
1607 float a2 = __high2float(a);
1608 float b1 = __low2float(b);
1609 float b2 = __high2float(b);
1612 return __floats2half2_rn(r1, r2);
1619 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 1620 return __hmul2(a, b);
1622 float a1 = __low2float(a);
1623 float a2 = __high2float(a);
1624 float b1 = __low2float(b);
1625 float b2 = __high2float(b);
1628 return __floats2half2_rn(r1, r2);
1635 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 1636 return __h2div(a, b);
1638 float a1 = __low2float(a);
1639 float a2 = __high2float(a);
1640 float b1 = __low2float(b);
1641 float b2 = __high2float(b);
1644 return __floats2half2_rn(r1, r2);
1651 float a1 = __low2float(a);
1652 float a2 = __high2float(a);
1653 float b1 = __low2float(b);
1654 float b2 = __high2float(b);
1655 __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
1656 __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
1657 return combine_half(r1, r2);
1663 float a1 = __low2float(a);
1664 float a2 = __high2float(a);
1665 float b1 = __low2float(b);
1666 float b2 = __high2float(b);
1667 __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
1668 __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
1669 return combine_half(r1, r2);
1674 #endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) 1676 #undef EIGEN_GPU_HAS_LDG 1677 #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC 1678 #undef EIGEN_GPU_HAS_FP16_ARITHMETIC 1685 #endif // EIGEN_PACKET_MATH_GPU_H
#define EIGEN_ALWAYS_INLINE
#define EIGEN_STRONG_INLINE
EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h &a, const Packet8h &b)
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux_min(const Packet &a)
EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f &)
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux_max(const Packet &a)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16 &a, const bfloat16 &b)
Namespace containing all symbols from the Eigen library.
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux(const Packet &a)
EIGEN_CONSTEXPR Index first(const T &x) EIGEN_NOEXCEPT
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h &a, const Packet8h &b)
EIGEN_DEVICE_FUNC void pscatter(Scalar *to, const Packet &from, Index)
EIGEN_STRONG_INLINE void pstoreu< double >(double *to, const Packet4d &from)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits< Packet >::type &a)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16 &a, const bfloat16 &b)
EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f &a, const Packet4f &b)
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
EIGEN_DEVICE_FUNC Packet pmin(const Packet &a, const Packet &b)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp(const Packet &a)
EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h &a)
EIGEN_DEVICE_FUNC const FloorReturnType floor() const
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf &a)
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog1p(const Packet &a)
EIGEN_DEVICE_FUNC void pstoreu(Scalar *to, const Packet &from)
EIGEN_DEVICE_FUNC Packet ploadu(const typename unpacket_traits< Packet >::type *from)
EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
EIGEN_STRONG_INLINE void pstore< float >(float *to, const Packet4f &from)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h &a, const Packet8h &b)
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ploaddup(const typename unpacket_traits< Packet >::type *from)
EIGEN_STRONG_INLINE void pstoreu< float >(float *to, const Packet4f &from)
EIGEN_DEVICE_FUNC void pstore(Scalar *to, const Packet &from)
EIGEN_CONSTEXPR Index size(const T &x)
#define EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf &a)
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x)
Point2 f1(const Point3 &p, OptionalJacobian< 2, 3 > H)
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux_mul(const Packet &a)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog(const Packet &a)
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
EIGEN_DEVICE_FUNC Packet psub(const Packet &a, const Packet &b)
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f &a)
EIGEN_STRONG_INLINE Packet8h por(const Packet8h &a, const Packet8h &b)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexpm1(const Packet &a)
EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f &a)
EIGEN_DEVICE_FUNC Packet pmul(const Packet &a, const Packet &b)
EIGEN_DEVICE_FUNC Packet pmax(const Packet &a, const Packet &b)
EIGEN_DEVICE_FUNC Packet pgather(const Scalar *from, Index)
EIGEN_DEVICE_FUNC Packet pload(const typename unpacket_traits< Packet >::type *from)
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f &a)