10 #ifndef EIGEN_PACKET_MATH_GPU_H 
   11 #define EIGEN_PACKET_MATH_GPU_H 
   18 #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) 
   19 #define EIGEN_GPU_HAS_LDG 1 
   23 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) 
   24 #define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1 
   27 #if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 
   28 #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1 
   34 #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) 
   36 template<> 
struct is_arithmetic<float4>  { 
enum { 
value = 
true }; };
 
   37 template<> 
struct is_arithmetic<
double2> { 
enum { 
value = 
true }; };
 
   39 template<> 
struct packet_traits<
float> : default_packet_traits
 
   75 template<> 
struct packet_traits<double> : default_packet_traits
 
  114   return make_float4(from, from, from, from);
 
  117   return make_double2(from, from);
 
  123 #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) 
  128   return __int_as_float(__float_as_int(
a) & __float_as_int(
b));
 
  132   return __longlong_as_double(__double_as_longlong(
a) &
 
  133                               __double_as_longlong(
b));
 
  138   return __int_as_float(__float_as_int(
a) | __float_as_int(
b));
 
  142   return __longlong_as_double(__double_as_longlong(
a) |
 
  143                               __double_as_longlong(
b));
 
  148   return __int_as_float(__float_as_int(
a) ^ __float_as_int(
b));
 
  152   return __longlong_as_double(__double_as_longlong(
a) ^
 
  153                               __double_as_longlong(
b));
 
  158   return __int_as_float(__float_as_int(
a) & ~__float_as_int(
b));
 
  162   return __longlong_as_double(__double_as_longlong(
a) &
 
  163                               ~__double_as_longlong(
b));
 
  167   return __int_as_float(
a == 
b ? 0xffffffffu : 0u);
 
  171   return __longlong_as_double(
a == 
b ? 0xffffffffffffffffull : 0ull);
 
  176   return __int_as_float(
a < 
b ? 0xffffffffu : 0u);
 
  180   return __longlong_as_double(
a < 
b ? 0xffffffffffffffffull : 0ull);
 
  188   return make_float4(bitwise_and(
a.x, 
b.x), bitwise_and(
a.y, 
b.y),
 
  189                      bitwise_and(
a.z, 
b.z), bitwise_and(
a.w, 
b.w));
 
  194   return make_double2(bitwise_and(
a.x, 
b.x), bitwise_and(
a.y, 
b.y));
 
  200   return make_float4(bitwise_or(
a.x, 
b.x), bitwise_or(
a.y, 
b.y),
 
  201                      bitwise_or(
a.z, 
b.z), bitwise_or(
a.w, 
b.w));
 
  206   return make_double2(bitwise_or(
a.x, 
b.x), bitwise_or(
a.y, 
b.y));
 
  212   return make_float4(bitwise_xor(
a.x, 
b.x), bitwise_xor(
a.y, 
b.y),
 
  213                      bitwise_xor(
a.z, 
b.z), bitwise_xor(
a.w, 
b.w));
 
  218   return make_double2(bitwise_xor(
a.x, 
b.x), bitwise_xor(
a.y, 
b.y));
 
  224   return make_float4(bitwise_andnot(
a.x, 
b.x), bitwise_andnot(
a.y, 
b.y),
 
  225                      bitwise_andnot(
a.z, 
b.z), bitwise_andnot(
a.w, 
b.w));
 
  230   return make_double2(bitwise_andnot(
a.x, 
b.x), bitwise_andnot(
a.y, 
b.y));
 
  236   return make_float4(eq_mask(
a.x, 
b.x), eq_mask(
a.y, 
b.y), eq_mask(
a.z, 
b.z),
 
  242   return make_float4(lt_mask(
a.x, 
b.x), lt_mask(
a.y, 
b.y), lt_mask(
a.z, 
b.z),
 
  248   return make_double2(eq_mask(
a.x, 
b.x), eq_mask(
a.y, 
b.y));
 
  253   return make_double2(lt_mask(
a.x, 
b.x), lt_mask(
a.y, 
b.y));
 
  255 #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) 
  258   return make_float4(
a, 
a+1, 
a+2, 
a+3);
 
  261   return make_double2(
a, 
a+1);
 
  265   return make_float4(
a.x+
b.x, 
a.y+
b.y, 
a.z+
b.z, 
a.w+
b.w);
 
  268   return make_double2(
a.x+
b.x, 
a.y+
b.y);
 
  272   return make_float4(
a.x-
b.x, 
a.y-
b.y, 
a.z-
b.z, 
a.w-
b.w);
 
  275   return make_double2(
a.x-
b.x, 
a.y-
b.y);
 
  279   return make_float4(-
a.x, -
a.y, -
a.z, -
a.w);
 
  282   return make_double2(-
a.x, -
a.y);
 
  289   return make_float4(
a.x*
b.x, 
a.y*
b.y, 
a.z*
b.z, 
a.w*
b.w);
 
  292   return make_double2(
a.x*
b.x, 
a.y*
b.y);
 
  296   return make_float4(
a.x/
b.x, 
a.y/
b.y, 
a.z/
b.z, 
a.w/
b.w);
 
  299   return make_double2(
a.x/
b.x, 
a.y/
b.y);
 
  303   return make_float4(fminf(
a.x, 
b.x), fminf(
a.y, 
b.y), fminf(
a.z, 
b.z), fminf(
a.w, 
b.w));
 
  310   return make_float4(fmaxf(
a.x, 
b.x), fmaxf(
a.y, 
b.y), fmaxf(
a.z, 
b.z), fmaxf(
a.w, 
b.w));
 
  317   return *
reinterpret_cast<const float4*
>(from);
 
  321   return *
reinterpret_cast<const double2*
>(from);
 
  325   return make_float4(from[0], from[1], from[2], from[3]);
 
  328   return make_double2(from[0], from[1]);
 
  332   return make_float4(from[0], from[0], from[1], from[1]);
 
  335   return make_double2(from[0], from[0]);
 
  339   *
reinterpret_cast<float4*
>(to) = from;
 
  343   *
reinterpret_cast<double2*
>(to) = from;
 
  360 #if defined(EIGEN_GPU_HAS_LDG) 
  361   return __ldg((
const float4*)from);
 
  363   return make_float4(from[0], from[1], from[2], from[3]);
 
  368 #if defined(EIGEN_GPU_HAS_LDG) 
  369   return __ldg((
const double2*)from);
 
  371   return make_double2(from[0], from[1]);
 
  377 #if defined(EIGEN_GPU_HAS_LDG) 
  378   return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
 
  380   return make_float4(from[0], from[1], from[2], from[3]);
 
  385 #if defined(EIGEN_GPU_HAS_LDG) 
  386   return make_double2(__ldg(from+0), __ldg(from+1));
 
  388   return make_double2(from[0], from[1]);
 
  393   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
 
  397   return make_double2(from[0*stride], from[1*stride]);
 
  400 template<> 
EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(
float* to, 
const float4& from, 
Index stride) {
 
  401   to[stride*0] = from.x;
 
  402   to[stride*1] = from.y;
 
  403   to[stride*2] = from.z;
 
  404   to[stride*3] = from.w;
 
  407   to[stride*0] = from.
x;
 
  408   to[stride*1] = from.y;
 
  419   return a.x + 
a.y + 
a.z + 
a.w;
 
  426   return fmaxf(fmaxf(
a.x, 
a.y), fmaxf(
a.z, 
a.w));
 
  433   return fminf(fminf(
a.x, 
a.y), fminf(
a.z, 
a.w));
 
  440   return a.x * 
a.y * 
a.z * 
a.w;
 
  447   return make_float4(fabsf(
a.x), fabsf(
a.y), fabsf(
a.z), fabsf(
a.w));
 
  454   return make_float4(floorf(
a.x), floorf(
a.y), floorf(
a.z), floorf(
a.w));
 
  462   float tmp = kernel.packet[0].y;
 
  463   kernel.packet[0].y = kernel.packet[1].x;
 
  464   kernel.packet[1].x = tmp;
 
  466   tmp = kernel.packet[0].z;
 
  467   kernel.packet[0].z = kernel.packet[2].x;
 
  468   kernel.packet[2].x = tmp;
 
  470   tmp = kernel.packet[0].w;
 
  471   kernel.packet[0].w = kernel.packet[3].x;
 
  472   kernel.packet[3].x = tmp;
 
  474   tmp = kernel.packet[1].z;
 
  475   kernel.packet[1].z = kernel.packet[2].y;
 
  476   kernel.packet[2].y = tmp;
 
  478   tmp = kernel.packet[1].w;
 
  479   kernel.packet[1].w = kernel.packet[3].y;
 
  480   kernel.packet[3].y = tmp;
 
  482   tmp = kernel.packet[2].w;
 
  483   kernel.packet[2].w = kernel.packet[3].z;
 
  484   kernel.packet[3].z = tmp;
 
  489   double tmp = kernel.packet[0].y;
 
  490   kernel.packet[0].y = kernel.packet[1].x;
 
  491   kernel.packet[1].x = tmp;
 
  494 #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) 
  498 #if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) 
  500 typedef ulonglong2 Packet4h2;
 
  502 template<> 
struct is_arithmetic<Packet4h2> { 
enum { 
value = 
true }; };
 
  505 template<> 
struct is_arithmetic<half2> { 
enum { 
value = 
true }; };
 
  507 template<> 
struct packet_traits<
Eigen::half> : default_packet_traits
 
  509   typedef Packet4h2 
type;
 
  510   typedef Packet4h2 
half;
 
  532 #if defined(EIGEN_GPU_COMPILE_PHASE) 
  533   return __halves2half2(
a, 
b);
 
  536   return __floats2half2_rn(__half2float(
a), __half2float(
b));
 
  541 #if defined(EIGEN_GPU_COMPILE_PHASE) 
  542   return __low2half(
a);
 
  544   return __float2half(__low2float(
a));
 
  549 #if defined(EIGEN_GPU_COMPILE_PHASE) 
  550   return __high2half(
a);
 
  552   return __float2half(__high2float(
a));
 
  559 #if defined(EIGEN_GPU_COMPILE_PHASE) 
  560   return __half2half2(from);
 
  562   const float f = __half2float(from);
 
  563   return __floats2half2_rn(
f, 
f);
 
  571   half2* p_alias = 
reinterpret_cast<half2*
>(&r);
 
  572   p_alias[0] = pset1<half2>(from);
 
  573   p_alias[1] = pset1<half2>(from);
 
  574   p_alias[2] = pset1<half2>(from);
 
  575   p_alias[3] = pset1<half2>(from);
 
  584   return *
reinterpret_cast<const half2*
>(from);
 
  588   return combine_half(from[0], from[1]);
 
  592   return combine_half(from[0], from[0]);
 
  597   *
reinterpret_cast<half2*
>(to) = from;
 
  602   to[0] = get_half2_low(from);
 
  603   to[1] = get_half2_high(from);
 
  609 #if defined(EIGEN_GPU_HAS_LDG) 
  611   return __ldg(
reinterpret_cast<const half2*
>(from));
 
  613   return combine_half(*(from+0), *(from+1));
 
  619 #if defined(EIGEN_GPU_HAS_LDG) 
  620   return __halves2half2(__ldg(from+0), __ldg(from+1));
 
  622   return combine_half(*(from+0), *(from+1));
 
  628   return combine_half(from[0*stride], from[1*stride]);
 
  633   to[stride*0] = get_half2_low(from);
 
  634   to[stride*1] = get_half2_high(from);
 
  638   return get_half2_low(
a);
 
  642   half 
a1 = get_half2_low(
a);
 
  643   half 
a2 = get_half2_high(
a);
 
  646   return combine_half(result1, result2);
 
  651   return pset1<half2>(true_half);
 
  656   return pset1<half2>(false_half);
 
  661   __half 
a1 = get_half2_low(kernel.packet[0]);
 
  662   __half 
a2 = get_half2_high(kernel.packet[0]);
 
  663   __half 
b1 = get_half2_low(kernel.packet[1]);
 
  664   __half 
b2 = get_half2_high(kernel.packet[1]);
 
  665   kernel.packet[0] = combine_half(
a1, 
b1);
 
  666   kernel.packet[1] = combine_half(
a2, 
b2);
 
  670 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  671   return __halves2half2(
a, __hadd(
a, __float2half(1.0
f)));
 
  673   float f = __half2float(
a) + 1.0f;
 
  674   return combine_half(
a, __float2half(
f));
 
  681   half mask_low = get_half2_low(mask);
 
  682   half mask_high = get_half2_high(mask);
 
  683   half result_low = mask_low == half(0) ? get_half2_low(
b) : get_half2_low(
a);
 
  684   half result_high = mask_high == half(0) ? get_half2_high(
b) : get_half2_high(
a);
 
  685   return combine_half(result_low, result_high);
 
  692   half 
a1 = get_half2_low(
a);
 
  693   half 
a2 = get_half2_high(
a);
 
  694   half 
b1 = get_half2_low(
b);
 
  695   half 
b2 = get_half2_high(
b);
 
  696   half eq1 = __half2float(
a1) == __half2float(
b1) ? true_half : false_half;
 
  697   half eq2 = __half2float(
a2) == __half2float(
b2) ? true_half : false_half;
 
  698   return combine_half(eq1, eq2);
 
  705   half 
a1 = get_half2_low(
a);
 
  706   half 
a2 = get_half2_high(
a);
 
  707   half 
b1 = get_half2_low(
b);
 
  708   half 
b2 = get_half2_high(
b);
 
  709   half eq1 = __half2float(
a1) < __half2float(
b1) ? true_half : false_half;
 
  710   half eq2 = __half2float(
a2) < __half2float(
b2) ? true_half : false_half;
 
  711   return combine_half(eq1, eq2);
 
  716   half 
a1 = get_half2_low(
a);
 
  717   half 
a2 = get_half2_high(
a);
 
  718   half 
b1 = get_half2_low(
b);
 
  719   half 
b2 = get_half2_high(
b);
 
  722   return combine_half(result1, result2);
 
  727   half 
a1 = get_half2_low(
a);
 
  728   half 
a2 = get_half2_high(
a);
 
  729   half 
b1 = get_half2_low(
b);
 
  730   half 
b2 = get_half2_high(
b);
 
  733   return combine_half(result1, result2);
 
  738   half 
a1 = get_half2_low(
a);
 
  739   half 
a2 = get_half2_high(
a);
 
  740   half 
b1 = get_half2_low(
b);
 
  741   half 
b2 = get_half2_high(
b);
 
  744   return combine_half(result1, result2);
 
  749   half 
a1 = get_half2_low(
a);
 
  750   half 
a2 = get_half2_high(
a);
 
  751   half 
b1 = get_half2_low(
b);
 
  752   half 
b2 = get_half2_high(
b);
 
  755   return combine_half(result1, result2);
 
  760 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  761   return __hadd2(
a, 
b);
 
  763   float a1 = __low2float(
a);
 
  764   float a2 = __high2float(
a);
 
  765   float b1 = __low2float(
b);
 
  766   float b2 = __high2float(
b);
 
  769   return __floats2half2_rn(
r1, 
r2);
 
  775 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  776   return __hsub2(
a, 
b);
 
  778   float a1 = __low2float(
a);
 
  779   float a2 = __high2float(
a);
 
  780   float b1 = __low2float(
b);
 
  781   float b2 = __high2float(
b);
 
  784   return __floats2half2_rn(
r1, 
r2);
 
  789 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  792   float a1 = __low2float(
a);
 
  793   float a2 = __high2float(
a);
 
  794   return __floats2half2_rn(-
a1, -
a2);
 
  802 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  803   return __hmul2(
a, 
b);
 
  805   float a1 = __low2float(
a);
 
  806   float a2 = __high2float(
a);
 
  807   float b1 = __low2float(
b);
 
  808   float b2 = __high2float(
b);
 
  811   return __floats2half2_rn(
r1, 
r2);
 
  818 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  819    return __hfma2(
a, 
b, 
c);
 
  821   float a1 = __low2float(
a);
 
  822   float a2 = __high2float(
a);
 
  823   float b1 = __low2float(
b);
 
  824   float b2 = __high2float(
b);
 
  825   float c1 = __low2float(
c);
 
  826   float c2 = __high2float(
c);
 
  829   return __floats2half2_rn(
r1, 
r2);
 
  835 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  836   return __h2div(
a, 
b);
 
  838   float a1 = __low2float(
a);
 
  839   float a2 = __high2float(
a);
 
  840   float b1 = __low2float(
b);
 
  841   float b2 = __high2float(
b);
 
  844   return __floats2half2_rn(
r1, 
r2);
 
  850   float a1 = __low2float(
a);
 
  851   float a2 = __high2float(
a);
 
  852   float b1 = __low2float(
b);
 
  853   float b2 = __high2float(
b);
 
  854   __half 
r1 = 
a1 < 
b1 ? get_half2_low(
a) : get_half2_low(
b);
 
  855   __half 
r2 = 
a2 < 
b2 ? get_half2_high(
a) : get_half2_high(
b);
 
  856   return combine_half(
r1, 
r2);
 
  861   float a1 = __low2float(
a);
 
  862   float a2 = __high2float(
a);
 
  863   float b1 = __low2float(
b);
 
  864   float b2 = __high2float(
b);
 
  865   __half 
r1 = 
a1 > 
b1 ? get_half2_low(
a) : get_half2_low(
b);
 
  866   __half 
r2 = 
a2 > 
b2 ? get_half2_high(
a) : get_half2_high(
b);
 
  867   return combine_half(
r1, 
r2);
 
  871 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  872   return __hadd(__low2half(
a), __high2half(
a));
 
  874   float a1 = __low2float(
a);
 
  875   float a2 = __high2float(
a);
 
  881 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  882   __half 
first = __low2half(
a);
 
  883   __half 
second = __high2half(
a);
 
  886   float a1 = __low2float(
a);
 
  887   float a2 = __high2float(
a);
 
  888   return a1 > 
a2 ? get_half2_low(
a) : get_half2_high(
a);
 
  893 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  894   __half 
first = __low2half(
a);
 
  895   __half 
second = __high2half(
a);
 
  898   float a1 = __low2float(
a);
 
  899   float a2 = __high2float(
a);
 
  900   return a1 < 
a2 ? get_half2_low(
a) : get_half2_high(
a);
 
  905 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
  906   return __hmul(__low2half(
a), __high2half(
a));
 
  908   float a1 = __low2float(
a);
 
  909   float a2 = __high2float(
a);
 
  915   float a1 = __low2float(
a);
 
  916   float a2 = __high2float(
a);
 
  917   float r1 = log1pf(
a1);
 
  918   float r2 = log1pf(
a2);
 
  919   return __floats2half2_rn(
r1, 
r2);
 
  923   float a1 = __low2float(
a);
 
  924   float a2 = __high2float(
a);
 
  925   float r1 = expm1f(
a1);
 
  926   float r2 = expm1f(
a2);
 
  927   return __floats2half2_rn(
r1, 
r2);
 
  930 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \ 
  931   defined(EIGEN_HIP_DEVICE_COMPILE) 
  934 half2 
plog(
const half2& 
a) {
 
  939 half2 
pexp(
const half2& 
a) {
 
  944 half2 
psqrt(
const half2& 
a) {
 
  956   float a1 = __low2float(
a);
 
  957   float a2 = __high2float(
a);
 
  960   return __floats2half2_rn(
r1, 
r2);
 
  964   float a1 = __low2float(
a);
 
  965   float a2 = __high2float(
a);
 
  968   return __floats2half2_rn(
r1, 
r2);
 
  972   float a1 = __low2float(
a);
 
  973   float a2 = __high2float(
a);
 
  974   float r1 = sqrtf(
a1);
 
  975   float r2 = sqrtf(
a2);
 
  976   return __floats2half2_rn(
r1, 
r2);
 
  980   float a1 = __low2float(
a);
 
  981   float a2 = __high2float(
a);
 
  982   float r1 = rsqrtf(
a1);
 
  983   float r2 = rsqrtf(
a2);
 
  984   return __floats2half2_rn(
r1, 
r2);
 
  992   return *
reinterpret_cast<const Packet4h2*
>(from);
 
 1000   half2* p_alias = 
reinterpret_cast<half2*
>(&r);
 
 1001   p_alias[0] = 
ploadu(from + 0);
 
 1002   p_alias[1] = 
ploadu(from + 2);
 
 1003   p_alias[2] = 
ploadu(from + 4);
 
 1004   p_alias[3] = 
ploadu(from + 6);
 
 1012   half2* p_alias = 
reinterpret_cast<half2*
>(&r);
 
 1023   *
reinterpret_cast<Packet4h2*
>(to) = from;
 
 1029   const half2* from_alias = 
reinterpret_cast<const half2*
>(&from);
 
 1030   pstoreu(to + 0,from_alias[0]);
 
 1031   pstoreu(to + 2,from_alias[1]);
 
 1032   pstoreu(to + 4,from_alias[2]);
 
 1033   pstoreu(to + 6,from_alias[3]);
 
 1038 ploadt_ro<Packet4h2, Aligned>(
const Eigen::half* from) {
 
 1039 #if defined(EIGEN_GPU_HAS_LDG) 
 1041   r = __ldg(
reinterpret_cast<const Packet4h2*
>(from));
 
 1045   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1046   r_alias[0] = ploadt_ro_aligned(from + 0);
 
 1047   r_alias[1] = ploadt_ro_aligned(from + 2);
 
 1048   r_alias[2] = ploadt_ro_aligned(from + 4);
 
 1049   r_alias[3] = ploadt_ro_aligned(from + 6);
 
 1056 ploadt_ro<Packet4h2, Unaligned>(
const Eigen::half* from) {
 
 1058   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1059   r_alias[0] = ploadt_ro_unaligned(from + 0);
 
 1060   r_alias[1] = ploadt_ro_unaligned(from + 2);
 
 1061   r_alias[2] = ploadt_ro_unaligned(from + 4);
 
 1062   r_alias[3] = ploadt_ro_unaligned(from + 6);
 
 1070   half2* p_alias = 
reinterpret_cast<half2*
>(&r);
 
 1071   p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
 
 1072   p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
 
 1073   p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
 
 1074   p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
 
 1081   const half2* from_alias = 
reinterpret_cast<const half2*
>(&from);
 
 1082   pscatter(to + stride * 0, from_alias[0], stride);
 
 1083   pscatter(to + stride * 2, from_alias[1], stride);
 
 1084   pscatter(to + stride * 4, from_alias[2], stride);
 
 1085   pscatter(to + stride * 6, from_alias[3], stride);
 
 1090     const Packet4h2& 
a) {
 
 1091   return pfirst(*(
reinterpret_cast<const half2*
>(&
a)));
 
 1096     const Packet4h2& 
a) {
 
 1098   half2* p_alias = 
reinterpret_cast<half2*
>(&r);
 
 1099   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1100   p_alias[0] = 
pabs(a_alias[0]);
 
 1101   p_alias[1] = 
pabs(a_alias[1]);
 
 1102   p_alias[2] = 
pabs(a_alias[2]);
 
 1103   p_alias[3] = 
pabs(a_alias[3]);
 
 1109     const Packet4h2& ) {
 
 1111   return pset1<Packet4h2>(true_half);
 
 1117   return pset1<Packet4h2>(false_half);
 
 1121     double* d_row0, 
double* d_row1, 
double* d_row2, 
double* d_row3,
 
 1122     double* d_row4, 
double* d_row5, 
double* d_row6, 
double* d_row7) {
 
 1125   d_row0[1] = d_row4[0];
 
 1129   d_row1[1] = d_row5[0];
 
 1133   d_row2[1] = d_row6[0];
 
 1137   d_row3[1] = d_row7[0];
 
 1142     half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
 
 1145   f_row0[1] = f_row2[0];
 
 1149   f_row1[1] = f_row3[0];
 
 1154 ptranspose_half(half2& 
f0, half2& 
f1) {
 
 1155   __half 
a1 = get_half2_low(
f0);
 
 1156   __half 
a2 = get_half2_high(
f0);
 
 1157   __half 
b1 = get_half2_low(
f1);
 
 1158   __half 
b2 = get_half2_high(
f1);
 
 1159   f0 = combine_half(
a1, 
b1);
 
 1160   f1 = combine_half(
a2, 
b2);
 
 1164 ptranspose(PacketBlock<Packet4h2,8>& kernel) {
 
 1165   double* d_row0 = 
reinterpret_cast<double*
>(&kernel.packet[0]);
 
 1166   double* d_row1 = 
reinterpret_cast<double*
>(&kernel.packet[1]);
 
 1167   double* d_row2 = 
reinterpret_cast<double*
>(&kernel.packet[2]);
 
 1168   double* d_row3 = 
reinterpret_cast<double*
>(&kernel.packet[3]);
 
 1169   double* d_row4 = 
reinterpret_cast<double*
>(&kernel.packet[4]);
 
 1170   double* d_row5 = 
reinterpret_cast<double*
>(&kernel.packet[5]);
 
 1171   double* d_row6 = 
reinterpret_cast<double*
>(&kernel.packet[6]);
 
 1172   double* d_row7 = 
reinterpret_cast<double*
>(&kernel.packet[7]);
 
 1173   ptranspose_double(d_row0, d_row1, d_row2, d_row3,
 
 1174                     d_row4, d_row5, d_row6, d_row7);
 
 1177   half2* f_row0 = 
reinterpret_cast<half2*
>(d_row0);
 
 1178   half2* f_row1 = 
reinterpret_cast<half2*
>(d_row1);
 
 1179   half2* f_row2 = 
reinterpret_cast<half2*
>(d_row2);
 
 1180   half2* f_row3 = 
reinterpret_cast<half2*
>(d_row3);
 
 1181   ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
 
 1182   ptranspose_half(f_row0[0], f_row1[0]);
 
 1183   ptranspose_half(f_row0[1], f_row1[1]);
 
 1184   ptranspose_half(f_row2[0], f_row3[0]);
 
 1185   ptranspose_half(f_row2[1], f_row3[1]);
 
 1187   f_row0 = 
reinterpret_cast<half2*
>(d_row0 + 1);
 
 1188   f_row1 = 
reinterpret_cast<half2*
>(d_row1 + 1);
 
 1189   f_row2 = 
reinterpret_cast<half2*
>(d_row2 + 1);
 
 1190   f_row3 = 
reinterpret_cast<half2*
>(d_row3 + 1);
 
 1191   ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
 
 1192   ptranspose_half(f_row0[0], f_row1[0]);
 
 1193   ptranspose_half(f_row0[1], f_row1[1]);
 
 1194   ptranspose_half(f_row2[0], f_row3[0]);
 
 1195   ptranspose_half(f_row2[1], f_row3[1]);
 
 1197   f_row0 = 
reinterpret_cast<half2*
>(d_row4);
 
 1198   f_row1 = 
reinterpret_cast<half2*
>(d_row5);
 
 1199   f_row2 = 
reinterpret_cast<half2*
>(d_row6);
 
 1200   f_row3 = 
reinterpret_cast<half2*
>(d_row7);
 
 1201   ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
 
 1202   ptranspose_half(f_row0[0], f_row1[0]);
 
 1203   ptranspose_half(f_row0[1], f_row1[1]);
 
 1204   ptranspose_half(f_row2[0], f_row3[0]);
 
 1205   ptranspose_half(f_row2[1], f_row3[1]);
 
 1207   f_row0 = 
reinterpret_cast<half2*
>(d_row4 + 1);
 
 1208   f_row1 = 
reinterpret_cast<half2*
>(d_row5 + 1);
 
 1209   f_row2 = 
reinterpret_cast<half2*
>(d_row6 + 1);
 
 1210   f_row3 = 
reinterpret_cast<half2*
>(d_row7 + 1);
 
 1211   ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
 
 1212   ptranspose_half(f_row0[0], f_row1[0]);
 
 1213   ptranspose_half(f_row0[1], f_row1[1]);
 
 1214   ptranspose_half(f_row2[0], f_row3[0]);
 
 1215   ptranspose_half(f_row2[1], f_row3[1]);
 
 1222 #if defined(EIGEN_HIP_DEVICE_COMPILE) 
 1225   half2* p_alias = 
reinterpret_cast<half2*
>(&r);
 
 1226   p_alias[0] = __halves2half2(
a, __hadd(
a, __float2half(1.0
f)));
 
 1227   p_alias[1] = __halves2half2(__hadd(
a, __float2half(2.0
f)),
 
 1228                               __hadd(
a, __float2half(3.0
f)));
 
 1229   p_alias[2] = __halves2half2(__hadd(
a, __float2half(4.0
f)),
 
 1230                               __hadd(
a, __float2half(5.0
f)));
 
 1231   p_alias[3] = __halves2half2(__hadd(
a, __float2half(6.0
f)),
 
 1232                               __hadd(
a, __float2half(7.0
f)));
 
 1234 #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 
 1236   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1238   half2 
b = pset1<half2>(
a);
 
 1240   half2 half_offset0 = __halves2half2(__float2half(0.0
f),__float2half(2.0
f));
 
 1241   half2 half_offset1 = __halves2half2(__float2half(4.0
f),__float2half(6.0
f));
 
 1243   c = __hadd2(
b, half_offset0);
 
 1244   r_alias[0] = 
plset(__low2half(
c));
 
 1245   r_alias[1] = 
plset(__high2half(
c));
 
 1247   c = __hadd2(
b, half_offset1);
 
 1248   r_alias[2] = 
plset(__low2half(
c));
 
 1249   r_alias[3] = 
plset(__high2half(
c));
 
 1254   float f = __half2float(
a);
 
 1256   half2* p_alias = 
reinterpret_cast<half2*
>(&r);
 
 1257   p_alias[0] = combine_half(
a, __float2half(
f + 1.0
f));
 
 1258   p_alias[1] = combine_half(__float2half(
f + 2.0
f), __float2half(
f + 3.0
f));
 
 1259   p_alias[2] = combine_half(__float2half(
f + 4.0
f), __float2half(
f + 5.0
f));
 
 1260   p_alias[3] = combine_half(__float2half(
f + 6.0
f), __float2half(
f + 7.0
f));
 
 1267 pselect<Packet4h2>(
const Packet4h2& mask, 
const Packet4h2& 
a,
 
 1268                    const Packet4h2& 
b) {
 
 1270   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1271   const half2* mask_alias = 
reinterpret_cast<const half2*
>(&mask);
 
 1272   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1273   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1274   r_alias[0] = 
pselect(mask_alias[0], a_alias[0], b_alias[0]);
 
 1275   r_alias[1] = 
pselect(mask_alias[1], a_alias[1], b_alias[1]);
 
 1276   r_alias[2] = 
pselect(mask_alias[2], a_alias[2], b_alias[2]);
 
 1277   r_alias[3] = 
pselect(mask_alias[3], a_alias[3], b_alias[3]);
 
 1283 pcmp_eq<Packet4h2>(
const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1285   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1286   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1287   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1288   r_alias[0] = 
pcmp_eq(a_alias[0], b_alias[0]);
 
 1289   r_alias[1] = 
pcmp_eq(a_alias[1], b_alias[1]);
 
 1290   r_alias[2] = 
pcmp_eq(a_alias[2], b_alias[2]);
 
 1291   r_alias[3] = 
pcmp_eq(a_alias[3], b_alias[3]);
 
 1297     const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1299   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1300   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1301   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1302   r_alias[0] = 
pand(a_alias[0], b_alias[0]);
 
 1303   r_alias[1] = 
pand(a_alias[1], b_alias[1]);
 
 1304   r_alias[2] = 
pand(a_alias[2], b_alias[2]);
 
 1305   r_alias[3] = 
pand(a_alias[3], b_alias[3]);
 
 1311     const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1313   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1314   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1315   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1316   r_alias[0] = 
por(a_alias[0], b_alias[0]);
 
 1317   r_alias[1] = 
por(a_alias[1], b_alias[1]);
 
 1318   r_alias[2] = 
por(a_alias[2], b_alias[2]);
 
 1319   r_alias[3] = 
por(a_alias[3], b_alias[3]);
 
 1325     const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1327   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1328   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1329   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1330   r_alias[0] = 
pxor(a_alias[0], b_alias[0]);
 
 1331   r_alias[1] = 
pxor(a_alias[1], b_alias[1]);
 
 1332   r_alias[2] = 
pxor(a_alias[2], b_alias[2]);
 
 1333   r_alias[3] = 
pxor(a_alias[3], b_alias[3]);
 
 1339 pandnot<Packet4h2>(
const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1341   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1342   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1343   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1344   r_alias[0] = 
pandnot(a_alias[0], b_alias[0]);
 
 1345   r_alias[1] = 
pandnot(a_alias[1], b_alias[1]);
 
 1346   r_alias[2] = 
pandnot(a_alias[2], b_alias[2]);
 
 1347   r_alias[3] = 
pandnot(a_alias[3], b_alias[3]);
 
 1353     const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1355   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1356   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1357   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1358   r_alias[0] = 
padd(a_alias[0], b_alias[0]);
 
 1359   r_alias[1] = 
padd(a_alias[1], b_alias[1]);
 
 1360   r_alias[2] = 
padd(a_alias[2], b_alias[2]);
 
 1361   r_alias[3] = 
padd(a_alias[3], b_alias[3]);
 
 1367     const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1369   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1370   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1371   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1372   r_alias[0] = 
psub(a_alias[0], b_alias[0]);
 
 1373   r_alias[1] = 
psub(a_alias[1], b_alias[1]);
 
 1374   r_alias[2] = 
psub(a_alias[2], b_alias[2]);
 
 1375   r_alias[3] = 
psub(a_alias[3], b_alias[3]);
 
 1382   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1383   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1384   r_alias[0] = 
pnegate(a_alias[0]);
 
 1385   r_alias[1] = 
pnegate(a_alias[1]);
 
 1386   r_alias[2] = 
pnegate(a_alias[2]);
 
 1387   r_alias[3] = 
pnegate(a_alias[3]);
 
 1398     const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1400   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1401   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1402   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1403   r_alias[0] = 
pmul(a_alias[0], b_alias[0]);
 
 1404   r_alias[1] = 
pmul(a_alias[1], b_alias[1]);
 
 1405   r_alias[2] = 
pmul(a_alias[2], b_alias[2]);
 
 1406   r_alias[3] = 
pmul(a_alias[3], b_alias[3]);
 
 1412     const Packet4h2& 
a, 
const Packet4h2& 
b, 
const Packet4h2& 
c) {
 
 1414   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1415   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1416   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1417   const half2* c_alias = 
reinterpret_cast<const half2*
>(&
c);
 
 1418   r_alias[0] = 
pmadd(a_alias[0], b_alias[0], c_alias[0]);
 
 1419   r_alias[1] = 
pmadd(a_alias[1], b_alias[1], c_alias[1]);
 
 1420   r_alias[2] = 
pmadd(a_alias[2], b_alias[2], c_alias[2]);
 
 1421   r_alias[3] = 
pmadd(a_alias[3], b_alias[3], c_alias[3]);
 
 1427     const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1429   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1430   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1431   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1432   r_alias[0] = 
pdiv(a_alias[0], b_alias[0]);
 
 1433   r_alias[1] = 
pdiv(a_alias[1], b_alias[1]);
 
 1434   r_alias[2] = 
pdiv(a_alias[2], b_alias[2]);
 
 1435   r_alias[3] = 
pdiv(a_alias[3], b_alias[3]);
 
 1441     const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1443   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1444   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1445   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1446   r_alias[0] = 
pmin(a_alias[0], b_alias[0]);
 
 1447   r_alias[1] = 
pmin(a_alias[1], b_alias[1]);
 
 1448   r_alias[2] = 
pmin(a_alias[2], b_alias[2]);
 
 1449   r_alias[3] = 
pmin(a_alias[3], b_alias[3]);
 
 1455     const Packet4h2& 
a, 
const Packet4h2& 
b) {
 
 1457   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1458   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1459   const half2* b_alias = 
reinterpret_cast<const half2*
>(&
b);
 
 1460   r_alias[0] = 
pmax(a_alias[0], b_alias[0]);
 
 1461   r_alias[1] = 
pmax(a_alias[1], b_alias[1]);
 
 1462   r_alias[2] = 
pmax(a_alias[2], b_alias[2]);
 
 1463   r_alias[3] = 
pmax(a_alias[3], b_alias[3]);
 
 1469     const Packet4h2& 
a) {
 
 1470   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1478     const Packet4h2& 
a) {
 
 1479   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1486 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 
 1487   return (__hgt(
first, second) ? 
first : second);
 
 1489   float ffirst  = __half2float(
first);
 
 1490   float fsecond = __half2float(second);
 
 1497     const Packet4h2& 
a) {
 
 1498   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1505 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) 
 1506   return (__hlt(
first, second) ? 
first : second);
 
 1508   float ffirst  = __half2float(
first);
 
 1509   float fsecond = __half2float(second);
 
 1517     const Packet4h2& 
a) {
 
 1518   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1520                                        pmul(a_alias[2], a_alias[3])));
 
 1525 plog1p<Packet4h2>(
const Packet4h2& 
a) {
 
 1527   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1528   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1529   r_alias[0] = 
plog1p(a_alias[0]);
 
 1530   r_alias[1] = 
plog1p(a_alias[1]);
 
 1531   r_alias[2] = 
plog1p(a_alias[2]);
 
 1532   r_alias[3] = 
plog1p(a_alias[3]);
 
 1538 pexpm1<Packet4h2>(
const Packet4h2& 
a) {
 
 1540   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1541   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1542   r_alias[0] = 
pexpm1(a_alias[0]);
 
 1543   r_alias[1] = 
pexpm1(a_alias[1]);
 
 1544   r_alias[2] = 
pexpm1(a_alias[2]);
 
 1545   r_alias[3] = 
pexpm1(a_alias[3]);
 
 1552   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1553   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1554   r_alias[0] = 
plog(a_alias[0]);
 
 1555   r_alias[1] = 
plog(a_alias[1]);
 
 1556   r_alias[2] = 
plog(a_alias[2]);
 
 1557   r_alias[3] = 
plog(a_alias[3]);
 
 1564   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1565   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1566   r_alias[0] = 
pexp(a_alias[0]);
 
 1567   r_alias[1] = 
pexp(a_alias[1]);
 
 1568   r_alias[2] = 
pexp(a_alias[2]);
 
 1569   r_alias[3] = 
pexp(a_alias[3]);
 
 1576   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1577   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1578   r_alias[0] = 
psqrt(a_alias[0]);
 
 1579   r_alias[1] = 
psqrt(a_alias[1]);
 
 1580   r_alias[2] = 
psqrt(a_alias[2]);
 
 1581   r_alias[3] = 
psqrt(a_alias[3]);
 
 1587 prsqrt<Packet4h2>(
const Packet4h2& 
a) {
 
 1589   half2* r_alias = 
reinterpret_cast<half2*
>(&r);
 
 1590   const half2* a_alias = 
reinterpret_cast<const half2*
>(&
a);
 
 1591   r_alias[0] = 
prsqrt(a_alias[0]);
 
 1592   r_alias[1] = 
prsqrt(a_alias[1]);
 
 1593   r_alias[2] = 
prsqrt(a_alias[2]);
 
 1594   r_alias[3] = 
prsqrt(a_alias[3]);
 
 1603 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
 1604   return __hadd2(
a, 
b);
 
 1606   float a1 = __low2float(
a);
 
 1607   float a2 = __high2float(
a);
 
 1608   float b1 = __low2float(
b);
 
 1609   float b2 = __high2float(
b);
 
 1612   return __floats2half2_rn(
r1, 
r2);
 
 1619 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
 1620   return __hmul2(
a, 
b);
 
 1622   float a1 = __low2float(
a);
 
 1623   float a2 = __high2float(
a);
 
 1624   float b1 = __low2float(
b);
 
 1625   float b2 = __high2float(
b);
 
 1628   return __floats2half2_rn(
r1, 
r2);
 
 1635 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) 
 1636   return __h2div(
a, 
b);
 
 1638   float a1 = __low2float(
a);
 
 1639   float a2 = __high2float(
a);
 
 1640   float b1 = __low2float(
b);
 
 1641   float b2 = __high2float(
b);
 
 1644   return __floats2half2_rn(
r1, 
r2);
 
 1651   float a1 = __low2float(
a);
 
 1652   float a2 = __high2float(
a);
 
 1653   float b1 = __low2float(
b);
 
 1654   float b2 = __high2float(
b);
 
 1655   __half 
r1 = 
a1 < 
b1 ? get_half2_low(
a) : get_half2_low(
b);
 
 1656   __half 
r2 = 
a2 < 
b2 ? get_half2_high(
a) : get_half2_high(
b);
 
 1657   return combine_half(
r1, 
r2);
 
 1663   float a1 = __low2float(
a);
 
 1664   float a2 = __high2float(
a);
 
 1665   float b1 = __low2float(
b);
 
 1666   float b2 = __high2float(
b);
 
 1667   __half 
r1 = 
a1 > 
b1 ? get_half2_low(
a) : get_half2_low(
b);
 
 1668   __half 
r2 = 
a2 > 
b2 ? get_half2_high(
a) : get_half2_high(
b);
 
 1669   return combine_half(
r1, 
r2);
 
 1674 #endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) 
 1676 #undef EIGEN_GPU_HAS_LDG 
 1677 #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC 
 1678 #undef EIGEN_GPU_HAS_FP16_ARITHMETIC 
 1685 #endif // EIGEN_PACKET_MATH_GPU_H