10 #ifndef EIGEN_MATH_FUNCTIONS_AVX_H 11 #define EIGEN_MATH_FUNCTIONS_AVX_H 23 #ifdef EIGEN_VECTORIZE_AVX2 24 return _mm256_slli_epi32(v, n);
26 __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);
27 __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
28 return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
34 #ifdef EIGEN_VECTORIZE_AVX2 35 return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
37 __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
38 __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
39 return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
65 Packet8f shift = _mm256_floor_ps(
padd(z, p8f_one_over_four));
66 x =
pmadd(shift, p8f_neg_pi_first, x);
67 x =
pmadd(shift, p8f_neg_pi_second, x);
68 x =
pmadd(shift, p8f_neg_pi_third, x);
69 z =
pmul(x, p8f_four_over_pi);
73 Packet8i shift_ints = _mm256_cvtps_epi32(shift);
74 Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
79 Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
88 Packet8f right =
pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
89 right =
pmadd(right, z_minus_two2, p8f_coeff_right_2);
90 right =
pmadd(right, z_minus_two2, p8f_coeff_right_0);
98 Packet8f left =
pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
99 left =
pmadd(left, z2, p8f_coeff_left_3);
100 left =
pmadd(left, z2, p8f_coeff_left_1);
101 left =
pmul(left, z);
104 left = _mm256_andnot_ps(ival_mask, left);
105 right = _mm256_and_ps(ival_mask, right);
106 Packet8f res = _mm256_or_ps(left, right);
109 res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
147 Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ);
148 Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
151 x =
pmax(x, p8f_min_norm_pos);
154 Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
157 x = _mm256_and_ps(x, p8f_inv_mant_mask);
158 x = _mm256_or_ps(x, p8f_half);
167 Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
168 Packet8f tmp = _mm256_and_ps(x, mask);
170 e =
psub(e, _mm256_and_ps(p8f_1, mask));
179 y =
pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
180 y1 =
pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
181 y2 =
pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
182 y =
pmadd(y, x, p8f_cephes_log_p2);
183 y1 =
pmadd(y1, x, p8f_cephes_log_p5);
184 y2 =
pmadd(y2, x, p8f_cephes_log_p8);
185 y =
pmadd(y, x3, y1);
186 y =
pmadd(y, x3, y2);
190 y1 =
pmul(e, p8f_cephes_log_q1);
191 tmp =
pmul(x2, p8f_half);
194 y2 =
pmul(e, p8f_cephes_log_q2);
200 _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
201 _mm256_and_ps(iszero_mask, p8f_minus_inf));
231 Packet8f m = _mm256_floor_ps(
pmadd(x, p8f_cephes_LOG2EF, p8f_half));
237 #ifdef EIGEN_VECTORIZE_FMA 239 Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
244 r =
psub(r,
pmul(m, p8f_cephes_exp_C2));
252 y =
pmadd(y, r, p8f_cephes_exp_p1);
253 y =
pmadd(y, r, p8f_cephes_exp_p2);
254 y =
pmadd(y, r, p8f_cephes_exp_p3);
255 y =
pmadd(y, r, p8f_cephes_exp_p4);
256 y =
pmadd(y, r, p8f_cephes_exp_p5);
265 return pmax(
pmul(y, _mm256_castsi256_ps(emm0)), _x);
305 x =
pmax(
pmin(x, p4d_exp_hi), p4d_exp_lo);
307 fx =
pmadd(p4d_cephes_LOG2EF, x, p4d_half);
310 fx = _mm256_floor_pd(fx);
315 tmp =
pmul(fx, p4d_cephes_exp_C1);
324 px =
pmadd(px, x2, p4d_cephes_exp_p1);
325 px =
pmadd(px, x2, p4d_cephes_exp_p2);
330 qx =
pmadd(qx, x2, p4d_cephes_exp_q1);
331 qx =
pmadd(qx, x2, p4d_cephes_exp_q2);
332 qx =
pmadd(qx, x2, p4d_cephes_exp_q3);
337 x = _mm256_div_pd(px,
psub(qx, px));
338 x =
pmadd(p4d_2, x, p4d_1);
342 __m128i emm0 = _mm256_cvtpd_epi32(fx);
343 emm0 = _mm_add_epi32(emm0, p4i_1023);
344 emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
345 __m128i lo = _mm_slli_epi64(emm0, 52);
346 __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
347 __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
348 e = _mm256_insertf128_si256(e, hi, 1);
352 return pmax(
pmul(x, _mm256_castsi256_pd(e)), _x);
368 Packet8f denormal_mask = _mm256_and_ps(
369 _mm256_cmp_ps(_x,
pset1<Packet8f>((std::numeric_limits<float>::min)()),
371 _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
378 return _mm256_andnot_ps(denormal_mask,
pmul(_x,x));
383 return _mm256_sqrt_ps(x);
388 return _mm256_sqrt_pd(x);
404 Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
405 Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
408 Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
409 Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
410 Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
411 _mm256_and_ps(zero_mask, p8f_inf));
414 x =
pmul(x,
pmadd(neg_half,
pmul(x, x), p8f_one_point_five));
417 return _mm256_or_ps(x, infs_and_nans);
424 return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
431 return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
439 #endif // EIGEN_MATH_FUNCTIONS_AVX_H static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f ptanh< Packet8f >(const Packet8f &x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d pexp< Packet4d >(const Packet4d &_x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d psqrt< Packet4d >(const Packet4d &x)
static int f(const TensorMap< Tensor< int, 3 > > &tensor)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f prsqrt< Packet8f >(const Packet8f &x)
T generic_fast_tanh_float(const T &a_x)
#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#define _EIGEN_DECLARE_CONST_Packet8i(NAME, X)
#define _EIGEN_DECLARE_CONST_Packet8f(NAME, X)
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
EIGEN_DEVICE_FUNC Packet pmin(const Packet &a, const Packet &b)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f psin< Packet8f >(const Packet8f &_x)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f plog< Packet8f >(const Packet8f &_x)
#define _EIGEN_DECLARE_CONST_Packet4d(NAME, X)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d prsqrt< Packet4d >(const Packet4d &x)
__vector short int Packet8i
#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME, X)
EIGEN_STRONG_INLINE Packet8f pset1< Packet8f >(const float &from)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f pexp< Packet8f >(const Packet8f &_x)
TFSIMD_FORCE_INLINE const tfScalar & z() const
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f psqrt< Packet8f >(const Packet8f &x)
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
EIGEN_DEVICE_FUNC Packet psub(const Packet &a, const Packet &b)
Packet8f pshiftright(Packet8f v, int n)
Packet8i pshiftleft(Packet8i v, int n)
EIGEN_DEVICE_FUNC Packet pmul(const Packet &a, const Packet &b)
EIGEN_DEVICE_FUNC Packet pmax(const Packet &a, const Packet &b)