10 #ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
11 #define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
18 #if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923
// Declares a const Packet16f variable named p16f_<NAME>, initialised with
// pset1<Packet16f>(X). The macro carries no trailing semicolon; the use sites
// supply it.
20 #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
21 const Packet16f p16f_##NAME = pset1<Packet16f>(X)
// Declares a const Packet16f variable named p16f_<NAME> from an integer X:
// the value is first placed in a Packet16i via pset1<Packet16i>(X) and then
// converted with preinterpret<Packet16f,Packet16i> (presumably a bit-level
// reinterpretation; it is used below with the bit pattern 0x7f800000).
23 #define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
24 const Packet16f p16f_##NAME = preinterpret<Packet16f,Packet16i>(pset1<Packet16i>(X))
// Declares a const Packet8d variable named p8d_<NAME>, initialised with
// pset1<Packet8d>(X). The macro carries no trailing semicolon; the use sites
// supply it.
26 #define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
27 const Packet8d p8d_##NAME = pset1<Packet8d>(X)
// Declares a const Packet8d variable named p8d_<NAME> from a 64-bit integer
// bit pattern X: _mm512_set1_epi64 replicates X into every 64-bit lane and
// _mm512_castsi512_pd reinterprets those bits as doubles (no value conversion).
29 #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
30 const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
// Declares a const Packet16bf variable named p16bf_<NAME>, initialised with
// pset1<Packet16bf>(X). The macro carries no trailing semicolon.
32 #define _EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \
33 const Packet16bf p16bf_##NAME = pset1<Packet16bf>(X)
// Declares a const Packet16bf variable named p16bf_<NAME> from an integer X:
// the value is placed in a Packet16i via pset1<Packet16i>(X) and converted with
// preinterpret<Packet16bf,Packet16i> (presumably a bit-level reinterpretation,
// mirroring the Packet16f variant above - confirm against preinterpret).
35 #define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \
36 const Packet16bf p16bf_##NAME = preinterpret<Packet16bf,Packet16i>(pset1<Packet16i>(X))
// plog2 specialised for Packet8d; takes the input packet by const reference as
// _x. NOTE(review): only the tail of the signature is visible in this excerpt -
// the return type and the body are outside it.
61 plog2<Packet8d>(
const Packet8d& _x) {
// Constant table for a Packet16f kernel whose signature is outside this
// excerpt. NOTE(review): the exp_* / cephes_exp_* names suggest the
// single-precision exponential (pexp<Packet16f>) - confirm.
// Each line below declares a const Packet16f named p16f_<first argument>.
74 _EIGEN_DECLARE_CONST_Packet16f(1, 1.0
f);
75 _EIGEN_DECLARE_CONST_Packet16f(half, 0.5
f);
76 _EIGEN_DECLARE_CONST_Packet16f(127, 127.0
f);
// Bounds named exp_hi / exp_lo; presumably the range the input is clamped to -
// the clamping code itself is not part of this excerpt.
78 _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950
f);
79 _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949
f);
// 1.442695... is log2(e).
81 _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341
f);
// Polynomial coefficients p0..p5, consumed by the pmadd chain further down.
83 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500
E-4
f);
84 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507
E-3
f);
85 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073
E-3
f);
86 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894
E-2
f);
87 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459
E-1
f);
88 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201
E-1
f);
// -0.693147... is -ln(2); its use is in lines not shown here.
99 _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453
f);
// Start of the polynomial evaluation in r (r, y, y1, y2 are declared in lines
// outside this excerpt). Assuming pmadd(a, b, c) computes a * b + c - confirm:
//   y  = p0 * r + p1
//   y1 = p3 * r + p4
//   y2 = r + 1
//   y  = y * r + p2
// The steps that combine y, y1 and y2 are not visible here.
106 y =
pmadd(p16f_cephes_exp_p0, r, p16f_cephes_exp_p1);
107 y1 =
pmadd(p16f_cephes_exp_p3, r, p16f_cephes_exp_p4);
108 y2 =
padd(r, p16f_1);
109 y =
pmadd(
y, r, p16f_cephes_exp_p2);
// Shift each 32-bit lane of emm0 left by 23 bits, i.e. into the position of
// the binary32 exponent field. Presumably emm0 holds a biased exponent at this
// point so that the shifted value is a power of two - confirm against the
// missing lines.
116 emm0 = _mm512_slli_epi32(emm0, 23);
// Reinterpret emm0 as floats, scale y by it, and return the lane-wise pmax of
// that product and the original input _x.
119 return pmax(
pmul(
y, _mm512_castsi512_ps(emm0)), _x);
// pexp specialised for Packet8d; takes the input packet by const reference as
// _x. NOTE(review): only the tail of the signature is visible in this excerpt -
// the return type and the body are outside it.
124 pexp<Packet8d>(
const Packet8d& _x) {
// Fragment of a Packet16f routine whose signature is outside this excerpt
// (presumably the fast psqrt<Packet16f> path - confirm).
// denormal_mask is the AND of two lane masks. Only the second operand is
// visible: lanes where _x >= 0 under an ordered, quiet compare (false for NaN).
168 __mmask16 denormal_mask = _mm512_kand(
171 _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
// Lanes whose mask bit is set yield 0; all other lanes yield _x * x, where x is
// computed in lines not shown (presumably an approximation of 1/sqrt(_x)).
179 return _mm512_mask_blend_ps(denormal_mask,
pmul(_x,
x), _mm512_setzero_ps());
// psqrt specialised for Packet8d (return type outside this excerpt).
184 psqrt<Packet8d>(
const Packet8d& _x) {
// denormal_mask is the AND of two lane masks. Only the second operand is
// visible: lanes where _x >= 0 under an ordered, quiet compare (false for NaN).
// NOTE(review): _mm512_cmp_pd_mask yields an 8-lane mask; it is held in a
// __mmask16 here.
186 __mmask16 denormal_mask = _mm512_kand(
189 _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
// Lanes whose mask bit is set yield 0; all other lanes yield _x * x, where x is
// computed in lines not shown (presumably an approximation of 1/sqrt(_x)).
199 return _mm512_mask_blend_pd(denormal_mask,
pmul(_x,
x), _mm512_setzero_pd());
// Returns the lane-wise single-precision square root of x via _mm512_sqrt_ps.
// The enclosing signature is outside this excerpt.
204 return _mm512_sqrt_ps(
x);
// Returns the lane-wise double-precision square root of x via _mm512_sqrt_pd.
// The enclosing signature is outside this excerpt.
209 return _mm512_sqrt_pd(
x);
// With AVX512ER available, the reciprocal square root is taken directly from
// _mm512_rsqrt28_ps (an approximation with relative error below 2^-28 per the
// Intel intrinsics documentation). The enclosing signature is outside this
// excerpt.
217 #if defined(EIGEN_VECTORIZE_AVX512ER)
221 return _mm512_rsqrt28_ps(
x);
// EIGEN_FAST_MATH branch: approximate reciprocal square root with special-case
// handling. The enclosing signature and the refinement that produces y_newton
// are outside this excerpt.
223 #elif EIGEN_FAST_MATH
// 0x7f800000 is the binary32 bit pattern of +infinity.
228 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(
inf, 0x7f800000);
// Presumably the constants of the Newton-Raphson step in the missing lines.
229 _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5
f);
230 _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5
f);
// Lanes equal to +inf, lanes <= 0 (ordered compares, so NaN lanes are in
// neither), and their union.
235 __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
236 __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
237 __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
// Hardware approximation of 1/sqrt(_x), relative error below 2^-14 per the
// Intel intrinsics documentation.
241 Packet16f y_approx = _mm512_rsqrt14_ps(_x);
// Lanes flagged in not_finite_pos_mask take the raw approximation y_approx;
// every other lane takes y_newton.
252 return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
// Computes 1 / sqrt(x) lane-wise with a full divide and square root. The
// enclosing signature and the preprocessor branch this belongs to are outside
// this excerpt.
258 _EIGEN_DECLARE_CONST_Packet16f(one, 1.0
f);
259 return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(
x));
// prsqrt specialised for Packet8d: approximate reciprocal square root with
// special-case handling. The return type, the definition of neg_half, the
// initial value of y_newton and the closing of the preprocessor conditionals
// are outside this excerpt.
270 prsqrt<Packet8d>(
const Packet8d& _x) {
271 _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
272 _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
// 0x7ff0000000000000 is the binary64 bit pattern of +infinity.
273 _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(
inf, 0x7ff0000000000000LL);
// Lanes equal to +inf, lanes <= 0 (ordered compares, so NaN lanes are in
// neither), and their union.
278 __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
279 __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
280 __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
// Initial approximation: rsqrt28 with AVX512ER, otherwise rsqrt14 (relative
// error below 2^-28 / 2^-14 per the Intel intrinsics documentation). The two
// declarations are alternatives; presumably an #else / #endif between them was
// dropped from this excerpt.
284 #if defined(EIGEN_VECTORIZE_AVX512ER)
285 Packet8d y_approx = _mm512_rsqrt28_pd(_x);
287 Packet8d y_approx = _mm512_rsqrt14_pd(_x);
// Refinement step applied only without AVX512ER:
//   y_newton = y_newton * (y_newton * (neg_half * y_newton) + 1.5)
// assuming pmadd(a, b, c) computes a * b + c - confirm.
297 #if !defined(EIGEN_VECTORIZE_AVX512ER)
298 y_newton =
pmul(y_newton,
pmadd(y_newton,
pmul(neg_half, y_newton), p8d_one_point_five));
// Lanes flagged in not_finite_pos_mask take the raw approximation y_approx;
// every other lane takes the refined y_newton.
303 return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
308 _EIGEN_DECLARE_CONST_Packet8d(one, 1.0
f);
309 return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(
x));
362 #endif // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_