00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #ifndef EIGEN_MATH_FUNCTIONS_SSE_H
00016 #define EIGEN_MATH_FUNCTIONS_SSE_H
00017
00018 namespace Eigen {
00019
00020 namespace internal {
00021
00022 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
00023 Packet4f plog<Packet4f>(const Packet4f& _x)
00024 {
00025 Packet4f x = _x;
00026 _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
00027 _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
00028 _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
00029
00030 _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
00031
00032
00033 _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
00034 _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);
00035
00036
00037
00038
00039 _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
00040 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
00041 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
00042 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
00043 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
00044 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
00045 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
00046 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
00047 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
00048 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
00049 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
00050 _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
00051
00052
00053 Packet4i emm0;
00054
00055 Packet4f invalid_mask = _mm_cmplt_ps(x, _mm_setzero_ps());
00056 Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
00057
00058 x = pmax(x, p4f_min_norm_pos);
00059 emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
00060
00061
00062 x = _mm_and_ps(x, p4f_inv_mant_mask);
00063 x = _mm_or_ps(x, p4f_half);
00064
00065 emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
00066 Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
00067
00068
00069
00070
00071
00072
00073
00074 Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
00075 Packet4f tmp = _mm_and_ps(x, mask);
00076 x = psub(x, p4f_1);
00077 e = psub(e, _mm_and_ps(p4f_1, mask));
00078 x = padd(x, tmp);
00079
00080 Packet4f x2 = pmul(x,x);
00081 Packet4f x3 = pmul(x2,x);
00082
00083 Packet4f y, y1, y2;
00084 y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
00085 y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
00086 y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
00087 y = pmadd(y , x, p4f_cephes_log_p2);
00088 y1 = pmadd(y1, x, p4f_cephes_log_p5);
00089 y2 = pmadd(y2, x, p4f_cephes_log_p8);
00090 y = pmadd(y, x3, y1);
00091 y = pmadd(y, x3, y2);
00092 y = pmul(y, x3);
00093
00094 y1 = pmul(e, p4f_cephes_log_q1);
00095 tmp = pmul(x2, p4f_half);
00096 y = padd(y, y1);
00097 x = psub(x, tmp);
00098 y2 = pmul(e, p4f_cephes_log_q2);
00099 x = padd(x, y);
00100 x = padd(x, y2);
00101
00102 return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
00103 _mm_and_ps(iszero_mask, p4f_minus_inf));
00104 }
00105
00106 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
00107 Packet4f pexp<Packet4f>(const Packet4f& _x)
00108 {
00109 Packet4f x = _x;
00110 _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
00111 _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
00112 _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
00113
00114
00115 _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
00116 _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
00117
00118 _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
00119 _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
00120 _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
00121
00122 _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
00123 _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
00124 _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
00125 _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
00126 _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
00127 _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
00128
00129 Packet4f tmp = _mm_setzero_ps(), fx;
00130 Packet4i emm0;
00131
00132
00133 x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
00134
00135
00136 fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
00137
00138 #ifdef EIGEN_VECTORIZE_SSE4_1
00139 fx = _mm_floor_ps(fx);
00140 #else
00141 emm0 = _mm_cvttps_epi32(fx);
00142 tmp = _mm_cvtepi32_ps(emm0);
00143
00144 Packet4f mask = _mm_cmpgt_ps(tmp, fx);
00145 mask = _mm_and_ps(mask, p4f_1);
00146 fx = psub(tmp, mask);
00147 #endif
00148
00149 tmp = pmul(fx, p4f_cephes_exp_C1);
00150 Packet4f z = pmul(fx, p4f_cephes_exp_C2);
00151 x = psub(x, tmp);
00152 x = psub(x, z);
00153
00154 z = pmul(x,x);
00155
00156 Packet4f y = p4f_cephes_exp_p0;
00157 y = pmadd(y, x, p4f_cephes_exp_p1);
00158 y = pmadd(y, x, p4f_cephes_exp_p2);
00159 y = pmadd(y, x, p4f_cephes_exp_p3);
00160 y = pmadd(y, x, p4f_cephes_exp_p4);
00161 y = pmadd(y, x, p4f_cephes_exp_p5);
00162 y = pmadd(y, z, x);
00163 y = padd(y, p4f_1);
00164
00165
00166 emm0 = _mm_cvttps_epi32(fx);
00167 emm0 = _mm_add_epi32(emm0, p4i_0x7f);
00168 emm0 = _mm_slli_epi32(emm0, 23);
00169 return pmul(y, _mm_castsi128_ps(emm0));
00170 }
00171 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
00172 Packet2d pexp<Packet2d>(const Packet2d& _x)
00173 {
00174 Packet2d x = _x;
00175
00176 _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
00177 _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
00178 _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
00179
00180 _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
00181 _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
00182
00183 _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
00184
00185 _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
00186 _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
00187 _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
00188
00189 _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
00190 _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
00191 _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
00192 _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
00193
00194 _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
00195 _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
00196 static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
00197
00198 Packet2d tmp = _mm_setzero_pd(), fx;
00199 Packet4i emm0;
00200
00201
00202 x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
00203
00204 fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
00205
00206 #ifdef EIGEN_VECTORIZE_SSE4_1
00207 fx = _mm_floor_pd(fx);
00208 #else
00209 emm0 = _mm_cvttpd_epi32(fx);
00210 tmp = _mm_cvtepi32_pd(emm0);
00211
00212 Packet2d mask = _mm_cmpgt_pd(tmp, fx);
00213 mask = _mm_and_pd(mask, p2d_1);
00214 fx = psub(tmp, mask);
00215 #endif
00216
00217 tmp = pmul(fx, p2d_cephes_exp_C1);
00218 Packet2d z = pmul(fx, p2d_cephes_exp_C2);
00219 x = psub(x, tmp);
00220 x = psub(x, z);
00221
00222 Packet2d x2 = pmul(x,x);
00223
00224 Packet2d px = p2d_cephes_exp_p0;
00225 px = pmadd(px, x2, p2d_cephes_exp_p1);
00226 px = pmadd(px, x2, p2d_cephes_exp_p2);
00227 px = pmul (px, x);
00228
00229 Packet2d qx = p2d_cephes_exp_q0;
00230 qx = pmadd(qx, x2, p2d_cephes_exp_q1);
00231 qx = pmadd(qx, x2, p2d_cephes_exp_q2);
00232 qx = pmadd(qx, x2, p2d_cephes_exp_q3);
00233
00234 x = pdiv(px,psub(qx,px));
00235 x = pmadd(p2d_2,x,p2d_1);
00236
00237
00238 emm0 = _mm_cvttpd_epi32(fx);
00239 emm0 = _mm_add_epi32(emm0, p4i_1023_0);
00240 emm0 = _mm_slli_epi32(emm0, 20);
00241 emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
00242 return pmul(x, _mm_castsi128_pd(emm0));
00243 }
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
00258 Packet4f psin<Packet4f>(const Packet4f& _x)
00259 {
00260 Packet4f x = _x;
00261 _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
00262 _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
00263
00264 _EIGEN_DECLARE_CONST_Packet4i(1, 1);
00265 _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
00266 _EIGEN_DECLARE_CONST_Packet4i(2, 2);
00267 _EIGEN_DECLARE_CONST_Packet4i(4, 4);
00268
00269 _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
00270
00271 _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
00272 _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
00273 _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
00274 _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
00275 _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
00276 _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
00277 _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
00278 _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
00279 _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
00280 _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f);
00281
00282 Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
00283
00284 Packet4i emm0, emm2;
00285 sign_bit = x;
00286
00287 x = pabs(x);
00288
00289
00290
00291
00292 sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
00293
00294
00295 y = pmul(x, p4f_cephes_FOPI);
00296
00297
00298 emm2 = _mm_cvttps_epi32(y);
00299
00300 emm2 = _mm_add_epi32(emm2, p4i_1);
00301 emm2 = _mm_and_si128(emm2, p4i_not1);
00302 y = _mm_cvtepi32_ps(emm2);
00303
00304 emm0 = _mm_and_si128(emm2, p4i_4);
00305 emm0 = _mm_slli_epi32(emm0, 29);
00306
00307
00308
00309
00310
00311
00312 emm2 = _mm_and_si128(emm2, p4i_2);
00313 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
00314
00315 Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
00316 Packet4f poly_mask = _mm_castsi128_ps(emm2);
00317 sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
00318
00319
00320
00321 xmm1 = pmul(y, p4f_minus_cephes_DP1);
00322 xmm2 = pmul(y, p4f_minus_cephes_DP2);
00323 xmm3 = pmul(y, p4f_minus_cephes_DP3);
00324 x = padd(x, xmm1);
00325 x = padd(x, xmm2);
00326 x = padd(x, xmm3);
00327
00328
00329 y = p4f_coscof_p0;
00330 Packet4f z = _mm_mul_ps(x,x);
00331
00332 y = pmadd(y, z, p4f_coscof_p1);
00333 y = pmadd(y, z, p4f_coscof_p2);
00334 y = pmul(y, z);
00335 y = pmul(y, z);
00336 Packet4f tmp = pmul(z, p4f_half);
00337 y = psub(y, tmp);
00338 y = padd(y, p4f_1);
00339
00340
00341
00342 Packet4f y2 = p4f_sincof_p0;
00343 y2 = pmadd(y2, z, p4f_sincof_p1);
00344 y2 = pmadd(y2, z, p4f_sincof_p2);
00345 y2 = pmul(y2, z);
00346 y2 = pmul(y2, x);
00347 y2 = padd(y2, x);
00348
00349
00350 y2 = _mm_and_ps(poly_mask, y2);
00351 y = _mm_andnot_ps(poly_mask, y);
00352 y = _mm_or_ps(y,y2);
00353
00354 return _mm_xor_ps(y, sign_bit);
00355 }
00356
00357
00358 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
00359 Packet4f pcos<Packet4f>(const Packet4f& _x)
00360 {
00361 Packet4f x = _x;
00362 _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
00363 _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
00364
00365 _EIGEN_DECLARE_CONST_Packet4i(1, 1);
00366 _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
00367 _EIGEN_DECLARE_CONST_Packet4i(2, 2);
00368 _EIGEN_DECLARE_CONST_Packet4i(4, 4);
00369
00370 _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
00371 _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
00372 _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
00373 _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
00374 _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
00375 _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
00376 _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
00377 _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
00378 _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
00379 _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f);
00380
00381 Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
00382 Packet4i emm0, emm2;
00383
00384 x = pabs(x);
00385
00386
00387 y = pmul(x, p4f_cephes_FOPI);
00388
00389
00390 emm2 = _mm_cvttps_epi32(y);
00391
00392 emm2 = _mm_add_epi32(emm2, p4i_1);
00393 emm2 = _mm_and_si128(emm2, p4i_not1);
00394 y = _mm_cvtepi32_ps(emm2);
00395
00396 emm2 = _mm_sub_epi32(emm2, p4i_2);
00397
00398
00399 emm0 = _mm_andnot_si128(emm2, p4i_4);
00400 emm0 = _mm_slli_epi32(emm0, 29);
00401
00402 emm2 = _mm_and_si128(emm2, p4i_2);
00403 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
00404
00405 Packet4f sign_bit = _mm_castsi128_ps(emm0);
00406 Packet4f poly_mask = _mm_castsi128_ps(emm2);
00407
00408
00409
00410 xmm1 = pmul(y, p4f_minus_cephes_DP1);
00411 xmm2 = pmul(y, p4f_minus_cephes_DP2);
00412 xmm3 = pmul(y, p4f_minus_cephes_DP3);
00413 x = padd(x, xmm1);
00414 x = padd(x, xmm2);
00415 x = padd(x, xmm3);
00416
00417
00418 y = p4f_coscof_p0;
00419 Packet4f z = pmul(x,x);
00420
00421 y = pmadd(y,z,p4f_coscof_p1);
00422 y = pmadd(y,z,p4f_coscof_p2);
00423 y = pmul(y, z);
00424 y = pmul(y, z);
00425 Packet4f tmp = _mm_mul_ps(z, p4f_half);
00426 y = psub(y, tmp);
00427 y = padd(y, p4f_1);
00428
00429
00430 Packet4f y2 = p4f_sincof_p0;
00431 y2 = pmadd(y2, z, p4f_sincof_p1);
00432 y2 = pmadd(y2, z, p4f_sincof_p2);
00433 y2 = pmul(y2, z);
00434 y2 = pmadd(y2, x, x);
00435
00436
00437 y2 = _mm_and_ps(poly_mask, y2);
00438 y = _mm_andnot_ps(poly_mask, y);
00439 y = _mm_or_ps(y,y2);
00440
00441
00442 return _mm_xor_ps(y, sign_bit);
00443 }
00444
00445
00446
00447 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
00448 Packet4f psqrt<Packet4f>(const Packet4f& _x)
00449 {
00450 Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
00451
00452
00453 Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)()));
00454 Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
00455
00456 x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
00457 return pmul(_x,x);
00458 }
00459
00460 }
00461
00462 }
00463
00464 #endif // EIGEN_MATH_FUNCTIONS_SSE_H