#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8

#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))

#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD

#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
typedef eigen_packet_wrapper<__m128>  Packet4f;
typedef eigen_packet_wrapper<__m128d> Packet2d;
typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
template<int p, int q, int r, int s>
#define vec4f_swizzle1(v,p,q,r,s) \
  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))

#define vec4i_swizzle1(v,p,q,r,s) \
  Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))

#define vec2d_swizzle1(v,p,q) \
  Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))

#define vec4f_swizzle2(a,b,p,q,r,s) \
  Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))

#define vec4i_swizzle2(a,b,p,q,r,s) \
  Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))
#define vec4f_duplane(a,p) \
  vec4f_swizzle2(a,a,p,p,p,p)

#define vec2d_swizzle2(a,b,mask) \
  Packet2d(_mm_shuffle_pd(a,b,mask))

#define vec2d_duplane(a,p) \
  vec2d_swizzle2(a,a,(p<<1)|p)
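// Illustrative usage sketch (not part of Eigen): the index arguments of the swizzle
// macros select source lanes, so reversing or duplicating lanes looks like this:
//   Packet4f v = pset1<Packet4f>(1.f);                   // v = [1 1 1 1]
//   Packet4f r = vec4f_swizzle1(v, 3, 2, 1, 0);          // r[i] = v[3-i] (lane order reversed)
//   Packet2d d = vec2d_duplane(pset1<Packet2d>(2.), 1);  // both lanes take element 1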
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
  const Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
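// Expansion sketch (illustrative only; the NAMEs used here are made up):
//   _EIGEN_DECLARE_CONST_Packet4f(one, 1.0f);
//     // expands to: const Packet4f p4f_one = pset1<Packet4f>(1.0f);
//   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000u);
//     // expands to: const Packet4f p4f_sign_mask = pset1frombits<Packet4f>(0x80000000u);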
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<float> : default_packet_traits {
#ifdef EIGEN_VECTORIZE_SSE4_1

template<> struct packet_traits<double> : default_packet_traits {
#ifdef EIGEN_VECTORIZE_SSE4_1

template<> struct packet_traits<int> : default_packet_traits
template<> struct unpacket_traits<Packet4f> {
template<> struct unpacket_traits<Packet2d> {
template<> struct unpacket_traits<Packet4i> {
#ifndef EIGEN_VECTORIZE_AVX
template<> struct scalar_div_cost<float,true>  { enum { value = 7 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
#if EIGEN_COMP_MSVC==1500

#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
#ifdef EIGEN_VECTORIZE_SSE3
  return _mm_addsub_ps(a,b);
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));
#ifdef EIGEN_VECTORIZE_SSE3
  return _mm_addsub_pd(a,b);
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0));
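// Sketch of how the non-SSE3 fallback presumably finishes (assumption, not shown above):
// flip the sign bit of the even lane(s) of 'b' with the mask, then use a plain add, which
// reproduces the alternating subtract/add of _mm_addsub_ps/_mm_addsub_pd:
//   return padd(a, pxor(mask, b));   // [a0-b0, a1+b1, ...]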
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
  return _mm_xor_ps(a,mask);

  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
  return _mm_xor_pd(a,mask);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a,b);

#ifdef EIGEN_VECTORIZE_FMA

#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_ps(b,a,mask);

  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));

  return _mm_blendv_epi8(b,a,mask);

  return _mm_or_si128(a_part, b_part);
  return _mm_castsi128_ps(_mm_cmpeq_epi32(b,b));

  return _mm_castsi128_pd(_mm_cmpeq_epi32(b,b));
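// The two returns above build the all-ones 'true' mask: an integer self-comparison with
// _mm_cmpeq_epi32 sets every bit of every lane (no NaN pitfalls, unlike a float compare),
// and the result is then reinterpreted as float/double bits. Illustrative:
//   __m128i ones = _mm_cmpeq_epi32(x, x);     // all bits set, regardless of what x holds
//   __m128  mask = _mm_castsi128_ps(ones);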
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
#ifdef EIGEN_VECTORIZE_AVX
  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
  return _mm_min_ps(b, a);
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
#ifdef EIGEN_VECTORIZE_AVX
  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
  return _mm_min_pd(b, a);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
#ifdef EIGEN_VECTORIZE_AVX
  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
  return _mm_max_ps(b, a);
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
#ifdef EIGEN_VECTORIZE_AVX
  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
  return _mm_max_pd(b, a);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
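// Note on the pmin/pmax code above: the intrinsics are invoked with swapped arguments,
// e.g. _mm_min_ps(b, a), because minps/maxps return their second operand whenever one
// input is NaN; swapping makes the packet op propagate 'a' the same way std::min/std::max
// would. The inline asm used for GCC < 6.3 only pins the operand order so the optimizer
// cannot flip it back. Illustrative check (names here are made up):
//   Packet4f a = pset1<Packet4f>(std::numeric_limits<float>::quiet_NaN());
//   Packet4f b = pset1<Packet4f>(1.f);
//   Packet4f m = pmin<Packet4f>(a, b);   // every lane is NaN, i.e. 'a' is propagated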
template <typename Packet, typename Op>
  return pselect<Packet>(not_nan_mask_a, m, b);

template <typename Packet, typename Op>
  return pselect<Packet>(not_nan_mask_a, m, a);
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm_and_ps(a,mask);

  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm_and_pd(a,mask);
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);

  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
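// Rounding trick used above: 'prev0dot5' is the largest representable value below 0.5.
// Copying the sign of 'a' onto it (pand/por), adding, then truncating toward zero yields
// round-half-away-from-zero while avoiding the classic failure of adding exactly 0.5:
//   0.49999999999999994 + 0.5                 -> rounds up to 1.0, trunc -> 1  (wrong)
//   0.49999999999999994 + 0.49999999999999994 -> 0.9999999999999999, trunc -> 0 (correct)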
  mask = pand(mask, cst_1);
  return psub(tmp, mask);

  mask = pand(mask, cst_1);
  return psub(tmp, mask);

  mask = pand(mask, cst_1);
  return padd(tmp, mask);

  mask = pand(mask, cst_1);
  return padd(tmp, mask);
#if (EIGEN_COMP_MSVC==1600)
  __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
  res = _mm_loadh_pi(res, (const __m64*)(from+2));
  return _mm_loadu_ps(from);
  return _mm_loadu_ps(from);

  return _mm_loadu_pd(from);

  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));

  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);

  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));

  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
  return _mm_unpacklo_epi8(tmp, tmp);

  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
  tmp = _mm_unpacklo_epi8(tmp, tmp);
  return _mm_unpacklo_epi16(tmp, tmp);
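// The ploaddup/ploadquad fragments above load part of a packet and duplicate each element
// in place (the bool variants repeat the idea with byte/word unpacks). Illustrative:
//   float data[2] = {1.f, 2.f};
//   Packet4f p = ploaddup<Packet4f>(data);   // p = [1 1 2 2]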
  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);

  return _mm_set_pd(from[1*stride], from[0*stride]);

  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);

  return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
                      from[11*stride], from[10*stride], from[9*stride], from[8*stride],
                      from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                      from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
  to[stride*0] = _mm_cvtss_f32(from);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));

  to[stride*0] = _mm_cvtsd_f64(from);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));

  to[stride*0] = _mm_cvtsi128_si32(from);
  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));

  to[4*stride*0] = _mm_cvtsi128_si32(from);
  to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
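// pgather/pscatter above read or write one element every 'stride' scalars. Illustrative
// sketch (array contents made up):
//   float src[8] = {0,1,2,3,4,5,6,7};
//   Packet4f g = pgather<float, Packet4f>(src, 2);   // g = [0 2 4 6]
//   float dst[8] = {0};
//   pscatter<float, Packet4f>(dst, g, 2);            // dst = {0,0,2,0,4,0,6,0}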
#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900

#ifndef EIGEN_VECTORIZE_AVX

#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
#elif EIGEN_COMP_MSVC_STRICT
#ifdef EIGEN_VECTORIZE_SSSE3
  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(a, mask);

  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);

  Packet4i b = parithmetic_shift_right<2>(ei);

  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));
#ifdef EIGEN_VECTORIZE_SSE3
  a0 = _mm_loaddup_pd(a+0);
  a1 = _mm_loaddup_pd(a+1);
  a2 = _mm_loaddup_pd(a+2);
  a3 = _mm_loaddup_pd(a+3);
  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
#ifdef EIGEN_VECTORIZE_SSSE3

  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));

  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));

  return (aux[0] * aux[1]) * (aux[2] * aux[3]);

  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));

  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
  return aux0<aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
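// In the SSE4.1 branch of the reduction above, 'tmp' holds the pairwise minima of the low
// and high halves; the reduction presumably finishes by min-ing the two surviving lanes
// and extracting lane 0, e.g. (assumption, not shown in the fragment):
//   return pfirst<Packet4i>(_mm_min_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
// The fallback path instead spills the packet to memory and compares the four scalars.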
#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));

  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
  return aux0>aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1

  return _mm_movemask_ps(x) != 0x0;
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);

ptranspose(PacketBlock<Packet2d,2>& kernel) {
  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[1] = tmp;
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
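// All ptranspose kernels here follow the same unpack-lo/hi ladder: interleave narrow lanes
// of row pairs, then interleave progressively wider halves until each packet holds one
// original column. Usage sketch (illustrative):
//   PacketBlock<Packet4i,4> block;
//   // fill block.packet[0..3] with the four rows of a 4x4 integer matrix ...
//   ptranspose(block);   // afterwards block.packet[c] holds original column c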
  __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
  __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
  __m128i t0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i t1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i t2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
  __m128i t3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
  __m128i t4 = _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]);
  __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]);
  __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
  __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
  __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
  __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
  __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
  __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
  __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
  __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
  __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
  __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
  __m128i s0 = _mm_unpacklo_epi16(t0, t2);
  __m128i s1 = _mm_unpackhi_epi16(t0, t2);
  __m128i s2 = _mm_unpacklo_epi16(t1, t3);
  __m128i s3 = _mm_unpackhi_epi16(t1, t3);
  __m128i s4 = _mm_unpacklo_epi16(t4, t6);
  __m128i s5 = _mm_unpackhi_epi16(t4, t6);
  __m128i s6 = _mm_unpacklo_epi16(t5, t7);
  __m128i s7 = _mm_unpackhi_epi16(t5, t7);
  __m128i s8 = _mm_unpacklo_epi16(t8, ta);
  __m128i s9 = _mm_unpackhi_epi16(t8, ta);
  __m128i sa = _mm_unpacklo_epi16(t9, tb);
  __m128i sb = _mm_unpackhi_epi16(t9, tb);
  __m128i sc = _mm_unpacklo_epi16(tc, te);
  __m128i sd = _mm_unpackhi_epi16(tc, te);
  __m128i se = _mm_unpacklo_epi16(td, tf);
  __m128i sf = _mm_unpackhi_epi16(td, tf);

  __m128i u0 = _mm_unpacklo_epi32(s0, s4);
  __m128i u1 = _mm_unpackhi_epi32(s0, s4);
  __m128i u2 = _mm_unpacklo_epi32(s1, s5);
  __m128i u3 = _mm_unpackhi_epi32(s1, s5);
  __m128i u4 = _mm_unpacklo_epi32(s2, s6);
  __m128i u5 = _mm_unpackhi_epi32(s2, s6);
  __m128i u6 = _mm_unpacklo_epi32(s3, s7);
  __m128i u7 = _mm_unpackhi_epi32(s3, s7);
  __m128i u8 = _mm_unpacklo_epi32(s8, sc);
  __m128i u9 = _mm_unpackhi_epi32(s8, sc);
  __m128i ua = _mm_unpacklo_epi32(s9, sd);
  __m128i ub = _mm_unpackhi_epi32(s9, sd);
  __m128i uc = _mm_unpacklo_epi32(sa, se);
  __m128i ud = _mm_unpackhi_epi32(sa, se);
  __m128i ue = _mm_unpacklo_epi32(sb, sf);
  __m128i uf = _mm_unpackhi_epi32(sb, sf);
  kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
  kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
  kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
  kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
  kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
  kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
  kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
  kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
  kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
  kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
  const __m128i zero = _mm_setzero_si128();
  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
  const __m128 zero = _mm_setzero_ps();
  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m128 false_mask = _mm_cmpeq_ps(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
  const __m128d zero = _mm_setzero_pd();
  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
  __m128d false_mask = _mm_cmpeq_pd(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
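// pblend above turns the Selector's 0/1 flags into a full lane mask by comparing against
// zero and then keeps thenPacket where the flag was non-zero. Usage sketch (illustrative,
// assuming Selector is the bool-array helper used elsewhere in Eigen):
//   Selector<4> sel = {{true, false, true, false}};
//   Packet4f r = pblend(sel, pset1<Packet4f>(1.f), pset1<Packet4f>(2.f));   // r = [1 2 1 2]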
#ifdef EIGEN_VECTORIZE_FMA
  return ::fmaf(a,b,c);

  return ::fma(a,b,c);
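// With FMA enabled, the scalar pmadd overloads above map directly onto the fused
// ::fmaf/::fma calls, i.e. a*b+c evaluated with a single rounding. Illustrative:
//   float r = pmadd(2.f, 3.f, 1.f);   // r == 7.f, computed as fmaf(2.f, 3.f, 1.f)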
template<> struct is_arithmetic<Packet4h> { enum { value = true }; };

template<> struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet4h type;
  typedef Packet4h half;
  result.x = _mm_set1_pi16(from.x);

  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);

  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);

  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);

  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);
  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));

  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));

  __int64_t r = _mm_cvtm64_si64(from.x);
  *(reinterpret_cast<__int64_t*>(to)) = r;

  __int64_t r = _mm_cvtm64_si64(from.x);
  *(reinterpret_cast<__int64_t*>(to)) = r;
  return pset1<Packet4h>(*from);

  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
  __int64_t a = _mm_cvtm64_si64(from.x);
  to[stride*0].x = static_cast<unsigned short>(a);
  to[stride*1].x = static_cast<unsigned short>(a >> 16);
  to[stride*2].x = static_cast<unsigned short>(a >> 32);
  to[stride*3].x = static_cast<unsigned short>(a >> 48);
ptranspose(PacketBlock<Packet4h,4>& kernel) {
  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);

  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd   (__m128  x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128  x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128  _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x);  }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif // EIGEN_PACKET_MATH_SSE_H