00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef EIGEN_PACKET_MATH_SSE_H
00026 #define EIGEN_PACKET_MATH_SSE_H
00027
00028 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
00029 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
00030 #endif
00031
// Map each supported scalar type to its SSE packet type and lane count.
template<> struct ei_packet_traits<float> { typedef __m128 type; enum {size=4}; };
template<> struct ei_packet_traits<double> { typedef __m128d type; enum {size=2}; };
template<> struct ei_packet_traits<int> { typedef __m128i type; enum {size=4}; };
00035
// Inverse mapping: packet type back to its scalar type and lane count.
template<> struct ei_unpacket_traits<__m128> { typedef float type; enum {size=4}; };
template<> struct ei_unpacket_traits<__m128d> { typedef double type; enum {size=2}; };
template<> struct ei_unpacket_traits<__m128i> { typedef int type; enum {size=4}; };
00039
// Broadcast a single scalar to every lane of the packet.
template<> EIGEN_STRONG_INLINE __m128 ei_pset1<float>(const float& from) { return _mm_set1_ps(from); }
template<> EIGEN_STRONG_INLINE __m128d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE __m128i ei_pset1<int>(const int& from) { return _mm_set1_epi32(from); }
00043
// Lane-wise addition.
template<> EIGEN_STRONG_INLINE __m128 ei_padd<__m128>(const __m128& a, const __m128& b) { return _mm_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE __m128d ei_padd<__m128d>(const __m128d& a, const __m128d& b) { return _mm_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE __m128i ei_padd<__m128i>(const __m128i& a, const __m128i& b) { return _mm_add_epi32(a,b); }
00047
// Lane-wise subtraction (a - b).
template<> EIGEN_STRONG_INLINE __m128 ei_psub<__m128>(const __m128& a, const __m128& b) { return _mm_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE __m128d ei_psub<__m128d>(const __m128d& a, const __m128d& b) { return _mm_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE __m128i ei_psub<__m128i>(const __m128i& a, const __m128i& b) { return _mm_sub_epi32(a,b); }
00051
// Lane-wise multiplication (float/double have native SSE instructions).
template<> EIGEN_STRONG_INLINE __m128 ei_pmul<__m128>(const __m128& a, const __m128& b) { return _mm_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE __m128d ei_pmul<__m128d>(const __m128d& a, const __m128d& b) { return _mm_mul_pd(a,b); }
00054 template<> EIGEN_STRONG_INLINE __m128i ei_pmul<__m128i>(const __m128i& a, const __m128i& b)
00055 {
00056 return _mm_or_si128(
00057 _mm_and_si128(
00058 _mm_mul_epu32(a,b),
00059 _mm_setr_epi32(0xffffffff,0,0xffffffff,0)),
00060 _mm_slli_si128(
00061 _mm_and_si128(
00062 _mm_mul_epu32(_mm_srli_si128(a,4),_mm_srli_si128(b,4)),
00063 _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4));
00064 }
00065
// Lane-wise division.
template<> EIGEN_STRONG_INLINE __m128 ei_pdiv<__m128>(const __m128& a, const __m128& b) { return _mm_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE __m128d ei_pdiv<__m128d>(const __m128d& a, const __m128d& b) { return _mm_div_pd(a,b); }
// SSE has no packed integer division: trap in debug builds, and return a
// zero packet so the function still has a well-formed return path.
template<> EIGEN_STRONG_INLINE __m128i ei_pdiv<__m128i>(const __m128i& , const __m128i& )
{ ei_assert(false && "packet integer division are not supported by SSE");
  __m128i dummy = ei_pset1<int>(0);
  return dummy;
}
00073
00074
// Integer multiply-add: a*b + c, composed from the two packet primitives
// (SSE has no integer fused multiply-add instruction).
template<> EIGEN_STRONG_INLINE __m128i ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); }
00076
// Lane-wise minimum (native instructions for float/double).
template<> EIGEN_STRONG_INLINE __m128 ei_pmin<__m128>(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE __m128d ei_pmin<__m128d>(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); }
00079
00080 template<> EIGEN_STRONG_INLINE __m128i ei_pmin<__m128i>(const __m128i& a, const __m128i& b)
00081 {
00082 __m128i mask = _mm_cmplt_epi32(a,b);
00083 return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
00084 }
00085
// Lane-wise maximum (native instructions for float/double).
template<> EIGEN_STRONG_INLINE __m128 ei_pmax<__m128>(const __m128& a, const __m128& b) { return _mm_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE __m128d ei_pmax<__m128d>(const __m128d& a, const __m128d& b) { return _mm_max_pd(a,b); }
00088
00089 template<> EIGEN_STRONG_INLINE __m128i ei_pmax<__m128i>(const __m128i& a, const __m128i& b)
00090 {
00091 __m128i mask = _mm_cmpgt_epi32(a,b);
00092 return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
00093 }
00094
// Aligned loads: `from` must be 16-byte aligned.
template<> EIGEN_STRONG_INLINE __m128 ei_pload<float>(const float* from) { return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE __m128d ei_pload<double>(const double* from) { return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE __m128i ei_pload<int>(const int* from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
00098
// Unaligned loads: no alignment requirement on `from`.
template<> EIGEN_STRONG_INLINE __m128 ei_ploadu<float>(const float* from) { return _mm_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE __m128d ei_ploadu<double>(const double* from) { return _mm_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE __m128i ei_ploadu<int>(const int* from) { return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
00108
// Aligned stores: `to` must be 16-byte aligned.
template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const __m128& from) { _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const __m128d& from) { _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
00112
// Unaligned stores: no alignment requirement on `to`.
template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void ei_pstoreu<double>(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
00116
// Extract the first (lowest) scalar lane of a packet.
#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) && !defined(__INTEL_COMPILER)
// MSVC <= 2008 on Win64: read through the __m128 union members directly.
// NOTE(review): presumably a workaround for broken/missing cvt intrinsics
// on that compiler -- TODO confirm against the original bug report.
template<> EIGEN_STRONG_INLINE float ei_pfirst<__m128>(const __m128& a) { return a.m128_f32[0]; }
template<> EIGEN_STRONG_INLINE double ei_pfirst<__m128d>(const __m128d& a) { return a.m128d_f64[0]; }
template<> EIGEN_STRONG_INLINE int ei_pfirst<__m128i>(const __m128i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#elif defined(_MSC_VER) && (_MSC_VER <= 1500) && !defined(__INTEL_COMPILER)
// MSVC <= 2008, 32-bit: the named temporary looks like a code-generation
// workaround for that compiler -- TODO confirm.
template<> EIGEN_STRONG_INLINE float ei_pfirst<__m128>(const __m128& a) { float x = _mm_cvtss_f32(a); return x; }
template<> EIGEN_STRONG_INLINE double ei_pfirst<__m128d>(const __m128d& a) { double x = _mm_cvtsd_f64(a); return x; }
template<> EIGEN_STRONG_INLINE int ei_pfirst<__m128i>(const __m128i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#else
// All other compilers: the dedicated scalar-extract intrinsics.
template<> EIGEN_STRONG_INLINE float ei_pfirst<__m128>(const __m128& a) { return _mm_cvtss_f32(a); }
template<> EIGEN_STRONG_INLINE double ei_pfirst<__m128d>(const __m128d& a) { return _mm_cvtsd_f64(a); }
template<> EIGEN_STRONG_INLINE int ei_pfirst<__m128i>(const __m128i& a) { return _mm_cvtsi128_si32(a); }
#endif
00133
#ifdef __SSE3__
// SSE3 path: horizontal adds make the reductions cheap.

// Transposed reduction: returns {sum(vecs[0]), sum(vecs[1]), sum(vecs[2]), sum(vecs[3])}.
template<> EIGEN_STRONG_INLINE __m128 ei_preduxp<__m128>(const __m128* vecs)
{
  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
}
// Returns {sum(vecs[0]), sum(vecs[1])}.
template<> EIGEN_STRONG_INLINE __m128d ei_preduxp<__m128d>(const __m128d* vecs)
{
  return _mm_hadd_pd(vecs[0], vecs[1]);
}

// Sum of the four float lanes: two successive horizontal adds.
template<> EIGEN_STRONG_INLINE float ei_predux<__m128>(const __m128& a)
{
  __m128 tmp0 = _mm_hadd_ps(a,a);
  return ei_pfirst(_mm_hadd_ps(tmp0, tmp0));
}

// Sum of the two double lanes.
template<> EIGEN_STRONG_INLINE double ei_predux<__m128d>(const __m128d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); }

#else
// SSE2 fallback: emulate the horizontal operations with shuffles and adds.

template<> EIGEN_STRONG_INLINE float ei_predux<__m128>(const __m128& a)
{
  // fold the upper pair onto the lower pair, then add the last two lanes
  __m128 tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
  return ei_pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double ei_predux<__m128d>(const __m128d& a)
{
  return ei_pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
}

// 4x4 transpose-and-add via unpack/movelh/movehl; the statement order and
// register reuse here are deliberate.
template<> EIGEN_STRONG_INLINE __m128 ei_preduxp<__m128>(const __m128* vecs)
{
  __m128 tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
  tmp0 = _mm_add_ps(tmp0, tmp1);
  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
  tmp1 = _mm_add_ps(tmp1, tmp2);
  tmp2 = _mm_movehl_ps(tmp1, tmp0);
  tmp0 = _mm_movelh_ps(tmp0, tmp1);
  return _mm_add_ps(tmp0, tmp2);
}

// 2x2 transpose-and-add: interleave low/high lanes then add.
template<> EIGEN_STRONG_INLINE __m128d ei_preduxp<__m128d>(const __m128d* vecs)
{
  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
}
#endif // SSE3
00195
00196 template<> EIGEN_STRONG_INLINE int ei_predux<__m128i>(const __m128i& a)
00197 {
00198 __m128i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
00199 return ei_pfirst(tmp) + ei_pfirst(_mm_shuffle_epi32(tmp, 1));
00200 }
00201
00202 template<> EIGEN_STRONG_INLINE __m128i ei_preduxp<__m128i>(const __m128i* vecs)
00203 {
00204 __m128i tmp0, tmp1, tmp2;
00205 tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
00206 tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
00207 tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
00208 tmp0 = _mm_add_epi32(tmp0, tmp1);
00209 tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
00210 tmp1 = _mm_add_epi32(tmp1, tmp2);
00211 tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
00212 tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
00213 return _mm_add_epi32(tmp0, tmp2);
00214 }
00215
00216 #if (defined __GNUC__)
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229 #endif
00230
00231 #ifdef __SSSE3__
00232
// Shift the concatenation [second:first] right by Offset floats into
// `first`, using SSSE3 palignr (the casts move through the integer domain,
// where the byte-wise alignr instruction lives).
template<int Offset>
struct ei_palign_impl<Offset,__m128>
{
  EIGEN_STRONG_INLINE static void run(__m128& first, const __m128& second)
  {
    if (Offset!=0)
      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
  }
};
00242
// Integer variant: palignr operates on __m128i directly, no casts needed.
template<int Offset>
struct ei_palign_impl<Offset,__m128i>
{
  EIGEN_STRONG_INLINE static void run(__m128i& first, const __m128i& second)
  {
    if (Offset!=0)
      first = _mm_alignr_epi8(second,first, Offset*4);
  }
};
00252
// Double variant: only Offset==1 is nontrivial for a 2-lane packet, and it
// is an 8-byte alignr.
template<int Offset>
struct ei_palign_impl<Offset,__m128d>
{
  EIGEN_STRONG_INLINE static void run(__m128d& first, const __m128d& second)
  {
    if (Offset==1)
      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
  }
};
00262 #else
00263
// SSE2 emulation of palignr for float packets: produce, in `first`, the
// packet made of the last 4-Offset lanes of `first` followed by the first
// Offset lanes of `second`.
template<int Offset>
struct ei_palign_impl<Offset,__m128>
{
  EIGEN_STRONG_INLINE static void run(__m128& first, const __m128& second)
  {
    if (Offset==1)
    {
      first = _mm_move_ss(first,second);                                          // {s0,f1,f2,f3}
      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));  // rotate -> {f1,f2,f3,s0}
    }
    else if (Offset==2)
    {
      first = _mm_movehl_ps(first,first);   // {f2,f3,f2,f3}
      first = _mm_movelh_ps(first,second);  // {f2,f3,s0,s1}
    }
    else if (Offset==3)
    {
      first = _mm_move_ss(first,second);        // {s0,f1,f2,f3}
      first = _mm_shuffle_ps(first,second,0x93); // {f3,s0,s1,s2}
    }
  }
};
00286
// SSE2 emulation of palignr for integer packets: same lane moves as the
// float version, routed through float-domain shuffles via bit casts.
template<int Offset>
struct ei_palign_impl<Offset,__m128i>
{
  EIGEN_STRONG_INLINE static void run(__m128i& first, const __m128i& second)
  {
    if (Offset==1)
    {
      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));  // {s0,f1,f2,f3}
      first = _mm_shuffle_epi32(first,0x39);                                                    // {f1,f2,f3,s0}
    }
    else if (Offset==2)
    {
      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));   // {f2,f3,f2,f3}
      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));  // {f2,f3,s0,s1}
    }
    else if (Offset==3)
    {
      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));              // {s0,f1,f2,f3}
      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));      // {f3,s0,s1,s2}
    }
  }
};
00309
// SSE2 emulation of palignr for double packets: Offset==1 yields {f1,s0}.
template<int Offset>
struct ei_palign_impl<Offset,__m128d>
{
  EIGEN_STRONG_INLINE static void run(__m128d& first, const __m128d& second)
  {
    if (Offset==1)
    {
      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));   // {f1,f1}
      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));  // {f1,s0}
    }
  }
};
00322 #endif
00323
// Permute the four float lanes of v: result lane 0/1/2/3 takes v's lane
// p/q/r/s. Routed through the integer domain so it compiles to a single
// pshufd instruction.
#define ei_vec4f_swizzle1(v,p,q,r,s) \
  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
00326
00327 #endif // EIGEN_PACKET_MATH_SSE_H