00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00038 #ifndef FCL_MATH_SIMD_DETAILS_H
00039 #define FCL_MATH_SIMD_DETAILS_H
00040
00041 #include "fcl/data_types.h"
00042
00043 #include <xmmintrin.h>
00044 #if defined (__SSE3__)
00045 #include <pmmintrin.h>
00046 #endif
00047 #if defined (__SSE4__)
00048 #include <smmintrin.h>
00049 #endif
00050
00051
00052 namespace fcl
00053 {
00054
00056 namespace details
00057 {
00058
00059 const __m128 xmms_0 = {0.f, 0.f, 0.f, 0.f};
00060 const __m128d xmmd_0 = {0, 0};
00061
00062 static inline __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
00063 {
00064 return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
00065 }
00066 static inline __m128 vec_sel(__m128 a, __m128 b, const unsigned int* mask)
00067 {
00068 return vec_sel(a, b, _mm_load_ps((float*)mask));
00069 }
00070
00071 static inline __m128 vec_sel(__m128 a, __m128 b, unsigned int mask)
00072 {
00073 return vec_sel(a, b, _mm_set1_ps(*(float*)&mask));
00074 }
00075
00076 #define vec_splat(a, e) _mm_shuffle_ps((a), (a), _MM_SHUFFLE((e), (e), (e), (e)))
00077 #define vec_splatd(a, e) _mm_shuffle_pd((a), (a), _MM_SHUFFLE2((e), (e)))
00078
00079 #define _mm_ror_ps(x, e) (((e) % 4) ? _mm_shuffle_ps((x), (x), _MM_SHUFFLE(((e)+3)%4, ((e)+2)%4, ((e)+1)%4, (e)%4)) : (x))
00080
00081 #define _mm_rol_ps(x, e) (((e) % 4) ? _mm_shuffle_ps((x), (x), _MM_SHUFFLE((7-(e))%4, (6-(e))%4, (5-(e))%4, (4-(e))%4)) : (x))
00082
00083 static inline __m128 newtonraphson_rsqrt4(const __m128 v)
00084 {
00085 static const union { float i[4]; __m128 m; } _half4 __attribute__ ((aligned(16))) = {{.5f, .5f, .5f, .5f}};
00086 static const union { float i[4]; __m128 m; } _three __attribute__ ((aligned(16))) = {{3.f, 3.f, 3.f, 3.f}};
00087 __m128 approx = _mm_rsqrt_ps(v);
00088 __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx);
00089 return _mm_mul_ps(_mm_mul_ps(_half4.m, approx), _mm_sub_ps(_three.m, muls));
00090 }
00091
00092 struct sse_meta_f4
00093 {
00094 typedef float meta_type;
00095
00096 union {float vs[4]; __m128 v; };
00097 sse_meta_f4() : v(_mm_set1_ps(0)) {}
00098 sse_meta_f4(float x) : v(_mm_set1_ps(x)) {}
00099 sse_meta_f4(float* px) : v(_mm_load_ps(px)) {}
00100 sse_meta_f4(__m128 x) : v(x) {}
00101 sse_meta_f4(float x, float y, float z, float w = 1) : v(_mm_setr_ps(x, y, z, w)) {}
00102 inline void setValue(float x, float y, float z, float w = 1) { v = _mm_setr_ps(x, y, z, w); }
00103 inline void setValue(float x) { v = _mm_set1_ps(x); }
00104 inline void setValue(__m128 x) { v = x; }
00105 inline void negate() { v = _mm_sub_ps(xmms_0, v); }
00106
00107 inline sse_meta_f4& ubound(const sse_meta_f4& u)
00108 {
00109 v = _mm_min_ps(v, u.v);
00110 return *this;
00111 }
00112
00113 inline sse_meta_f4& lbound(const sse_meta_f4& l)
00114 {
00115 v = _mm_max_ps(v, l.v);
00116 return *this;
00117 }
00118
00119 inline void* operator new [] (size_t n) { return _mm_malloc(n, 16); }
00120 inline void operator delete [] (void* x) { if(x) _mm_free(x); }
00121 inline float operator [] (size_t i) const { return vs[i]; }
00122 inline float& operator [] (size_t i) { return vs[i]; }
00123
00124 inline sse_meta_f4 operator + (const sse_meta_f4& other) const { return sse_meta_f4(_mm_add_ps(v, other.v)); }
00125 inline sse_meta_f4 operator - (const sse_meta_f4& other) const { return sse_meta_f4(_mm_sub_ps(v, other.v)); }
00126 inline sse_meta_f4 operator * (const sse_meta_f4& other) const { return sse_meta_f4(_mm_mul_ps(v, other.v)); }
00127 inline sse_meta_f4 operator / (const sse_meta_f4& other) const { return sse_meta_f4(_mm_div_ps(v, other.v)); }
00128 inline sse_meta_f4& operator += (const sse_meta_f4& other) { v = _mm_add_ps(v, other.v); return *this; }
00129 inline sse_meta_f4& operator -= (const sse_meta_f4& other) { v = _mm_sub_ps(v, other.v); return *this; }
00130 inline sse_meta_f4& operator *= (const sse_meta_f4& other) { v = _mm_mul_ps(v, other.v); return *this; }
00131 inline sse_meta_f4& operator /= (const sse_meta_f4& other) { v = _mm_div_ps(v, other.v); return *this; }
00132 inline sse_meta_f4 operator + (float t) const { return sse_meta_f4(_mm_add_ps(v, _mm_set1_ps(t))); }
00133 inline sse_meta_f4 operator - (float t) const { return sse_meta_f4(_mm_sub_ps(v, _mm_set1_ps(t))); }
00134 inline sse_meta_f4 operator * (float t) const { return sse_meta_f4(_mm_mul_ps(v, _mm_set1_ps(t))); }
00135 inline sse_meta_f4 operator / (float t) const { return sse_meta_f4(_mm_div_ps(v, _mm_set1_ps(t))); }
00136 inline sse_meta_f4& operator += (float t) { v = _mm_add_ps(v, _mm_set1_ps(t)); return *this; }
00137 inline sse_meta_f4& operator -= (float t) { v = _mm_sub_ps(v, _mm_set1_ps(t)); return *this; }
00138 inline sse_meta_f4& operator *= (float t) { v = _mm_mul_ps(v, _mm_set1_ps(t)); return *this; }
00139 inline sse_meta_f4& operator /= (float t) { v = _mm_div_ps(v, _mm_set1_ps(t)); return *this; }
00140 inline sse_meta_f4 operator - () const
00141 {
00142 static const union { int i[4]; __m128 m; } negativemask __attribute__ ((aligned(16))) = {{0x80000000, 0x80000000, 0x80000000, 0x80000000}};
00143 return sse_meta_f4(_mm_xor_ps(negativemask.m, v));
00144 }
00145 } __attribute__ ((aligned (16)));
00146
00147 struct sse_meta_d4
00148 {
00149 typedef double meta_type;
00150
00151 union {double vs[4]; __m128d v[2]; };
00152 sse_meta_d4()
00153 {
00154 setValue(0.0);
00155 }
00156
00157 sse_meta_d4(double x)
00158 {
00159 setValue(x);
00160 }
00161
00162 sse_meta_d4(double* px)
00163 {
00164 v[0] = _mm_load_pd(px);
00165 v[1] = _mm_set_pd(0, *(px + 2));
00166 }
00167
00168 sse_meta_d4(__m128d x, __m128d y)
00169 {
00170 v[0] = x;
00171 v[1] = y;
00172 }
00173
00174 sse_meta_d4(double x, double y, double z, double w = 0)
00175 {
00176 setValue(x, y, z, w);
00177 }
00178
00179 inline void setValue(double x, double y, double z, double w = 0)
00180 {
00181 v[0] = _mm_setr_pd(x, y);
00182 v[1] = _mm_setr_pd(z, w);
00183 }
00184
00185 inline void setValue(double x)
00186 {
00187 v[0] = _mm_set1_pd(x);
00188 v[1] = v[0];
00189 }
00190
00191 inline void setValue(__m128d x, __m128d y)
00192 {
00193 v[0] = x;
00194 v[1] = y;
00195 }
00196
00197 inline void negate()
00198 {
00199 v[0] = _mm_sub_pd(xmmd_0, v[0]);
00200 v[1] = _mm_sub_pd(xmmd_0, v[1]);
00201 }
00202
00203 inline sse_meta_d4& ubound(const sse_meta_d4& u)
00204 {
00205 v[0] = _mm_min_pd(v[0], u.v[0]);
00206 v[1] = _mm_min_pd(v[1], u.v[1]);
00207 return *this;
00208 }
00209
00210 inline sse_meta_d4& lbound(const sse_meta_d4& l)
00211 {
00212 v[0] = _mm_max_pd(v[0], l.v[0]);
00213 v[1] = _mm_max_pd(v[1], l.v[1]);
00214 return *this;
00215 }
00216
00217 inline void* operator new [] (size_t n)
00218 {
00219 return _mm_malloc(n, 16);
00220 }
00221
00222 inline void operator delete [] (void* x)
00223 {
00224 if(x) _mm_free(x);
00225 }
00226
00227 inline double operator [] (size_t i) const { return vs[i]; }
00228 inline double& operator [] (size_t i) { return vs[i]; }
00229
00230 inline sse_meta_d4 operator + (const sse_meta_d4& other) const { return sse_meta_d4(_mm_add_pd(v[0], other.v[0]), _mm_add_pd(v[1], other.v[1])); }
00231 inline sse_meta_d4 operator - (const sse_meta_d4& other) const { return sse_meta_d4(_mm_sub_pd(v[0], other.v[0]), _mm_sub_pd(v[1], other.v[1])); }
00232 inline sse_meta_d4 operator * (const sse_meta_d4& other) const { return sse_meta_d4(_mm_mul_pd(v[0], other.v[0]), _mm_mul_pd(v[1], other.v[1])); }
00233 inline sse_meta_d4 operator / (const sse_meta_d4& other) const { return sse_meta_d4(_mm_div_pd(v[0], other.v[0]), _mm_div_pd(v[1], other.v[1])); }
00234 inline sse_meta_d4& operator += (const sse_meta_d4& other) { v[0] = _mm_add_pd(v[0], other.v[0]); v[1] = _mm_add_pd(v[1], other.v[1]); return *this; }
00235 inline sse_meta_d4& operator -= (const sse_meta_d4& other) { v[0] = _mm_sub_pd(v[0], other.v[0]); v[1] = _mm_sub_pd(v[1], other.v[1]); return *this; }
00236 inline sse_meta_d4& operator *= (const sse_meta_d4& other) { v[0] = _mm_mul_pd(v[0], other.v[0]); v[1] = _mm_mul_pd(v[1], other.v[1]); return *this; }
00237 inline sse_meta_d4& operator /= (const sse_meta_d4& other) { v[0] = _mm_div_pd(v[0], other.v[0]); v[1] = _mm_div_pd(v[1], other.v[1]); return *this; }
00238 inline sse_meta_d4 operator + (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_add_pd(v[0], d), _mm_add_pd(v[1], d)); }
00239 inline sse_meta_d4 operator - (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_sub_pd(v[0], d), _mm_sub_pd(v[1], d)); }
00240 inline sse_meta_d4 operator * (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_mul_pd(v[0], d), _mm_mul_pd(v[1], d)); }
00241 inline sse_meta_d4 operator / (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_div_pd(v[0], d), _mm_div_pd(v[1], d)); }
00242 inline sse_meta_d4& operator += (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_add_pd(v[0], d); v[1] = _mm_add_pd(v[1], d); return *this; }
00243 inline sse_meta_d4& operator -= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_sub_pd(v[0], d); v[1] = _mm_sub_pd(v[1], d); return *this; }
00244 inline sse_meta_d4& operator *= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_mul_pd(v[0], d); v[1] = _mm_mul_pd(v[1], d); return *this; }
00245 inline sse_meta_d4& operator /= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_div_pd(v[0], d); v[1] = _mm_div_pd(v[1], d); return *this; }
00246 inline sse_meta_d4 operator - () const
00247 {
00248 static const union { FCL_INT64 i[2]; __m128d m; } negativemask __attribute__ ((aligned(16))) = {{0x8000000000000000, 0x8000000000000000}};
00249 return sse_meta_d4(_mm_xor_pd(v[0], negativemask.m), _mm_xor_pd(v[1], negativemask.m));
00250 }
00251 } __attribute__ ((aligned (16)));
00252
00253
00254
00255 static inline __m128 cross_prod(__m128 x, __m128 y)
00256 {
00257
00258
00259 static const int s1 = _MM_SHUFFLE(3, 0, 2, 1);
00260 static const int s2 = _MM_SHUFFLE(3, 1, 0, 2);
00261 __m128 xa = _mm_mul_ps(_mm_shuffle_ps(x, x, s1), _mm_shuffle_ps(y, y, s2));
00262
00263
00264
00265 __m128 xb = _mm_mul_ps(_mm_shuffle_ps(x, x, s2), _mm_shuffle_ps(y, y, s1));
00266
00267
00268 return _mm_sub_ps(xa, xb);
00269 }
00270
00271 static inline sse_meta_f4 cross_prod(const sse_meta_f4& x, const sse_meta_f4& y)
00272 {
00273 return sse_meta_f4(cross_prod(x.v, y.v));
00274 }
00275
00276 static inline void cross_prod(__m128d x0, __m128d x1, __m128d y0, __m128d y1, __m128d* z0, __m128d* z1)
00277 {
00278 static const int s0 = _MM_SHUFFLE2(0, 0);
00279 static const int s1 = _MM_SHUFFLE2(0, 1);
00280 static const int s2 = _MM_SHUFFLE2(1, 0);
00281 static const int s3 = _MM_SHUFFLE2(1, 1);
00282 __m128d xa1 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s1), _mm_shuffle_pd(y1, y0, s0));
00283 __m128d ya1 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s2), _mm_shuffle_pd(y0, y1, s3));
00284
00285 __m128d xa2 = _mm_mul_pd(_mm_shuffle_pd(x1, x0, s0), _mm_shuffle_pd(y0, y1, s1));
00286 __m128d ya2 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s3), _mm_shuffle_pd(y0, y1, s2));
00287
00288 *z0 = _mm_sub_pd(xa1, xa2);
00289 *z1 = _mm_sub_pd(ya1, ya2);
00290 }
00291
00292 static inline sse_meta_d4 cross_prod(const sse_meta_d4& x, const sse_meta_d4& y)
00293 {
00294 __m128d z0, z1;
00295 cross_prod(x.v[0], x.v[1], y.v[0], y.v[1], &z0, &z1);
00296 return sse_meta_d4(z0, z1);
00297 }
00298
00299
00300 static inline __m128 dot_prod3(__m128 x, __m128 y)
00301 {
00302 register __m128 m = _mm_mul_ps(x, y);
00303 return _mm_add_ps(_mm_shuffle_ps(m, m, _MM_SHUFFLE(0, 0, 0, 0)),
00304 _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
00305 }
00306
00307 static inline float dot_prod3(const sse_meta_f4& x, const sse_meta_f4& y)
00308 {
00309 return _mm_cvtss_f32(dot_prod3(x.v, y.v));
00310 }
00311
00312
00313 static inline __m128d dot_prod3(__m128d x0, __m128d x1, __m128d y0, __m128d y1)
00314 {
00315 register __m128d m1 = _mm_mul_pd(x0, y0);
00316 register __m128d m2 = _mm_mul_pd(x1, y1);
00317 return _mm_add_pd(_mm_add_pd(vec_splatd(m1, 0), vec_splatd(m1, 1)), vec_splatd(m2, 0));
00318 }
00319
00320 static inline double dot_prod3(const sse_meta_d4& x, const sse_meta_d4& y)
00321 {
00322 double d;
00323 _mm_storel_pd(&d, dot_prod3(x.v[0], x.v[1], y.v[0], y.v[1]));
00324 return d;
00325 }
00326
00327 static inline __m128 dot_prod4(__m128 x, __m128 y)
00328 {
00329 #if defined (__SSE4__)
00330 return _mm_dp_ps(x, y, 0x71);
00331 #elif defined (__SSE3__)
00332 register __m128 t = _mm_mul_ps(x, y);
00333 t = _mm_hadd_ps(t, t);
00334 return _mm_hadd_ps(t, t);
00335 #else
00336 register __m128 s = _mm_mul_ps(x, y);
00337 register __m128 r = _mm_add_ss(s, _mm_movehl_ps(s, s));
00338 return _mm_add_ss(r, _mm_shuffle_ps(r, r, 1));
00339 #endif
00340 }
00341
00342
00343 static inline float dot_prod4(const sse_meta_f4& x, const sse_meta_f4& y)
00344 {
00345 return _mm_cvtss_f32(dot_prod4(x.v, y.v));
00346 }
00347
00348 static inline __m128d dot_prod4(__m128d x0, __m128d x1, __m128d y0, __m128d y1)
00349 {
00350 #if defined (__SSE4__)
00351 register __m128d t1 = _mm_dp_pd(x0, y0, 0x31);
00352 register __m128d t2 = _mm_dp_pd(x1, y1, 0x11);
00353 return _mm_add_pd(t1, t2);
00354 #elif defined (__SSE3__)
00355 register __m128d t1 = _mm_mul_pd(x0, y0);
00356 register __m128d t2 = _mm_mul_pd(x1, y1);
00357 t1 = _mm_hadd_pd(t1, t1);
00358 t2 = _mm_hadd_pd(t2, t2);
00359 return _mm_add_pd(t1, t2);
00360 #else
00361 register __m128d t1 = _mm_mul_pd(x0, y0);
00362 register __m128d t2 = _mm_mul_pd(x1, y1);
00363 t1 = _mm_add_pd(t1, t2);
00364 return _mm_add_pd(t1, _mm_shuffle_pd(t1, t1, 1));
00365 #endif
00366 }
00367
00368 static inline double dot_prod4(const sse_meta_d4& x, const sse_meta_d4& y)
00369 {
00370 double d;
00371 _mm_storel_pd(&d, dot_prod4(x.v[0], x.v[1], y.v[0], y.v[1]));
00372 return d;
00373 }
00374
00375 static inline sse_meta_f4 min(const sse_meta_f4& x, const sse_meta_f4& y)
00376 {
00377 return sse_meta_f4(_mm_min_ps(x.v, y.v));
00378 }
00379
00380 static inline sse_meta_d4 min(const sse_meta_d4& x, const sse_meta_d4& y)
00381 {
00382 return sse_meta_d4(_mm_min_pd(x.v[0], y.v[0]), _mm_min_pd(x.v[1], y.v[1]));
00383 }
00384
00385 static inline sse_meta_f4 max(const sse_meta_f4& x, const sse_meta_f4& y)
00386 {
00387 return sse_meta_f4(_mm_max_ps(x.v, y.v));
00388 }
00389
00390 static inline sse_meta_d4 max(const sse_meta_d4& x, const sse_meta_d4& y)
00391 {
00392 return sse_meta_d4(_mm_max_pd(x.v[0], y.v[0]), _mm_max_pd(x.v[1], y.v[1]));
00393 }
00394
00395 static inline sse_meta_f4 abs(const sse_meta_f4& x)
00396 {
00397 static const union { int i[4]; __m128 m; } abs4mask __attribute__ ((aligned (16))) = {{0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}};
00398 return sse_meta_f4(_mm_and_ps(x.v, abs4mask.m));
00399 }
00400
00401 static inline sse_meta_d4 abs(const sse_meta_d4& x)
00402 {
00403 static const union { FCL_INT64 i[2]; __m128d m; } abs2mask __attribute__ ((aligned (16))) = {{0x7fffffffffffffff, 0x7fffffffffffffff}};
00404 return sse_meta_d4(_mm_and_pd(x.v[0], abs2mask.m), _mm_and_pd(x.v[1], abs2mask.m));
00405 }
00406
00407 static inline bool equal(const sse_meta_f4& x, const sse_meta_f4& y, float epsilon)
00408 {
00409 register __m128 d = _mm_sub_ps(x.v, y.v);
00410 register __m128 e = _mm_set1_ps(epsilon);
00411 return ((_mm_movemask_ps(_mm_cmplt_ps(d, e)) & 0x7) == 0x7) && ((_mm_movemask_ps(_mm_cmpgt_ps(d, _mm_sub_ps(xmms_0, e))) & 0x7) == 0x7);
00412 }
00413
00414 static inline bool equal(const sse_meta_d4& x, const sse_meta_d4& y, double epsilon)
00415 {
00416 register __m128d d = _mm_sub_pd(x.v[0], y.v[0]);
00417 register __m128d e = _mm_set1_pd(epsilon);
00418
00419 if(_mm_movemask_pd(_mm_cmplt_pd(d, e)) != 0x3) return false;
00420 if(_mm_movemask_pd(_mm_cmpgt_pd(d, _mm_sub_pd(xmmd_0, e))) != 0x3) return false;
00421
00422 d = _mm_sub_pd(x.v[1], y.v[1]);
00423 if((_mm_movemask_pd(_mm_cmplt_pd(d, e)) & 0x1) != 0x1) return false;
00424 if((_mm_movemask_pd(_mm_cmpgt_pd(d, _mm_sub_pd(xmmd_0, e))) & 0x1) != 0x1) return false;
00425 return true;
00426 }
00427
00428 static inline sse_meta_f4 normalize3(const sse_meta_f4& x)
00429 {
00430 register __m128 m = _mm_mul_ps(x.v, x.v);
00431 __m128 r = _mm_add_ps(vec_splat(m, 0), _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
00432 return sse_meta_f4(_mm_mul_ps(x.v, newtonraphson_rsqrt4(r)));
00433 }
00434
00435 static inline sse_meta_f4 normalize3_approx(const sse_meta_f4& x)
00436 {
00437 register __m128 m = _mm_mul_ps(x.v, x.v);
00438 __m128 r = _mm_add_ps(vec_splat(m, 0), _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
00439 return sse_meta_f4(_mm_mul_ps(x.v, _mm_rsqrt_ps(r)));
00440 }
00441
00442
00443 static inline void transpose(__m128 c0, __m128 c1, __m128 c2, __m128* r0, __m128* r1, __m128* r2)
00444 {
00445 static const union { unsigned int i[4]; __m128 m; } selectmask __attribute__ ((aligned(16))) = {{0, 0xffffffff, 0, 0}};
00446 register __m128 t0, t1;
00447 t0 = _mm_unpacklo_ps(c0, c2);
00448 t1 = _mm_unpackhi_ps(c0, c2);
00449 *r0 = _mm_unpacklo_ps(t0, c1);
00450 *r1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 3, 2, 2));
00451 *r1 = vec_sel(*r1, c1, selectmask.i);
00452 *r2 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 1, 1, 0));
00453 *r2 = vec_sel(*r2, vec_splat(c1, 2), selectmask.i);
00454 }
00455
00456
00457 static inline void inverse(__m128 c0, __m128 c1, __m128 c2, __m128* i0, __m128* i1, __m128* i2)
00458 {
00459 __m128 t0, t1, t2, d, invd;
00460 t2 = cross_prod(c0, c1);
00461 t0 = cross_prod(c1, c2);
00462 t1 = cross_prod(c2, c0);
00463 d = dot_prod3(t2, c2);
00464 d = vec_splat(d, 0);
00465 invd = _mm_rcp_ps(d);
00466 transpose(t0, t1, t2, i0, i1, i2);
00467 *i0 = _mm_mul_ps(*i0, invd);
00468 *i1 = _mm_mul_ps(*i1, invd);
00469 *i2 = _mm_mul_ps(*i2, invd);
00470 }
00471
00472
00473 struct sse_meta_f12
00474 {
00475 typedef float meta_type;
00476 typedef sse_meta_f4 vector_type;
00477 sse_meta_f4 c[3];
00478
00479 sse_meta_f12() { setZero(); }
00480
00481 sse_meta_f12(float xx, float xy, float xz,
00482 float yx, float yy, float yz,
00483 float zx, float zy, float zz)
00484 { setValue(xx, xy, xz, yx, yy, yz, zx, zy, zz); }
00485
00486 sse_meta_f12(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z)
00487 { setColumn(x, y, z); }
00488
00489 sse_meta_f12(__m128 x, __m128 y, __m128 z)
00490 { setColumn(x, y, z); }
00491
00492 inline void setValue(float xx, float xy, float xz,
00493 float yx, float yy, float yz,
00494 float zx, float zy, float zz)
00495 {
00496 c[0].setValue(xx, yx, zx, 0);
00497 c[1].setValue(xy, yy, zy, 0);
00498 c[2].setValue(xz, yz, zz, 0);
00499 }
00500
00501 inline void setIdentity()
00502 {
00503 c[0].setValue(1, 0, 0, 0);
00504 c[1].setValue(0, 1, 0, 0);
00505 c[2].setValue(0, 0, 1, 0);
00506 }
00507
00508 inline void setZero()
00509 {
00510 c[0].setValue(0);
00511 c[1].setValue(0);
00512 c[2].setValue(0);
00513 }
00514
00515 inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z)
00516 {
00517 c[0] = x; c[1] = y; c[2] = z;
00518 }
00519
00520 inline void setColumn(__m128 x, __m128 y, __m128 z)
00521 {
00522 c[0].setValue(x); c[1].setValue(y); c[2].setValue(z);
00523 }
00524
00525 inline const sse_meta_f4& getColumn(size_t i) const
00526 {
00527 return c[i];
00528 }
00529
00530 inline sse_meta_f4& getColumn(size_t i)
00531 {
00532 return c[i];
00533 }
00534
00535 inline sse_meta_f4 getRow(size_t i) const
00536 {
00537 return sse_meta_f4(c[0][i], c[1][i], c[2][i], 0);
00538 }
00539
00540 inline float operator () (size_t i, size_t j) const
00541 {
00542 return c[j][i];
00543 }
00544
00545 inline float& operator () (size_t i, size_t j)
00546 {
00547 return c[j][i];
00548 }
00549
00550 inline sse_meta_f4 operator * (const sse_meta_f4& v) const
00551 {
00552 return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, vec_splat(v.v, 0)), _mm_mul_ps(c[1].v, vec_splat(v.v, 1))), _mm_mul_ps(c[2].v, vec_splat(v.v, 2))));
00553 }
00554
00555 inline sse_meta_f12 operator * (const sse_meta_f12& mat) const
00556 {
00557 return sse_meta_f12((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2]);
00558 }
00559
00560 inline sse_meta_f12 operator + (const sse_meta_f12& mat) const
00561 {
00562 return sse_meta_f12(c[0] + mat.c[0], c[1] + mat.c[1], c[2] + mat.c[2]);
00563 }
00564
00565 inline sse_meta_f12 operator - (const sse_meta_f12& mat) const
00566 {
00567 return sse_meta_f12(c[0] - mat.c[0], c[1] - mat.c[1], c[2] - mat.c[2]);
00568 }
00569
00570 inline sse_meta_f12 operator + (float t_) const
00571 {
00572 sse_meta_f4 t(t_);
00573 return sse_meta_f12(c[0] + t, c[1] + t, c[2] + t);
00574 }
00575
00576 inline sse_meta_f12 operator - (float t_) const
00577 {
00578 sse_meta_f4 t(t_);
00579 return sse_meta_f12(c[0] - t, c[1] - t, c[2] - t);
00580 }
00581
00582 inline sse_meta_f12 operator * (float t_) const
00583 {
00584 sse_meta_f4 t(t_);
00585 return sse_meta_f12(c[0] * t, c[1] * t, c[2] * t);
00586 }
00587
00588 inline sse_meta_f12 operator / (float t_) const
00589 {
00590 sse_meta_f4 t(t_);
00591 return sse_meta_f12(c[0] / t, c[1] / t, c[2] / t);
00592 }
00593
00594 inline sse_meta_f12& operator *= (const sse_meta_f12& mat)
00595 {
00596 setColumn((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2]);
00597 return *this;
00598 }
00599
00600 inline sse_meta_f12& operator += (const sse_meta_f12& mat)
00601 {
00602 c[0] += mat.c[0];
00603 c[1] += mat.c[1];
00604 c[2] += mat.c[2];
00605 return *this;
00606 }
00607
00608 inline sse_meta_f12& operator -= (const sse_meta_f12& mat)
00609 {
00610 c[0] -= mat.c[0];
00611 c[1] -= mat.c[1];
00612 c[2] -= mat.c[2];
00613 return *this;
00614 }
00615
00616 inline sse_meta_f12& operator += (float t_)
00617 {
00618 sse_meta_f4 t(t_);
00619 c[0] += t;
00620 c[1] += t;
00621 c[2] += t;
00622 return *this;
00623 }
00624
00625 inline sse_meta_f12& operator -= (float t_)
00626 {
00627 sse_meta_f4 t(t_);
00628 c[0] -= t;
00629 c[1] -= t;
00630 c[2] -= t;
00631 return *this;
00632 }
00633
00634 inline sse_meta_f12& operator *= (float t_)
00635 {
00636 sse_meta_f4 t(t_);
00637 c[0] *= t;
00638 c[1] *= t;
00639 c[2] *= t;
00640 return *this;
00641 }
00642
00643 inline sse_meta_f12& operator /= (float t_)
00644 {
00645 sse_meta_f4 t(t_);
00646 c[0] /= t;
00647 c[1] /= t;
00648 c[2] /= t;
00649 return *this;
00650 }
00651
00652 inline sse_meta_f12& inverse()
00653 {
00654 __m128 inv0, inv1, inv2;
00655 details::inverse(c[0].v, c[1].v, c[2].v, &inv0, &inv1, &inv2);
00656 setColumn(inv0, inv1, inv2);
00657 return *this;
00658 }
00659
00660 inline sse_meta_f12& transpose()
00661 {
00662 __m128 r0, r1, r2;
00663 details::transpose(c[0].v, c[1].v, c[2].v, &r0, &r1, &r2);
00664 setColumn(r0, r1, r2);
00665 return *this;
00666 }
00667
00668 inline sse_meta_f12& abs()
00669 {
00670 c[0] = details::abs(c[0]);
00671 c[1] = details::abs(c[1]);
00672 c[2] = details::abs(c[2]);
00673 return *this;
00674 }
00675
00676 inline float determinant() const
00677 {
00678 return _mm_cvtss_f32(dot_prod3(c[2].v, cross_prod(c[0].v, c[1].v)));
00679 }
00680
00681 inline sse_meta_f12 transposeTimes(const sse_meta_f12& other) const
00682 {
00683 return sse_meta_f12(dot_prod3(c[0], other.c[0]), dot_prod3(c[0], other.c[1]), dot_prod3(c[0], other.c[2]),
00684 dot_prod3(c[1], other.c[0]), dot_prod3(c[1], other.c[1]), dot_prod3(c[1], other.c[2]),
00685 dot_prod3(c[2], other.c[0]), dot_prod3(c[2], other.c[1]), dot_prod3(c[2], other.c[2]));
00686 }
00687
00688 inline sse_meta_f12 timesTranspose(const sse_meta_f12& m) const
00689 {
00690 sse_meta_f12 tmp(m);
00691 return (*this) * tmp.transpose();
00692 }
00693
00694 inline sse_meta_f4 transposeTimes(const sse_meta_f4& v) const
00695 {
00696 return sse_meta_f4(dot_prod3(c[0], v), dot_prod3(c[1], v), dot_prod3(c[2], v));
00697 }
00698
00699 inline float transposeDot(size_t i, const sse_meta_f4& v) const
00700 {
00701 return dot_prod3(c[i], v);
00702 }
00703
00704 inline float dot(size_t i, const sse_meta_f4& v) const
00705 {
00706 return v[0] * c[0][i] + v[1] * c[1][i] + v[2] * c[2][i];
00707 }
00708
00709 };
00710
00711 static inline sse_meta_f12 abs(const sse_meta_f12& mat)
00712 {
00713 return sse_meta_f12(abs(mat.getColumn(0)), abs(mat.getColumn(1)), abs(mat.getColumn(2)));
00714 }
00715
00716 static inline sse_meta_f12 transpose(const sse_meta_f12& mat)
00717 {
00718 __m128 r0, r1, r2;
00719 transpose(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, &r0, &r1, &r2);
00720 return sse_meta_f12(r0, r1, r2);
00721 }
00722
00723
00724 static inline sse_meta_f12 inverse(const sse_meta_f12& mat)
00725 {
00726 __m128 inv0, inv1, inv2;
00727 inverse(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, &inv0, &inv1, &inv2);
00728 return sse_meta_f12(inv0, inv1, inv2);
00729 }
00730
00731
00732 static inline void transpose(__m128 c0, __m128 c1, __m128 c2, __m128 c3,
00733 __m128* r0, __m128* r1, __m128* r2, __m128* r3)
00734 {
00735 __m128 tmp0 = _mm_unpacklo_ps(c0, c2);
00736 __m128 tmp1 = _mm_unpacklo_ps(c1, c3);
00737 __m128 tmp2 = _mm_unpackhi_ps(c0, c2);
00738 __m128 tmp3 = _mm_unpackhi_ps(c1, c3);
00739 *r0 = _mm_unpacklo_ps(tmp0, tmp1);
00740 *r1 = _mm_unpackhi_ps(tmp0, tmp1);
00741 *r2 = _mm_unpacklo_ps(tmp2, tmp3);
00742 *r3 = _mm_unpackhi_ps(tmp2, tmp3);
00743 }
00744
00745
00746 static inline void inverse(__m128 c0, __m128 c1, __m128 c2, __m128 c3,
00747 __m128* res0, __m128* res1, __m128* res2, __m128* res3)
00748 {
00749 __m128 Va, Vb, Vc;
00750 __m128 r1, r2, r3, tt, tt2;
00751 __m128 sum, Det, RDet;
00752 __m128 trns0, trns1, trns2, trns3;
00753
00754
00755
00756 tt = c3; tt2 = _mm_ror_ps(c2,1);
00757 Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));
00758 Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));
00759 Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));
00760
00761 r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));
00762 r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));
00763 r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));
00764
00765 tt = c1;
00766 Va = _mm_ror_ps(tt,1); sum = _mm_mul_ps(Va,r1);
00767 Vb = _mm_ror_ps(tt,2); sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
00768 Vc = _mm_ror_ps(tt,3); sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
00769
00770
00771 Det = _mm_mul_ps(sum,c0);
00772 Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
00773
00774 static const union { int i[4]; __m128 m; } Sign_PNPN __attribute__ ((aligned(16))) = {{0x00000000, 0x80000000, 0x00000000, 0x80000000}};
00775 static const union { int i[4]; __m128 m; } Sign_NPNP __attribute__ ((aligned(16))) = {{0x80000000, 0x00000000, 0x80000000, 0x00000000}};
00776 static const union { float i[4]; __m128 m; } ZERONE __attribute__ ((aligned(16))) = {{1.0f, 0.0f, 0.0f, 1.0f}};
00777
00778 __m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN.m);
00779
00780
00781 tt = _mm_ror_ps(c0,1); sum = _mm_mul_ps(tt,r1);
00782 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
00783 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
00784 __m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP.m);
00785
00786
00787 Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
00788
00789
00790 tt = _mm_ror_ps(c0,1);
00791 Va = _mm_mul_ps(tt,Vb);
00792 Vb = _mm_mul_ps(tt,Vc);
00793 Vc = _mm_mul_ps(tt,c1);
00794
00795 r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));
00796 r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));
00797 r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));
00798
00799 tt = _mm_ror_ps(c3,1); sum = _mm_mul_ps(tt,r1);
00800 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
00801 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
00802 __m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN.m);
00803
00804
00805 RDet = _mm_div_ss(ZERONE.m, Det);
00806 RDet = _mm_shuffle_ps(RDet,RDet,0x00);
00807
00808
00809 mtL1 = _mm_mul_ps(mtL1, RDet);
00810 mtL2 = _mm_mul_ps(mtL2, RDet);
00811 mtL3 = _mm_mul_ps(mtL3, RDet);
00812
00813
00814 tt = _mm_ror_ps(c2,1); sum = _mm_mul_ps(tt,r1);
00815 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
00816 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
00817 __m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP.m);
00818 mtL4 = _mm_mul_ps(mtL4, RDet);
00819
00820
00821 trns0 = _mm_unpacklo_ps(mtL1,mtL2);
00822 trns1 = _mm_unpacklo_ps(mtL3,mtL4);
00823 trns2 = _mm_unpackhi_ps(mtL1,mtL2);
00824 trns3 = _mm_unpackhi_ps(mtL3,mtL4);
00825 *res0 = _mm_movelh_ps(trns0,trns1);
00826 *res1 = _mm_movehl_ps(trns1,trns0);
00827 *res2 = _mm_movelh_ps(trns2,trns3);
00828 *res3 = _mm_movehl_ps(trns3,trns2);
00829 }
00830
00831
00832 struct sse_meta_f16
00833 {
00834 typedef float meta_type;
00835 typedef sse_meta_f4 vector_type;
00836 sse_meta_f4 c[4];
00837
00838 sse_meta_f16() { setZero(); }
00839
00840 sse_meta_f16(float xx, float xy, float xz, float xw,
00841 float yx, float yy, float yz, float yw,
00842 float zx, float zy, float zz, float zw,
00843 float wx, float wy, float wz, float ww)
00844 { setValue(xx, xy, xz, xw, yz, yy, yz, yw, zx, zy, zz, zw, wx, wy, wz, ww); }
00845
00846 sse_meta_f16(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w)
00847 { setColumn(x, y, z, w); }
00848
00849 sse_meta_f16(__m128 x, __m128 y, __m128 z, __m128 w)
00850 { setColumn(x, y, z, w); }
00851
00852 inline void setValue(float xx, float xy, float xz, float xw,
00853 float yx, float yy, float yz, float yw,
00854 float zx, float zy, float zz, float zw,
00855 float wx, float wy, float wz, float ww)
00856 {
00857 c[0].setValue(xx, yx, zx, wx);
00858 c[1].setValue(xy, yy, zy, wy);
00859 c[2].setValue(xz, yz, zz, wz);
00860 c[3].setValue(xw, yw, zw, ww);
00861 }
00862
00863 inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w)
00864 {
00865 c[0] = x; c[1] = y; c[2] = z; c[3] = w;
00866 }
00867
00868 inline void setColumn(__m128 x, __m128 y, __m128 z, __m128 w)
00869 {
00870 c[0].setValue(x); c[1].setValue(y); c[2].setValue(z); c[3].setValue(w);
00871 }
00872
00873 inline void setIdentity()
00874 {
00875 c[0].setValue(1, 0, 0, 0);
00876 c[1].setValue(0, 1, 0, 0);
00877 c[2].setValue(0, 0, 1, 0);
00878 c[3].setValue(0, 0, 0, 1);
00879 }
00880
00881 inline void setZero()
00882 {
00883 c[0].setValue(0);
00884 c[1].setValue(0);
00885 c[2].setValue(0);
00886 c[3].setValue(0);
00887 }
00888
00889 inline const sse_meta_f4& getColumn(size_t i) const
00890 {
00891 return c[i];
00892 }
00893
00894 inline sse_meta_f4& getColumn(size_t i)
00895 {
00896 return c[i];
00897 }
00898
00899 inline sse_meta_f4 getRow(size_t i) const
00900 {
00901 return sse_meta_f4(c[0][i], c[1][i], c[2][i], c[3][i]);
00902 }
00903
00904 inline float operator () (size_t i, size_t j) const
00905 {
00906 return c[j][i];
00907 }
00908
00909 inline float& operator () (size_t i, size_t j)
00910 {
00911 return c[j][i];
00912 }
00913
00914 inline sse_meta_f4 operator * (const sse_meta_f4& v) const
00915 {
00916 return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, vec_splat(v.v, 0)), _mm_mul_ps(c[1].v, vec_splat(v.v, 1))),
00917 _mm_add_ps(_mm_mul_ps(c[2].v, vec_splat(v.v, 2)), _mm_mul_ps(c[3].v, vec_splat(v.v, 3)))
00918 ));
00919 }
00920
00921 inline sse_meta_f16 operator * (const sse_meta_f16& mat) const
00922 {
00923 return sse_meta_f16((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2], (*this) * mat.c[3]);
00924 }
00925
00926
00927 inline sse_meta_f16 operator + (const sse_meta_f16& mat) const
00928 {
00929 return sse_meta_f16(c[0] + mat.c[0], c[1] + mat.c[1], c[2] + mat.c[2], c[3] + mat.c[3]);
00930 }
00931
00932 inline sse_meta_f16 operator - (const sse_meta_f16& mat) const
00933 {
00934 return sse_meta_f16(c[0] - mat.c[0], c[1] - mat.c[1], c[2] - mat.c[2], c[3] - mat.c[3]);
00935 }
00936
00937 inline sse_meta_f16 operator + (float t_) const
00938 {
00939 sse_meta_f4 t(t_);
00940 return sse_meta_f16(c[0] + t, c[1] + t, c[2] + t, c[3] + t);
00941 }
00942
00943 inline sse_meta_f16 operator - (float t_) const
00944 {
00945 sse_meta_f4 t(t_);
00946 return sse_meta_f16(c[0] - t, c[1] - t, c[2] - t, c[3] - t);
00947 }
00948
00949 inline sse_meta_f16 operator * (float t_) const
00950 {
00951 sse_meta_f4 t(t_);
00952 return sse_meta_f16(c[0] * t, c[1] * t, c[2] * t, c[3] * t);
00953 }
00954
00955 inline sse_meta_f16 operator / (float t_) const
00956 {
00957 sse_meta_f4 t(t_);
00958 return sse_meta_f16(c[0] / t, c[1] / t, c[2] / t, c[3] / t);
00959 }
00960
00961 inline sse_meta_f16& operator *= (const sse_meta_f16& mat)
00962 {
00963 setColumn((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2], (*this) * mat.c[3]);
00964 return *this;
00965 }
00966
00967 inline sse_meta_f16& operator += (const sse_meta_f16& mat)
00968 {
00969 c[0] += mat.c[0];
00970 c[1] += mat.c[1];
00971 c[2] += mat.c[2];
00972 c[3] += mat.c[3];
00973 return *this;
00974 }
00975
00976 inline sse_meta_f16& operator -= (const sse_meta_f16& mat)
00977 {
00978 c[0] -= mat.c[0];
00979 c[1] -= mat.c[1];
00980 c[2] -= mat.c[2];
00981 c[3] -= mat.c[3];
00982 return *this;
00983 }
00984
00985 inline sse_meta_f16& operator += (float t_)
00986 {
00987 sse_meta_f4 t(t_);
00988 c[0] += t;
00989 c[1] += t;
00990 c[2] += t;
00991 c[3] += t;
00992 return *this;
00993 }
00994
00995 inline sse_meta_f16& operator -= (float t_)
00996 {
00997 sse_meta_f4 t(t_);
00998 c[0] -= t;
00999 c[1] -= t;
01000 c[2] -= t;
01001 c[3] -= t;
01002 return *this;
01003 }
01004
01005 inline sse_meta_f16& operator *= (float t_)
01006 {
01007 sse_meta_f4 t(t_);
01008 c[0] *= t;
01009 c[1] *= t;
01010 c[2] *= t;
01011 c[3] *= t;
01012 return *this;
01013 }
01014
01015 inline sse_meta_f16& operator /= (float t_)
01016 {
01017 sse_meta_f4 t(t_);
01018 c[0] /= t;
01019 c[1] /= t;
01020 c[2] /= t;
01021 c[3] /= t;
01022 return *this;
01023 }
01024
01025 inline sse_meta_f16& abs()
01026 {
01027 c[0] = details::abs(c[0]);
01028 c[1] = details::abs(c[1]);
01029 c[2] = details::abs(c[2]);
01030 c[3] = details::abs(c[3]);
01031 return *this;
01032 }
01033
01034 inline sse_meta_f16& inverse()
01035 {
01036 __m128 r0, r1, r2, r3;
01037 details::inverse(c[0].v, c[1].v, c[2].v, c[3].v, &r0, &r1, &r2, &r3);
01038 setColumn(r0, r1, r2, r3);
01039 return *this;
01040 }
01041
01042 inline sse_meta_f16& transpose()
01043 {
01044 __m128 r0, r1, r2, r3;
01045 details::transpose(c[0].v, c[1].v, c[2].v, c[3].v, &r0, &r1, &r2, &r3);
01046 setColumn(r0, r1, r2, r3);
01047 return *this;
01048 }
01049
01050 inline float determinant() const
01051 {
01052 __m128 Va, Vb, Vc;
01053 __m128 r1, r2, r3, tt, tt2;
01054 __m128 sum, Det;
01055
01056 __m128 _L1 = c[0].v;
01057 __m128 _L2 = c[1].v;
01058 __m128 _L3 = c[2].v;
01059 __m128 _L4 = c[3].v;
01060
01061
01062
01063 tt = _L4; tt2 = _mm_ror_ps(_L3,1);
01064 Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));
01065 Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));
01066 Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));
01067
01068 r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));
01069 r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));
01070 r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));
01071
01072 tt = _L2;
01073 Va = _mm_ror_ps(tt,1); sum = _mm_mul_ps(Va,r1);
01074 Vb = _mm_ror_ps(tt,2); sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
01075 Vc = _mm_ror_ps(tt,3); sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
01076
01077
01078 Det = _mm_mul_ps(sum,_L1);
01079 Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
01080
01081
01082 tt = _mm_ror_ps(_L1,1); sum = _mm_mul_ps(tt,r1);
01083 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
01084 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
01085
01086
01087 Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
01088 return _mm_cvtss_f32(Det);
01089 }
01090
01091 inline sse_meta_f16 transposeTimes(const sse_meta_f16& other) const
01092 {
01093 return sse_meta_f16(dot_prod4(c[0], other.c[0]), dot_prod4(c[0], other.c[1]), dot_prod4(c[0], other.c[2]), dot_prod4(c[0], other.c[3]),
01094 dot_prod4(c[1], other.c[0]), dot_prod4(c[1], other.c[1]), dot_prod4(c[1], other.c[2]), dot_prod4(c[1], other.c[3]),
01095 dot_prod4(c[2], other.c[0]), dot_prod4(c[2], other.c[1]), dot_prod4(c[2], other.c[2]), dot_prod4(c[2], other.c[3]),
01096 dot_prod4(c[3], other.c[0]), dot_prod4(c[3], other.c[1]), dot_prod4(c[3], other.c[2]), dot_prod4(c[3], other.c[3]));
01097 }
01098
01099 inline sse_meta_f16 timesTranspose(const sse_meta_f16& m) const
01100 {
01101 sse_meta_f16 tmp(m);
01102 return (*this) * tmp.transpose();
01103 }
01104
01105 inline sse_meta_f4 transposeTimes(const sse_meta_f4& v) const
01106 {
01107 return sse_meta_f4(dot_prod4(c[0], v), dot_prod4(c[1], v), dot_prod4(c[2], v), dot_prod4(c[3], v));
01108 }
01109
01110 inline float transposeDot(size_t i, const sse_meta_f4& v) const
01111 {
01112 return dot_prod4(c[i], v);
01113 }
01114
01115 inline float dot(size_t i, const sse_meta_f4& v) const
01116 {
01117 return v[0] * c[0][i] + v[1] * c[1][i] + v[2] * c[2][i] + v[3] * c[3][i];
01118 }
01119
01120 };
01121
01122 static inline sse_meta_f16 abs(const sse_meta_f16& mat)
01123 {
01124 return sse_meta_f16(abs(mat.getColumn(0)), abs(mat.getColumn(1)), abs(mat.getColumn(2)), abs(mat.getColumn(3)));
01125 }
01126
01127 static inline sse_meta_f16 transpose(const sse_meta_f16& mat)
01128 {
01129 __m128 r0, r1, r2, r3;
01130 transpose(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, mat.getColumn(3).v, &r0, &r1, &r2, &r3);
01131 return sse_meta_f16(r0, r1, r2, r3);
01132 }
01133
01134
01135 static inline sse_meta_f16 inverse(const sse_meta_f16& mat)
01136 {
01137 __m128 r0, r1, r2, r3;
01138 inverse(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, mat.getColumn(3).v, &r0, &r1, &r2, &r3);
01139 return sse_meta_f16(r0, r1, r2, r3);
01140 }
01141
01142
01143
01144
01145 }
01146 }
01147
01148
01149 #endif