GteIntelSSE.h
Go to the documentation of this file.
1 // David Eberly, Geometric Tools, Redmond WA 98052
2 // Copyright (c) 1998-2017
3 // Distributed under the Boost Software License, Version 1.0.
4 // http://www.boost.org/LICENSE_1_0.txt
5 // http://www.geometrictools.com/License/Boost/LICENSE_1_0.txt
6 // File Version: 3.0.0 (2016/06/19)
7 
8 #pragma once
9 
10 #include <GTEngineDEF.h>
11 #include <cmath>
12 #include <cstdint>
13 #include <xmmintrin.h>
14 #include <emmintrin.h>
15 
16 namespace gte
17 {
18 // Support for Intel's Streaming SIMD Extensions (SSE) using 128-bit registers
19 // that store four 32-bit floating-point numbers.
21 {
22 public:
23  // The representation of the SIMD 4-tuple.
25  {
26  public:
27  // Information about vectors.
28  enum { NUM_ELEMENTS = 4 };
29  typedef float ElementType;
30 
31  // Construction.
32  Vector ();
33  Vector (Vector const& vec);
34  Vector (__m128 const vec);
35  Vector (__m128i const vec);
36  Vector (float number);
37  Vector (float n0, float n1, float n2, float n3);
38  Vector (uint32_t encoding);
39  Vector (uint32_t e0, uint32_t e1, uint32_t e2, uint32_t e3);
40 
41  // Assignment.
42  Vector& operator= (Vector const& vec);
43  Vector& operator= (__m128 const vec);
44  Vector& operator= (__m128i const vec);
45 
46  // Implicit conversions.
47  operator __m128 ();
48  operator __m128 () const;
49  operator __m128i ();
50  operator __m128i () const;
51 
52  protected:
53  __m128 mTuple;
54  };
55 
56  // The representation of the SIMD 4x4-table.
58  {
59  public:
60  // Information about matrices.
61  enum
62  {
63  NUM_ROWS = 4,
64  NUM_COLS = 4,
65  NUM_ELEMENTS = 16,
66 #if defined(GTE_USE_ROW_MAJOR)
67  STORAGE_ROW_MAJOR = 1,
68 #else
69  STORAGE_ROW_MAJOR = 0,
70 #endif
71  };
72  typedef float ElementType;
73 
74  // Construction.
75  Matrix ();
76  Matrix (Matrix const& mat);
77  Matrix (__m128 const* mat);
78  Matrix (
79  float m00, float m01, float m02, float m03,
80  float m10, float m11, float m12, float m13,
81  float m20, float m21, float m22, float m23,
82  float m30, float m31, float m32, float m33);
83 
84  // Assignment.
85  Matrix& operator= (Matrix const& mat);
86  Matrix& operator= (__m128 const* mat);
87 
88  // Implicit conversions.
89  operator __m128* ();
90  operator __m128 const* () const;
91 
92  // Access to the slices (rows or columns) of the matrix.
93  __m128 const& operator[] (int i) const;
94  __m128& operator[] (int i);
95 
96  protected:
97  // mTable[i] is row i for row-major storage but is column i for
98  // column-major order.
99  __m128 mTable[4];
100  };
101 
102 public:
103  // Logical operations.
104  inline static __m128 Not (__m128 const v); // ~v
105  inline static __m128 And (__m128 const v0, __m128 const v1); // v0 & v1
106  inline static __m128 AndNot (__m128 const v0, __m128 const v1); // ~v0 & v1
107  inline static __m128 Or (__m128 const v0, __m128 const v1); // v0 | v1
108  inline static __m128 Xor (__m128 const v0, __m128 const v1); // v0 ^ v1
109  inline static __m128 Select (__m128 const c, __m128 const v0, __m128 const v1); // (c & v0) | (~c & v1)
110 
111  // Comparisons.
112  inline static __m128 Equal (__m128 const v0, __m128 const v1); // v0 == v1
113  inline static __m128 NotEqual (__m128 const v0, __m128 const v1); // v0 != v1
114  inline static __m128 Less (__m128 const v0, __m128 const v1); // v0 < v1
115  inline static __m128 LessEqual (__m128 const v0, __m128 const v1); // v0 <= v1
116  inline static __m128 Greater (__m128 const v0, __m128 const v1); // v0 > v1
117  inline static __m128 GreaterEqual (__m128 const v0, __m128 const v1); // v0 >= v1
118 
119  // Vector arithmetic operations.
120  inline static __m128 Negate (__m128 const v); // -v
121  inline static __m128 Add (__m128 const v0, __m128 const v1); // v0 + v1
122  inline static __m128 Subtract (__m128 const v0, __m128 const v1); // v0 - v1
123  inline static __m128 Multiply (__m128 const v0, __m128 const v1); // v0 * v1
124  inline static __m128 Divide (__m128 const v0, __m128 const v1); // v0 / v1
125  inline static __m128 Round (__m128 const v);
126  inline static __m128 MaximumAbsoluteComponent (__m128 const v);
127 
128  // Vector algebraic operations.
129  inline static __m128 Dot (__m128 const v0, __m128 const v1);
130  inline static __m128 Length (__m128 const v);
131  inline static __m128 LengthRobust (__m128 const v);
132  inline static __m128 Normalize (__m128 const v);
133  inline static __m128 NormalizeGetLength (__m128 const v, __m128& length);
134  inline static __m128 NormalizeRobust (__m128 const v);
135  inline static __m128 NormalizeRobustGetLength (__m128 const v, __m128& length);
136  inline static __m128 Cross (__m128 const v0, __m128 const v1);
137 
138  // Matrix arithmetic operations.
139  inline static void Negate (__m128 const* M, __m128* result);
140  inline static void Add (__m128 const* A, __m128 const*B, __m128* result);
141  inline static void Subtract (__m128 const* A, __m128 const* B, __m128* result);
142  inline static void Multiply (__m128 const* M, __m128 const c, __m128* result);
143  inline static void Divide (__m128 const* M, __m128 const c, __m128* result);
144 
145  // Matrix geometric operations.
146  inline static void Transpose (__m128 const* mat, __m128* trn);
147  inline static void Inverse (__m128 const* mat, __m128* inv);
148  inline static void Adjoint (__m128 const* mat, __m128* adj);
149  inline static __m128 Determinant (__m128 const* mat);
150  inline static __m128 L1Norm (__m128 const* mat);
151  inline static __m128 L2Norm (__m128 const* mat);
152  inline static __m128 LInfinityNorm (__m128 const* mat);
153 
154  // Matrix-matrix products.
155  inline static void MultiplyAB (__m128 const* A, __m128 const* B, __m128* AB);
156  inline static void MultiplyATB (__m128 const* A, __m128 const* B, __m128* ATB);
157  inline static void MultiplyABT (__m128 const* A, __m128 const* B, __m128* ABT);
158  inline static void MultiplyATBT (__m128 const* A, __m128 const* B, __m128* ATBT);
159  inline static void MultiplyDM (__m128 const D, __m128 const* M, __m128* DM);
160  inline static void MultiplyMD (__m128 const* M, __m128 const D, __m128* MD);
161 
162  // Matrix-vector products.
163  inline static __m128 MultiplyMV (__m128 const* M, __m128 const V);
164  inline static __m128 MultiplyVM (__m128 const V, __m128 const* M);
165 
166  // Quaternion support. In QSlerp, the 't' component must be the splat of
167  // a floating-point scalar in [0,1], and q0 and q1 must be unit-length
168  // quaternions.
169  inline static __m128 QMultiply (__m128 const q0, __m128 const q1);
170  inline static __m128 QConjugate (__m128 const q);
171  inline static __m128 QInverse (__m128 const q);
172  inline static __m128 QSlerp (__m128 const t, __m128 const q0, __m128 const q1);
173 
174  // Function evaluations (generally slow, CPU function call per component).
175  inline static __m128 Sin (__m128 const v);
176  inline static __m128 Cos (__m128 const v);
177  inline static __m128 Tan (__m128 const v);
178  inline static __m128 ASin (__m128 const v);
179  inline static __m128 ACos (__m128 const v);
180  inline static __m128 ATan (__m128 const v);
181 
182  // Fast function approximations.
183 
184  // The SinAppr* functions require |x| <= pi/2. When x is in this domain,
185  // just call SinAppr*(x). When x is not in the domain, call
186  // ReduceAnglesSin(x,y) to obtain y in the domain with sin(x) = sin(y),
187  // and then call SinAppr*(y).
188  inline static void ReduceAnglesSin (__m128 const x, __m128& y);
189  inline static __m128 SinApprDeg11 (__m128 const x);
190  inline static __m128 SinApprDeg7 (__m128 const x);
191 
192  // The CosAppr* functions require |x| <= pi/2. When x is in this domain,
193  // just call CosAppr*(x, SIMD::PPPP). When x is not in the domain,
194  // call ReduceAnglesCos(x,y,sign) to obtain y in the domain with cos(x) =
195  // sign*cos(y), and then call CosAppr*(y,sign).
196  inline static void ReduceAnglesCos (__m128 const x, __m128& y, __m128& sign);
197  inline static __m128 CosApprDeg10 (__m128 const x, __m128 const sign);
198  inline static __m128 CosApprDeg6 (__m128 const x, __m128 const sign);
199 
200  // Integer masks.
201  static Vector const ZZZZ; // (0x00000000, 0x00000000, 0x00000000, 0x00000000)
202  static Vector const ZZZF; // (0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF)
203  static Vector const ZZFZ; // (0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000)
204  static Vector const ZZFF; // (0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF)
205  static Vector const ZFZZ; // (0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000)
206  static Vector const ZFZF; // (0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)
207  static Vector const ZFFZ; // (0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)
208  static Vector const ZFFF; // (0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
209  static Vector const FZZZ; // (0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)
210  static Vector const FZZF; // (0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF)
211  static Vector const FZFZ; // (0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)
212  static Vector const FZFF; // (0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF)
213  static Vector const FFZZ; // (0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)
214  static Vector const FFZF; // (0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)
215  static Vector const FFFZ; // (0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)
216  static Vector const FFFF; // (0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
217  static Vector const SIGN; // (0x80000000, 0x80000000, 0x80000000, 0x80000000)
218  static Vector const NSIGN; // (0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF)
219  static Vector const NOFRC; // (0x00800000, 0x00800000, 0x00800000, 0x00800000)
220 
221  // Numeric constants.
222  static Vector const PZZZ; // (+1.0f, 0.0f, 0.0f, 0.0f)
223  static Vector const ZPZZ; // ( 0.0f, +1.0f, 0.0f, 0.0f)
224  static Vector const ZZPZ; // ( 0.0f, 0.0f, +1.0f, 0.0f)
225  static Vector const ZZZP; // ( 0.0f, 0.0f, 0.0f, +1.0f)
226  static Vector const MZZZ; // (-1.0f, 0.0f, 0.0f, 0.0f)
227  static Vector const ZMZZ; // ( 0.0f, -1.0f, 0.0f, 0.0f)
228  static Vector const ZZMZ; // ( 0.0f, 0.0f, -1.0f, 0.0f)
229  static Vector const ZZZM; // ( 0.0f, 0.0f, 0.0f, -1.0f)
230  static Vector const MMMM; // (-1.0f, -1.0f, -1.0f, -1.0f)
231  static Vector const MMMP; // (-1.0f, -1.0f, -1.0f, +1.0f)
232  static Vector const MMPM; // (-1.0f, -1.0f, +1.0f, -1.0f)
233  static Vector const MMPP; // (-1.0f, -1.0f, +1.0f, +1.0f)
234  static Vector const MPMM; // (-1.0f, +1.0f, -1.0f, -1.0f)
235  static Vector const MPMP; // (-1.0f, +1.0f, -1.0f, +1.0f)
236  static Vector const MPPM; // (-1.0f, +1.0f, +1.0f, -1.0f)
237  static Vector const MPPP; // (-1.0f, +1.0f, +1.0f, +1.0f)
238  static Vector const PMMM; // (+1.0f, -1.0f, -1.0f, -1.0f)
239  static Vector const PMMP; // (+1.0f, -1.0f, -1.0f, +1.0f)
240  static Vector const PMPM; // (+1.0f, -1.0f, +1.0f, -1.0f)
241  static Vector const PMPP; // (+1.0f, -1.0f, +1.0f, +1.0f)
242  static Vector const PPMM; // (+1.0f, +1.0f, -1.0f, -1.0f)
243  static Vector const PPMP; // (+1.0f, +1.0f, -1.0f, +1.0f)
244  static Vector const PPPM; // (+1.0f, +1.0f, +1.0f, -1.0f)
245  static Vector const PPPP; // (+1.0f, +1.0f, +1.0f, +1.0f)
246  static Vector const UNIT[4]; // = {PZZZ, ZPZZ, ZZPZ, ZZZP};
247 
248  // Constants involving pi.
249  static Vector const PI;
250  static Vector const HALF_PI;
251  static Vector const TWO_PI;
252  static Vector const INV_PI;
253  static Vector const INV_TWO_PI;
254 
255 private:
256  // Support for computing the adjoint, determinant, and inverse.
257  inline static void GetAdjDet (__m128 const* mat, __m128* adj, __m128* det);
258 
259  // Constants to support approximations of sin(x).
266  static Vector const C_SIN_APPR_DEG7_0;
267  static Vector const C_SIN_APPR_DEG7_1;
268  static Vector const C_SIN_APPR_DEG7_2;
269  static Vector const C_SIN_APPR_DEG7_3;
270 
271  // Constants to support approximations of cos(x).
278  static Vector const C_COS_APPR_DEG6_0;
279  static Vector const C_COS_APPR_DEG6_1;
280  static Vector const C_COS_APPR_DEG6_2;
281  static Vector const C_COS_APPR_DEG6_3;
282 };
283 
284 
285 // SIMD::Vector
286 
288 {
289  // Uninitialized.
290 }
291 
292 inline SIMD::Vector::Vector(Vector const& vec)
293  :
294  mTuple(vec.mTuple)
295 {
296 }
297 
298 inline SIMD::Vector::Vector(__m128 const vec)
299  :
300  mTuple(vec)
301 {
302 }
303 
304 inline SIMD::Vector::Vector(__m128i const vec)
305  :
306  mTuple(_mm_castsi128_ps(vec))
307 {
308 }
309 
310 inline SIMD::Vector::Vector(float number)
311 {
312  mTuple = _mm_set1_ps(number);
313 }
314 
315 inline SIMD::Vector::Vector(float n0, float n1, float n2, float n3)
316 {
317  mTuple = _mm_set_ps(n3, n2, n1, n0);
318 }
319 
320 inline SIMD::Vector::Vector(uint32_t encoding)
321 {
322  mTuple = _mm_castsi128_ps(_mm_set1_epi32(encoding));
323 }
324 
325 inline SIMD::Vector::Vector(uint32_t e0, uint32_t e1, uint32_t e2, uint32_t e3)
326 {
327  mTuple = _mm_castsi128_ps(_mm_set_epi32(e3, e2, e1, e0));
328 }
329 
331 {
332  mTuple = vec.mTuple;
333  return *this;
334 }
335 
336 inline SIMD::Vector& SIMD::Vector::operator= (__m128 const vec)
337 {
338  mTuple = vec;
339  return *this;
340 }
341 
342 inline SIMD::Vector& SIMD::Vector::operator= (__m128i const vec)
343 {
344  mTuple = _mm_castsi128_ps(vec);
345  return *this;
346 }
347 
348 inline SIMD::Vector::operator __m128 ()
349 {
350  return mTuple;
351 }
352 
353 inline SIMD::Vector::operator __m128 () const
354 {
355  return mTuple;
356 }
357 
358 inline SIMD::Vector::operator __m128i ()
359 {
360  return _mm_castps_si128(mTuple);
361 }
362 
363 inline SIMD::Vector::operator __m128i () const
364 {
365  return _mm_castps_si128(mTuple);
366 }
367 
368 
369 
370 // SIMD::Matrix
371 
373 {
374  // Uninitialized.
375 }
376 
377 inline SIMD::Matrix::Matrix(Matrix const& mat)
378 {
379  mTable[0] = mat.mTable[0];
380  mTable[1] = mat.mTable[1];
381  mTable[2] = mat.mTable[2];
382  mTable[3] = mat.mTable[3];
383 }
384 
385 inline SIMD::Matrix::Matrix(__m128 const* mat)
386 {
387  mTable[0] = mat[0];
388  mTable[1] = mat[1];
389  mTable[2] = mat[2];
390  mTable[3] = mat[3];
391 }
392 
394  float m00, float m01, float m02, float m03,
395  float m10, float m11, float m12, float m13,
396  float m20, float m21, float m22, float m23,
397  float m30, float m31, float m32, float m33)
398 {
399 #if defined(GTE_USE_ROW_MAJOR)
400  mTable[0] = _mm_setr_ps(m00, m01, m02, m03);
401  mTable[1] = _mm_setr_ps(m10, m11, m12, m13);
402  mTable[2] = _mm_setr_ps(m20, m21, m22, m23);
403  mTable[3] = _mm_setr_ps(m30, m31, m32, m33);
404 #else
405  mTable[0] = _mm_setr_ps(m00, m10, m20, m30);
406  mTable[1] = _mm_setr_ps(m01, m11, m21, m31);
407  mTable[2] = _mm_setr_ps(m02, m12, m22, m32);
408  mTable[3] = _mm_setr_ps(m03, m13, m23, m33);
409 #endif
410 }
411 
413 {
414  mTable[0] = mat.mTable[0];
415  mTable[1] = mat.mTable[1];
416  mTable[2] = mat.mTable[2];
417  mTable[3] = mat.mTable[3];
418  return *this;
419 }
420 
421 inline SIMD::Matrix& SIMD::Matrix::operator= (__m128 const* mat)
422 {
423  mTable[0] = mat[0];
424  mTable[1] = mat[1];
425  mTable[2] = mat[2];
426  mTable[3] = mat[3];
427  return *this;
428 }
429 
430 inline SIMD::Matrix::operator __m128* ()
431 {
432  return mTable;
433 }
434 
435 inline SIMD::Matrix::operator __m128 const* () const
436 {
437  return mTable;
438 }
439 
440 inline __m128 const& SIMD::Matrix::operator[] (int i) const
441 {
442  return mTable[i];
443 }
444 
445 inline __m128& SIMD::Matrix::operator[] (int i)
446 {
447  return mTable[i];
448 }
449 
450 
451 
452 // Logical operations.
453 
454 inline __m128 SIMD::Not(__m128 const v)
455 {
456  return _mm_xor_ps(v, FFFF);
457 }
458 
459 inline __m128 SIMD::And(__m128 const v0, __m128 const v1)
460 {
461  return _mm_and_ps(v0, v1);
462 }
463 
464 inline __m128 SIMD::AndNot(__m128 const v0, __m128 const v1)
465 {
466  return _mm_andnot_ps(v0, v1);
467 }
468 
469 inline __m128 SIMD::Or(__m128 const v0, __m128 const v1)
470 {
471  return _mm_or_ps(v0, v1);
472 }
473 
474 inline __m128 SIMD::Xor(__m128 const v0, __m128 const v1)
475 {
476  return _mm_xor_ps(v0, v1);
477 }
478 
479 inline __m128 SIMD::Select(__m128 const c, __m128 const v0, __m128 const v1)
480 {
481  return _mm_or_ps(_mm_and_ps(c, v0), _mm_andnot_ps(c, v1));
482 }
483 
484 
485 
486 // Comparisons.
487 
488 inline __m128 SIMD::Equal(__m128 const v0, __m128 const v1)
489 {
490  return _mm_cmpeq_ps(v0, v1);
491 }
492 
493 inline __m128 SIMD::NotEqual(__m128 const v0, __m128 const v1)
494 {
495  return _mm_cmpneq_ps(v0, v1);
496 }
497 
498 inline __m128 SIMD::Less(__m128 const v0, __m128 const v1)
499 {
500  return _mm_cmplt_ps(v0, v1);
501 }
502 
503 inline __m128 SIMD::LessEqual(__m128 const v0, __m128 const v1)
504 {
505  return _mm_cmple_ps(v0, v1);
506 }
507 
508 inline __m128 SIMD::Greater(__m128 const v0, __m128 const v1)
509 {
510  return _mm_cmpgt_ps(v0, v1);
511 }
512 
513 inline __m128 SIMD::GreaterEqual(__m128 const v0, __m128 const v1)
514 {
515  return _mm_cmpge_ps(v0, v1);
516 }
517 
518 
519 
520 // Vector arithmetic operations.
521 
522 inline __m128 SIMD::Negate(__m128 const v)
523 {
524  return _mm_xor_ps(v, SIGN);
525 }
526 
527 inline __m128 SIMD::Add(__m128 const v0, __m128 const v1)
528 {
529  return _mm_add_ps(v0, v1);
530 }
531 
532 inline __m128 SIMD::Subtract(__m128 const v0, __m128 const v1)
533 {
534  return _mm_sub_ps(v0, v1);
535 }
536 
537 inline __m128 SIMD::Multiply(__m128 const v0, __m128 const v1)
538 {
539  return _mm_mul_ps(v0, v1);
540 }
541 
542 inline __m128 SIMD::Divide(__m128 const v0, __m128 const v1)
543 {
544  return _mm_div_ps(v0, v1);
545 }
546 
547 inline __m128 SIMD::Round(__m128 const v)
548 {
549  __m128 t0 = _mm_and_ps(NSIGN, v);
550  t0 = _mm_castsi128_ps(_mm_cmplt_epi32(_mm_castps_si128(t0), NOFRC));
551  __m128i t1 = _mm_cvtps_epi32(v); // float-to-int
552  __m128 t2 = _mm_cvtepi32_ps(t1); // int-to-float
553  t2 = _mm_and_ps(t2, t0);
554  t0 = _mm_andnot_ps(t0, v);
555  t2 = _mm_or_ps(t2, t0);
556  return t2;
557 }
558 
559 inline __m128 SIMD::MaximumAbsoluteComponent(__m128 const v)
560 {
561  __m128 vAbs = _mm_andnot_ps(SIGN, v);
562  __m128 max0 = _mm_shuffle_ps(vAbs, vAbs, _MM_SHUFFLE(0, 0, 0, 0));
563  __m128 max1 = _mm_shuffle_ps(vAbs, vAbs, _MM_SHUFFLE(1, 1, 1, 1));
564  __m128 max2 = _mm_shuffle_ps(vAbs, vAbs, _MM_SHUFFLE(2, 2, 2, 2));
565  __m128 max3 = _mm_shuffle_ps(vAbs, vAbs, _MM_SHUFFLE(3, 3, 3, 3));
566  max0 = _mm_max_ps(max0, max1);
567  max2 = _mm_max_ps(max2, max3);
568  max0 = _mm_max_ps(max0, max2);
569  return max0;
570 }
571 
572 
573 
574 // Vector algebraic operations.
575 
576 inline __m128 SIMD::Dot(__m128 const v0, __m128 const v1)
577 {
578  // (x0*x1, y0*y1, z0*z1, w0*w1)
579  __m128 t0 = _mm_mul_ps(v0, v1);
580 
581  // (y0*y1, x0*x1, w0*w1, z0*z1)
582  __m128 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 3, 0, 1));
583 
584  // (x0*x1 + y0*y1, x0*x1 + y0*y1, z0*z1 + w0*w1, z0*z1 + w0*w1)
585  __m128 t2 = _mm_add_ps(t0, t1);
586 
587  // (z0*z1 + w0*w1, z0*z1 + w0*w1, x0*x1 + y0*y1, x0*x1 + y0*y1)
588  __m128 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 2, 2));
589 
590  // (dot, dot, dot, dot)
591  __m128 dotSplat = _mm_add_ps(t2, t3);
592  return dotSplat;
593 }
594 
595 inline __m128 SIMD::Length(__m128 const v)
596 {
597  __m128 sqrLength = Dot(v, v);
598  return _mm_sqrt_ps(sqrLength);
599 }
600 
601 inline __m128 SIMD::LengthRobust(__m128 const v)
602 {
603  // Compute the maximum absolute value component.
604  __m128 maxComponent = MaximumAbsoluteComponent(v);
605 
606  // Divide by the maximum absolute component. This is potentially a
607  // divide by zero.
608  __m128 normalized = _mm_div_ps(v, maxComponent);
609 
610  // Set to zero when the original length is zero.
611  __m128 mask = _mm_cmpneq_ps(ZZZZ, maxComponent);
612  normalized = _mm_and_ps(mask, normalized);
613 
614  // (sqrLength, sqrLength, sqrLength, sqrLength)
615  __m128 sqrLength = Dot(normalized, normalized);
616  __m128 length = _mm_sqrt_ps(sqrLength);
617  return length;
618 }
619 
620 inline __m128 SIMD::Normalize(__m128 const v)
621 {
622  // (sqrLength, sqrLength, sqrLength, sqrLength)
623  __m128 sqrLength = Dot(v, v);
624 
625  // (length, length, length, length)
626  __m128 length = _mm_sqrt_ps(sqrLength);
627 
628  // Divide by the length to normalize. This is potentially a divide by
629  // zero or a divide by infinity.
630  __m128 normalized = _mm_div_ps(v, length);
631 
632  // Set to zero when the original length is zero.
633  __m128 mask = _mm_cmpneq_ps(ZZZZ, length);
634  normalized = _mm_and_ps(mask, normalized);
635  return normalized;
636 }
637 
638 inline __m128 SIMD::NormalizeGetLength(__m128 const v, __m128& length)
639 {
640  // (sqrLength, sqrLength, sqrLength, sqrLength)
641  __m128 sqrLength = Dot(v, v);
642 
643  // (length, length, length, length)
644  length = _mm_sqrt_ps(sqrLength);
645 
646  // Divide by the length to normalize. This is potentially a divide by
647  // zero or a divide by infinity.
648  __m128 normalized = _mm_div_ps(v, length);
649 
650  // Set to zero when the original length is zero.
651  __m128 mask = _mm_cmpneq_ps(ZZZZ, length);
652  normalized = _mm_and_ps(mask, normalized);
653  length = _mm_and_ps(mask, length);
654  return normalized;
655 }
656 
657 inline __m128 SIMD::NormalizeRobust(__m128 const v)
658 {
659  // Compute the maximum absolute value component.
660  __m128 maxComponent = MaximumAbsoluteComponent(v);
661 
662  // Divide by the maximum absolute component. This is potentially a
663  // divide by zero.
664  __m128 normalized = _mm_div_ps(v, maxComponent);
665 
666  // Set to zero when the original length is zero.
667  __m128 mask = _mm_cmpneq_ps(ZZZZ, maxComponent);
668  normalized = _mm_and_ps(mask, normalized);
669 
670  // (sqrLength, sqrLength, sqrLength, sqrLength)
671  __m128 sqrLength = Dot(normalized, normalized);
672 
673  // (length, length, length, length)
674  __m128 length = _mm_sqrt_ps(sqrLength);
675 
676  // Divide by the length to normalize. This is potentially a divide by
677  // zero.
678  normalized = _mm_div_ps(normalized, length);
679 
680  // Set to zero when the original length is zero or infinity. In the
681  // latter case, this is considered to be an unexpected condition.
682  normalized = _mm_and_ps(mask, normalized);
683  return normalized;
684 }
685 
686 inline __m128 SIMD::NormalizeRobustGetLength(__m128 const v, __m128& length)
687 {
688  // Compute the maximum absolute value component.
689  __m128 maxComponent = MaximumAbsoluteComponent(v);
690 
691  // Divide by the maximum absolute component. This is potentially a
692  // divide by zero.
693  __m128 normalized = _mm_div_ps(v, maxComponent);
694 
695  // Set to zero when the original length is zero.
696  __m128 mask = _mm_cmpneq_ps(ZZZZ, maxComponent);
697  normalized = _mm_and_ps(mask, normalized);
698 
699  // (sqrLength, sqrLength, sqrLength, sqrLength)
700  __m128 sqrLength = Dot(normalized, normalized);
701 
702  // (length, length, length, length)
703  length = _mm_sqrt_ps(sqrLength);
704 
705  // Divide by the length to normalize. This is potentially a divide by
706  // zero.
707  normalized = _mm_div_ps(normalized, length);
708  length = _mm_mul_ps(length, maxComponent); // true length
709 
710  // Set to zero when the original length is zero or infinity. In the
711  // latter case, this is considered to be an unexpected condition.
712  normalized = _mm_and_ps(mask, normalized);
713  length = _mm_and_ps(mask, length);
714  return normalized;
715 }
716 
717 inline __m128 SIMD::Cross(__m128 const v0, __m128 const v1)
718 {
719  // v0 = (x0, y0, z0, 0), v1 = (x1, y1, z1, 0)
720  // cross = (y0*z1 - z0*y1, z0*x1 - x0*z1, x0*y1 - y0*x1, 0)
721 
722  // (y0, z0, x0, 0)
723  __m128 t0 = _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(3, 0, 2, 1));
724 
725  // (z1, x1, y1, 0)
726  __m128 t1 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 1, 0, 2));
727 
728  // (y0*z1, z0*x1, x0*y1, 0)
729  __m128 cross = _mm_mul_ps(t0, t1);
730 
731  // (z0, x0, y0, 0), computed from t0, not v0
732  t0 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 0, 2, 1));
733 
734  // (y1, z1, x1, 0), computed from t1, not v1
735  t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 1, 0, 2));
736 
737  // (z0*y1, x0*z1, y0*x1, 0)
738  t0 = _mm_mul_ps(t0, t1);
739 
740  // (y0*z1 - z0*y1, z0*x1 - x0*z1, x0*y1 - y0*x1, 0)
741  cross = _mm_sub_ps(cross, t0);
742  return cross;
743 }
744 
745 
746 
747 // Matrix arithmetic operations.
748 
749 inline void SIMD::Negate(__m128 const* M, __m128* result)
750 {
751  result[0] = _mm_sub_ps(SIMD::ZZZZ, M[0]);
752  result[1] = _mm_sub_ps(SIMD::ZZZZ, M[1]);
753  result[2] = _mm_sub_ps(SIMD::ZZZZ, M[2]);
754  result[3] = _mm_sub_ps(SIMD::ZZZZ, M[3]);
755 }
756 
757 inline void SIMD::Add(__m128 const* A, __m128 const* B, __m128* result)
758 {
759  result[0] = _mm_add_ps(A[0], B[0]);
760  result[1] = _mm_add_ps(A[1], B[1]);
761  result[2] = _mm_add_ps(A[2], B[2]);
762  result[3] = _mm_add_ps(A[3], B[3]);
763 }
764 
765 inline void SIMD::Subtract(__m128 const* A, __m128 const*B, __m128* result)
766 {
767  result[0] = _mm_sub_ps(A[0], B[0]);
768  result[1] = _mm_sub_ps(A[1], B[1]);
769  result[2] = _mm_sub_ps(A[2], B[2]);
770  result[3] = _mm_sub_ps(A[3], B[3]);
771 }
772 
773 inline void SIMD::Multiply(__m128 const* M, __m128 const c, __m128* result)
774 {
775  result[0] = _mm_mul_ps(M[0], c);
776  result[1] = _mm_mul_ps(M[1], c);
777  result[2] = _mm_mul_ps(M[2], c);
778  result[3] = _mm_mul_ps(M[3], c);
779 }
780 
781 inline void SIMD::Divide(__m128 const* M, __m128 const c, __m128* result)
782 {
783  result[0] = _mm_div_ps(M[0], c);
784  result[1] = _mm_div_ps(M[1], c);
785  result[2] = _mm_div_ps(M[2], c);
786  result[3] = _mm_div_ps(M[3], c);
787 }
788 
789 
790 
791 // Matrix geometric operations.
792 
793 inline void SIMD::Transpose(__m128 const* mat, __m128* trn)
794 {
795  // VM:(m00, m01, m10, m11), MV:(m00, m10, m01, m11)
796  __m128 s0 = _mm_shuffle_ps(mat[0], mat[1], _MM_SHUFFLE(1, 0, 1, 0));
797  // VM:(m20, m21, m30, m31), MV:(m02, m12, m03, m13)
798  __m128 s1 = _mm_shuffle_ps(mat[2], mat[3], _MM_SHUFFLE(1, 0, 1, 0));
799  // VM:(m02, m03, m12, m13), MV:(m20, m30, m21, m31)
800  __m128 s2 = _mm_shuffle_ps(mat[0], mat[1], _MM_SHUFFLE(3, 2, 3, 2));
801  // VM:(m22, m23, m32, m33), MV:(m22, m32, m23, m33)
802  __m128 s3 = _mm_shuffle_ps(mat[2], mat[3], _MM_SHUFFLE(3, 2, 3, 2));
803 
804  // VM:(m00, m10, m20, m30), MV:(m00, m01, m02, m03)
805  trn[0] = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(2, 0, 2, 0));
806  // VM:(m01, m11, m21, m31), MV:(m10, m11, m12, m13)
807  trn[1] = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1));
808  // VM:(m02, m12, m22, m32), MV:(m20, m21, m22, m23)
809  trn[2] = _mm_shuffle_ps(s2, s3, _MM_SHUFFLE(2, 0, 2, 0));
810  // VM:(m03, m13, m23, m33), MV:(m30, m31, m32, m33)
811  trn[3] = _mm_shuffle_ps(s2, s3, _MM_SHUFFLE(3, 1, 3, 1));
812 }
813 
814 inline void SIMD::Inverse(__m128 const* mat, __m128* inv)
815 {
816  __m128 det;
817  GetAdjDet(mat, inv, &det);
818 
819  // Compute the reciprocal of the determinant. Guard against a division
820  // by zero. When the determinant is zero, the zero matrix is returned.
821  __m128 invDet = _mm_div_ps(PPPP, det);
822  __m128 neqZero = _mm_cmpneq_ps(det, ZZZZ);
823  invDet = _mm_and_ps(neqZero, invDet);
824 
825  inv[0] = _mm_mul_ps(inv[0], invDet);
826  inv[1] = _mm_mul_ps(inv[1], invDet);
827  inv[2] = _mm_mul_ps(inv[2], invDet);
828  inv[3] = _mm_mul_ps(inv[3], invDet);
829 }
830 
831 inline void SIMD::Adjoint(__m128 const* mat, __m128* adj)
832 {
833  GetAdjDet(mat, adj, 0);
834 }
835 
836 inline __m128 SIMD::Determinant(__m128 const* mat)
837 {
838  __m128 det;
839  GetAdjDet(mat, 0, &det);
840  return det;
841 }
842 
843 inline __m128 SIMD::L1Norm(__m128 const* mat)
844 {
845  __m128 sum = _mm_andnot_ps(SIMD::SIGN, mat[0]);
846  __m128 tmp = _mm_andnot_ps(SIMD::SIGN, mat[1]);
847  sum = _mm_add_ps(sum, tmp);
848  tmp = _mm_andnot_ps(SIMD::SIGN, mat[2]);
849  sum = _mm_add_ps(sum, tmp);
850  tmp = _mm_andnot_ps(SIMD::SIGN, mat[3]);
851  sum = _mm_add_ps(sum, tmp);
852  __m128 norm = SIMD::Dot(sum, SIMD::PPPP);
853  return norm;
854 }
855 
856 inline __m128 SIMD::L2Norm(__m128 const* mat)
857 {
858  __m128 sum = _mm_mul_ps(mat[0], mat[0]);
859  __m128 tmp = _mm_mul_ps(mat[1], mat[1]);
860  sum = _mm_add_ps(sum, tmp);
861  tmp = _mm_mul_ps(mat[2], mat[2]);
862  sum = _mm_add_ps(sum, tmp);
863  tmp = _mm_mul_ps(mat[3], mat[3]);
864  sum = _mm_add_ps(sum, tmp);
865  __m128 norm = SIMD::Dot(sum, SIMD::PPPP);
866  return norm;
867 }
868 
869 inline __m128 SIMD::LInfinityNorm(__m128 const* mat)
870 {
871  __m128 max = SIMD::MaximumAbsoluteComponent(mat[0]);
872  __m128 tmp = SIMD::MaximumAbsoluteComponent(mat[1]);
873  max = _mm_max_ps(max, tmp);
874  tmp = SIMD::MaximumAbsoluteComponent(mat[2]);
875  max = _mm_max_ps(max, tmp);
876  tmp = SIMD::MaximumAbsoluteComponent(mat[3]);
877  max = _mm_max_ps(max, tmp);
878  return max;
879 }
880 
881 
882 
883 // Matrix-matrix products.
884 
885 inline void SIMD::MultiplyAB(__m128 const* A, __m128 const* B, __m128* AB)
886 {
887  __m128 t0, t1, t2, t3;
888 
889 #if defined(GTE_USE_ROW_MAJOR)
890  t0 = _mm_shuffle_ps(A[0], A[0], _MM_SHUFFLE(0, 0, 0, 0));
891  t1 = _mm_shuffle_ps(A[0], A[0], _MM_SHUFFLE(1, 1, 1, 1));
892  t2 = _mm_shuffle_ps(A[0], A[0], _MM_SHUFFLE(2, 2, 2, 2));
893  t3 = _mm_shuffle_ps(A[0], A[0], _MM_SHUFFLE(3, 3, 3, 3));
894  t0 = _mm_mul_ps(t0, B[0]);
895  t1 = _mm_mul_ps(t1, B[1]);
896  t2 = _mm_mul_ps(t2, B[2]);
897  t3 = _mm_mul_ps(t3, B[3]);
898  t0 = _mm_add_ps(t0, t1);
899  t2 = _mm_add_ps(t2, t3);
900  AB[0] = _mm_add_ps(t0, t2);
901 
902  t0 = _mm_shuffle_ps(A[1], A[1], _MM_SHUFFLE(0, 0, 0, 0));
903  t1 = _mm_shuffle_ps(A[1], A[1], _MM_SHUFFLE(1, 1, 1, 1));
904  t2 = _mm_shuffle_ps(A[1], A[1], _MM_SHUFFLE(2, 2, 2, 2));
905  t3 = _mm_shuffle_ps(A[1], A[1], _MM_SHUFFLE(3, 3, 3, 3));
906  t0 = _mm_mul_ps(t0, B[0]);
907  t1 = _mm_mul_ps(t1, B[1]);
908  t2 = _mm_mul_ps(t2, B[2]);
909  t3 = _mm_mul_ps(t3, B[3]);
910  t0 = _mm_add_ps(t0, t1);
911  t2 = _mm_add_ps(t2, t3);
912  AB[1] = _mm_add_ps(t0, t2);
913 
914  t0 = _mm_shuffle_ps(A[2], A[2], _MM_SHUFFLE(0, 0, 0, 0));
915  t1 = _mm_shuffle_ps(A[2], A[2], _MM_SHUFFLE(1, 1, 1, 1));
916  t2 = _mm_shuffle_ps(A[2], A[2], _MM_SHUFFLE(2, 2, 2, 2));
917  t3 = _mm_shuffle_ps(A[2], A[2], _MM_SHUFFLE(3, 3, 3, 3));
918  t0 = _mm_mul_ps(t0, B[0]);
919  t1 = _mm_mul_ps(t1, B[1]);
920  t2 = _mm_mul_ps(t2, B[2]);
921  t3 = _mm_mul_ps(t3, B[3]);
922  t0 = _mm_add_ps(t0, t1);
923  t2 = _mm_add_ps(t2, t3);
924  AB[2] = _mm_add_ps(t0, t2);
925 
926  t0 = _mm_shuffle_ps(A[3], A[3], _MM_SHUFFLE(0, 0, 0, 0));
927  t1 = _mm_shuffle_ps(A[3], A[3], _MM_SHUFFLE(1, 1, 1, 1));
928  t2 = _mm_shuffle_ps(A[3], A[3], _MM_SHUFFLE(2, 2, 2, 2));
929  t3 = _mm_shuffle_ps(A[3], A[3], _MM_SHUFFLE(3, 3, 3, 3));
930  t0 = _mm_mul_ps(t0, B[0]);
931  t1 = _mm_mul_ps(t1, B[1]);
932  t2 = _mm_mul_ps(t2, B[2]);
933  t3 = _mm_mul_ps(t3, B[3]);
934  t0 = _mm_add_ps(t0, t1);
935  t2 = _mm_add_ps(t2, t3);
936  AB[3] = _mm_add_ps(t0, t2);
937 #else
938  t0 = _mm_shuffle_ps(B[0], B[0], _MM_SHUFFLE(0, 0, 0, 0));
939  t1 = _mm_shuffle_ps(B[0], B[0], _MM_SHUFFLE(1, 1, 1, 1));
940  t2 = _mm_shuffle_ps(B[0], B[0], _MM_SHUFFLE(2, 2, 2, 2));
941  t3 = _mm_shuffle_ps(B[0], B[0], _MM_SHUFFLE(3, 3, 3, 3));
942  t0 = _mm_mul_ps(t0, A[0]);
943  t1 = _mm_mul_ps(t1, A[1]);
944  t2 = _mm_mul_ps(t2, A[2]);
945  t3 = _mm_mul_ps(t3, A[3]);
946  t0 = _mm_add_ps(t0, t1);
947  t2 = _mm_add_ps(t2, t3);
948  AB[0] = _mm_add_ps(t0, t2);
949 
950  t0 = _mm_shuffle_ps(B[1], B[1], _MM_SHUFFLE(0, 0, 0, 0));
951  t1 = _mm_shuffle_ps(B[1], B[1], _MM_SHUFFLE(1, 1, 1, 1));
952  t2 = _mm_shuffle_ps(B[1], B[1], _MM_SHUFFLE(2, 2, 2, 2));
953  t3 = _mm_shuffle_ps(B[1], B[1], _MM_SHUFFLE(3, 3, 3, 3));
954  t0 = _mm_mul_ps(t0, A[0]);
955  t1 = _mm_mul_ps(t1, A[1]);
956  t2 = _mm_mul_ps(t2, A[2]);
957  t3 = _mm_mul_ps(t3, A[3]);
958  t0 = _mm_add_ps(t0, t1);
959  t2 = _mm_add_ps(t2, t3);
960  AB[1] = _mm_add_ps(t0, t2);
961 
962  t0 = _mm_shuffle_ps(B[2], B[2], _MM_SHUFFLE(0, 0, 0, 0));
963  t1 = _mm_shuffle_ps(B[2], B[2], _MM_SHUFFLE(1, 1, 1, 1));
964  t2 = _mm_shuffle_ps(B[2], B[2], _MM_SHUFFLE(2, 2, 2, 2));
965  t3 = _mm_shuffle_ps(B[2], B[2], _MM_SHUFFLE(3, 3, 3, 3));
966  t0 = _mm_mul_ps(t0, A[0]);
967  t1 = _mm_mul_ps(t1, A[1]);
968  t2 = _mm_mul_ps(t2, A[2]);
969  t3 = _mm_mul_ps(t3, A[3]);
970  t0 = _mm_add_ps(t0, t1);
971  t2 = _mm_add_ps(t2, t3);
972  AB[2] = _mm_add_ps(t0, t2);
973 
974  t0 = _mm_shuffle_ps(B[3], B[3], _MM_SHUFFLE(0, 0, 0, 0));
975  t1 = _mm_shuffle_ps(B[3], B[3], _MM_SHUFFLE(1, 1, 1, 1));
976  t2 = _mm_shuffle_ps(B[3], B[3], _MM_SHUFFLE(2, 2, 2, 2));
977  t3 = _mm_shuffle_ps(B[3], B[3], _MM_SHUFFLE(3, 3, 3, 3));
978  t0 = _mm_mul_ps(t0, A[0]);
979  t1 = _mm_mul_ps(t1, A[1]);
980  t2 = _mm_mul_ps(t2, A[2]);
981  t3 = _mm_mul_ps(t3, A[3]);
982  t0 = _mm_add_ps(t0, t1);
983  t2 = _mm_add_ps(t2, t3);
984  AB[3] = _mm_add_ps(t0, t2);
985 #endif
986 }
987 
988 inline void SIMD::MultiplyATB(__m128 const* A, __m128 const* B, __m128* ATB)
989 {
990  __m128 ATrn[4];
991  Transpose(A, ATrn);
992  MultiplyAB(ATrn, B, ATB);
993 }
994 
995 inline void SIMD::MultiplyABT(__m128 const* A, __m128 const* B, __m128* ABT)
996 {
997  __m128 BTrn[4];
998  Transpose(B, BTrn);
999  MultiplyAB(A, BTrn, ABT);
1000 }
1001 
1002 inline void SIMD::MultiplyATBT(__m128 const* A, __m128 const* B, __m128* ATBT)
1003 {
1004  __m128 BA[4];
1005  MultiplyAB(B, A, BA);
1006  Transpose(BA, ATBT);
1007 }
1008 
1009 inline void SIMD::MultiplyDM(__m128 const D, __m128 const* M, __m128* DM)
1010 {
1011 #if defined(GTE_USE_ROW_MAJOR)
1012  DM[0] = _mm_mul_ps(D, M[0]);
1013  DM[1] = _mm_mul_ps(D, M[1]);
1014  DM[2] = _mm_mul_ps(D, M[2]);
1015  DM[3] = _mm_mul_ps(D, M[3]);
1016 #else
1017  __m128 d0 = _mm_shuffle_ps(D, D, _MM_SHUFFLE(0, 0, 0, 0));
1018  __m128 d1 = _mm_shuffle_ps(D, D, _MM_SHUFFLE(1, 1, 1, 1));
1019  __m128 d2 = _mm_shuffle_ps(D, D, _MM_SHUFFLE(2, 2, 2, 2));
1020  __m128 d3 = _mm_shuffle_ps(D, D, _MM_SHUFFLE(3, 3, 3, 3));
1021  DM[0] = _mm_mul_ps(d0, M[0]);
1022  DM[1] = _mm_mul_ps(d1, M[1]);
1023  DM[2] = _mm_mul_ps(d2, M[2]);
1024  DM[3] = _mm_mul_ps(d3, M[3]);
1025 #endif
1026 }
1027 
1028 inline void SIMD::MultiplyMD(__m128 const* M, __m128 const D, __m128* MD)
1029 {
1030 #if defined(GTE_USE_ROW_MAJOR)
1031  __m128 d0 = _mm_shuffle_ps(D, D, _MM_SHUFFLE(0, 0, 0, 0));
1032  __m128 d1 = _mm_shuffle_ps(D, D, _MM_SHUFFLE(1, 1, 1, 1));
1033  __m128 d2 = _mm_shuffle_ps(D, D, _MM_SHUFFLE(2, 2, 2, 2));
1034  __m128 d3 = _mm_shuffle_ps(D, D, _MM_SHUFFLE(3, 3, 3, 3));
1035  MD[0] = _mm_mul_ps(M[0], d0);
1036  MD[1] = _mm_mul_ps(M[1], d1);
1037  MD[2] = _mm_mul_ps(M[2], d2);
1038  MD[3] = _mm_mul_ps(M[3], d3);
1039 #else
1040  MD[0] = _mm_mul_ps(M[0], D);
1041  MD[1] = _mm_mul_ps(M[1], D);
1042  MD[2] = _mm_mul_ps(M[2], D);
1043  MD[3] = _mm_mul_ps(M[3], D);
1044 #endif
1045 }
1046 
1047 
1048 
1049 // Matrix-vector products.
1050 
1051 inline __m128 SIMD::MultiplyMV(__m128 const* M, __m128 const V)
1052 {
1053 #if defined(GTE_USE_ROW_MAJOR)
1054  __m128 MTrn[4];
1055  Transpose(M, MTrn);
1056  return MultiplyVM(V, MTrn);
1057 #else
1058  // u0 = m00*v0 + m01*v1 + m02*v2 + m03*v3
1059  // u1 = m10*v0 + m11*v1 + m12*v2 + m13*v3
1060  // u2 = m20*v0 + m21*v1 + m22*v2 + m23*v3
1061  // u3 = m30*v0 + m31*v1 + m32*v2 + m33*v3
1062 
1063  // (v0, v0, v0, v0)
1064  __m128 t0 = _mm_shuffle_ps(V, V, _MM_SHUFFLE(0, 0, 0, 0));
1065  // (v1, v1, v1, v1)
1066  __m128 t1 = _mm_shuffle_ps(V, V, _MM_SHUFFLE(1, 1, 1, 1));
1067  // (v2, v2, v2, v2)
1068  __m128 t2 = _mm_shuffle_ps(V, V, _MM_SHUFFLE(2, 2, 2, 2));
1069  // (v3, v3, v3, v3)
1070  __m128 t3 = _mm_shuffle_ps(V, V, _MM_SHUFFLE(3, 3, 3, 3));
1071 
1072  // (m00*v0, m10*v0, m20*v0, m30*v0)
1073  t0 = _mm_mul_ps(M[0], t0);
1074  // (m01*v1, m11*v1, m21*v1, m31*v1)
1075  t1 = _mm_mul_ps(M[1], t1);
1076  // (m02*v2, m12*v2, m22*v2, m32*v2)
1077  t2 = _mm_mul_ps(M[2], t2);
1078  // (m03*v3, m13*v3, m23*v3, m33*v3)
1079  t3 = _mm_mul_ps(M[3], t3);
1080 
1081  // (m00*v0+m01*v1, m10*v0+m11*v1, m20*v0+m21*v1, m30*v0+m31*v1)
1082  t0 = _mm_add_ps(t0, t1);
1083  // (m02*v2+m03*v3, m12*v2+m13*v3, m22*v2+m23*v3, m32*v2+m33*v3)
1084  t2 = _mm_add_ps(t2, t3);
1085 
1086  // M*V
1087  t0 = _mm_add_ps(t0, t2);
1088  return t0;
1089 #endif
1090 }
1091 
1092 inline __m128 SIMD::MultiplyVM(__m128 const V, __m128 const* M)
1093 {
1094 #if defined(GTE_USE_ROW_MAJOR)
1095  // u0 = v0*m00 + v1*m10 + v2*m20 + v3*m30
1096  // u1 = v0*m01 + v1*m11 + v2*m21 + v3*m31
1097  // u2 = v0*m02 + v1*m12 + v2*m22 + v3*m32
1098  // u3 = v0*m03 + v1*m13 + v2*m23 + v3*m33
1099 
1100  // (v0, v0, v0, v0)
1101  __m128 t0 = _mm_shuffle_ps(V, V, _MM_SHUFFLE(0, 0, 0, 0));
1102  // (v1, v1, v1, v1)
1103  __m128 t1 = _mm_shuffle_ps(V, V, _MM_SHUFFLE(1, 1, 1, 1));
1104  // (v2, v2, v2, v2)
1105  __m128 t2 = _mm_shuffle_ps(V, V, _MM_SHUFFLE(2, 2, 2, 2));
1106  // (v3, v3, v3, v3)
1107  __m128 t3 = _mm_shuffle_ps(V, V, _MM_SHUFFLE(3, 3, 3, 3));
1108 
1109  // (v0*m00, v0*m01, v0*m02, v0*m03)
1110  t0 = _mm_mul_ps(t0, M[0]);
1111  // (v1*m10, v1*m11, v1*m12, v1*m13)
1112  t1 = _mm_mul_ps(t1, M[1]);
1113  // (v2*m20, v2*m21, v2*m22, v2*m23)
1114  t2 = _mm_mul_ps(t2, M[2]);
1115  // (v3*m30, v3*m31, v3*m32, v3*m33)
1116  t3 = _mm_mul_ps(t3, M[3]);
1117 
1118  // (v0*m00+v1*m10, v0*m01+v1*m11, v0*m02+v1*m12, v0*m03+v1*m13)
1119  t0 = _mm_add_ps(t0, t1);
1120  // (v2*m20+v3*m30, v2*m21+v3*m31, v2*m22+v3*m32, v2*m23+v3*m33)
1121  t2 = _mm_add_ps(t2, t3);
1122 
1123  // V*M
1124  t0 = _mm_add_ps(t0, t2);
1125  return t0;
1126 #else
1127  __m128 MTrn[4];
1128  Transpose(M, MTrn);
1129  return MultiplyMV(MTrn, V);
1130 #endif
1131 }
1132 
1133 
1134 
1135 // Quaternion support.
1136 
1137 inline __m128 SIMD::QMultiply(__m128 const q0, __m128 const q1)
1138 {
1139  // (x0*i + y0*j + z0*k + w0)*(x1*i + y1*j + z1*k + w1)
1140  // =
1141  // i*(+x0*w1 + y0*z1 - z0*y1 + w0*x1) +
1142  // j*(-x0*z1 + y0*w1 + z0*x1 + w0*y1) +
1143  // k*(+x0*y1 - y0*x1 + z0*w1 + w0*z1) +
1144  // 1*(-x0*x1 - y0*y1 - z0*z1 + w0*w1)
1145 
1146  __m128 product;
1147  {
1148  // (x0, x0, x0, x0)
1149  __m128 t0 = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(0, 0, 0, 0));
1150  // (w1, z1, y1, x1)
1151  __m128 t1 = _mm_shuffle_ps(q1, q1, _MM_SHUFFLE(0, 1, 2, 3));
1152  // (+w1, -z1, +y1, -x1)
1153  t1 = _mm_mul_ps(SIMD::PMPM, t1);
1154  // (+x0*w1, -x0*z1, +x0*y1, -x0*x1)
1155  product = _mm_mul_ps(t0, t1);
1156  }
1157  {
1158  // (y0, y0, y0, y0)
1159  __m128 t0 = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(1, 1, 1, 1));
1160  // (z1, w1, x1, y1)
1161  __m128 t1 = _mm_shuffle_ps(q1, q1, _MM_SHUFFLE(1, 0, 3, 2));
1162  // (+z1, +w1, -x1, -y1)
1163  t1 = _mm_mul_ps(SIMD::PPMM, t1);
1164  // product += (+y0*z1, +y0*w1, -y0*x1, -y0*y1)
1165  t1 = _mm_mul_ps(t0, t1);
1166  product = _mm_add_ps(product, t1);
1167  }
1168  {
1169  // (z0, z0, z0, z0)
1170  __m128 t0 = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(2, 2, 2, 2));
1171  // (y1, x1, w1, z1)
1172  __m128 t1 = _mm_shuffle_ps(q1, q1, _MM_SHUFFLE(2, 3, 0, 1));
1173  // (-y1, +x1, +w1, -z1)
1174  t1 = _mm_mul_ps(SIMD::MPPM, t1);
1175  // product += (-z0*y1, +z0*x1, +z0*w1, -z0*z1)
1176  t1 = _mm_mul_ps(t0, t1);
1177  product = _mm_add_ps(product, t1);
1178  }
1179  {
1180  // (w0, w0, w0, w0)
1181  __m128 t0 = _mm_shuffle_ps(q0, q0, _MM_SHUFFLE(3, 3, 3, 3));
1182  // (+w0*x1, +w0*y1, +w0*z1, +w0*w1)
1183  __m128 t1 = _mm_mul_ps(t0, q1);
1184  // product += (+w0*x1, +w0*y1, +w0*z1, +w0*w1)
1185  product = _mm_add_ps(product, t1);
1186  }
1187  return product;
1188 }
1189 
1190 inline __m128 SIMD::QConjugate(__m128 const q)
1191 {
1192  __m128 conjugate = _mm_mul_ps(SIMD::MMMP, q);
1193  return conjugate;
1194 }
1195 
1196 inline __m128 SIMD::QInverse(__m128 const q)
1197 {
1198  // (-x, -y, -z, +w)
1199  __m128 conjugate = _mm_mul_ps(SIMD::MMMP, q);
1200 
1201  // (sqrlen, sqrlen, sqrlen, sqrlen)
1202  __m128 sqrlen = SIMD::Dot(conjugate, conjugate);
1203 
1204  // Divide by the squared length. This is potentially a divide by
1205  // zero or a divide by infinity.
1206  __m128 inverse = _mm_div_ps(conjugate, sqrlen);
1207 
1208  // Set to zero when the squared length is zero.
1209  __m128 mask = _mm_cmpneq_ps(SIMD::ZZZZ, sqrlen);
1210  inverse = _mm_and_ps(mask, inverse);
1211  return inverse;
1212 }
1213 
1214 inline __m128 SIMD::QSlerp(__m128 const t, __m128 const q0, __m128 const q1)
1215 {
1216  float const onePlusMuFPU = 1.90110745351730037f;
1217 
1218  __m128 cs = Dot(q0, q1);
1219  __m128 negative = _mm_cmplt_ps(cs, ZZZZ);
1220  __m128 term0 = _mm_and_ps(negative, MMMM);
1221  __m128 term1 = _mm_andnot_ps(negative, PPPP);
1222  __m128 sign = _mm_or_ps(term0, term1);
1223  cs = _mm_mul_ps(cs, sign);
1224 
1225  __m128 csm1 = _mm_sub_ps(cs, PPPP);
1226  __m128 omt = _mm_sub_ps(PPPP, t);
1227  // (1-t,1-t,t,t)
1228  __m128 temp = _mm_shuffle_ps(omt, t, _MM_SHUFFLE(0, 0, 0, 0));
1229  // (1-t,t,0,0)
1230  __m128 coeff = _mm_shuffle_ps(temp, ZZZZ, _MM_SHUFFLE(0, 0, 2, 0));
1231  __m128 u = coeff;
1232  __m128 sqr = _mm_mul_ps(coeff, u);
1233 
1234  __m128 avalue = _mm_set1_ps(1.0f / (1.0f*3.0f));
1235  __m128 bvalue = _mm_set1_ps(1.0f / 3.0f);
1236  temp = _mm_mul_ps(avalue, sqr);
1237  temp = _mm_sub_ps(temp, bvalue);
1238  temp = _mm_mul_ps(temp, csm1);
1239  coeff = _mm_mul_ps(coeff, temp);
1240  u = _mm_add_ps(u, coeff);
1241 
1242  avalue = _mm_set1_ps(1.0f / (2.0f*5.0f));
1243  bvalue = _mm_set1_ps(2.0f / 5.0f);
1244  temp = _mm_mul_ps(avalue, sqr);
1245  temp = _mm_sub_ps(temp, bvalue);
1246  temp = _mm_mul_ps(temp, csm1);
1247  coeff = _mm_mul_ps(coeff, temp);
1248  u = _mm_add_ps(u, coeff);
1249 
1250  avalue = _mm_set1_ps(1.0f / (3.0f*7.0f));
1251  bvalue = _mm_set1_ps(3.0f / 7.0f);
1252  temp = _mm_mul_ps(avalue, sqr);
1253  temp = _mm_sub_ps(temp, bvalue);
1254  temp = _mm_mul_ps(temp, csm1);
1255  coeff = _mm_mul_ps(coeff, temp);
1256  u = _mm_add_ps(u, coeff);
1257 
1258  avalue = _mm_set1_ps(1.0f / (4.0f*9.0f));
1259  bvalue = _mm_set1_ps(4.0f / 9.0f);
1260  temp = _mm_mul_ps(avalue, sqr);
1261  temp = _mm_sub_ps(temp, bvalue);
1262  temp = _mm_mul_ps(temp, csm1);
1263  coeff = _mm_mul_ps(coeff, temp);
1264  u = _mm_add_ps(u, coeff);
1265 
1266  avalue = _mm_set1_ps(1.0f / (5.0f*11.0f));
1267  bvalue = _mm_set1_ps(5.0f / 11.0f);
1268  temp = _mm_mul_ps(avalue, sqr);
1269  temp = _mm_sub_ps(temp, bvalue);
1270  temp = _mm_mul_ps(temp, csm1);
1271  coeff = _mm_mul_ps(coeff, temp);
1272  u = _mm_add_ps(u, coeff);
1273 
1274  avalue = _mm_set1_ps(1.0f / (6.0f*13.0f));
1275  bvalue = _mm_set1_ps(6.0f / 13.0f);
1276  temp = _mm_mul_ps(avalue, sqr);
1277  temp = _mm_sub_ps(temp, bvalue);
1278  temp = _mm_mul_ps(temp, csm1);
1279  coeff = _mm_mul_ps(coeff, temp);
1280  u = _mm_add_ps(u, coeff);
1281 
1282  avalue = _mm_set1_ps(1.0f / (7.0f*15.0f));
1283  bvalue = _mm_set1_ps(7.0f / 15.0f);
1284  temp = _mm_mul_ps(avalue, sqr);
1285  temp = _mm_sub_ps(temp, bvalue);
1286  temp = _mm_mul_ps(temp, csm1);
1287  coeff = _mm_mul_ps(coeff, temp);
1288  u = _mm_add_ps(u, coeff);
1289 
1290  avalue = _mm_set1_ps(1.0f / (8.0f*17.0f));
1291  bvalue = _mm_set1_ps(8.0f / 17.0f);
1292  temp = _mm_mul_ps(avalue, sqr);
1293  temp = _mm_sub_ps(temp, bvalue);
1294  temp = _mm_mul_ps(temp, csm1);
1295  coeff = _mm_mul_ps(coeff, temp);
1296  u = _mm_add_ps(u, coeff);
1297 
1298  avalue = _mm_set1_ps(onePlusMuFPU*1.0f / (9.0f*19.0f));
1299  bvalue = _mm_set1_ps(onePlusMuFPU*9.0f / 19.0f);
1300  temp = _mm_mul_ps(avalue, sqr);
1301  temp = _mm_sub_ps(temp, bvalue);
1302  temp = _mm_mul_ps(temp, csm1);
1303  coeff = _mm_mul_ps(coeff, temp);
1304  u = _mm_add_ps(u, coeff);
1305 
1306  term0 = _mm_shuffle_ps(u, u, _MM_SHUFFLE(0, 0, 0, 0));
1307  term1 = _mm_shuffle_ps(u, u, _MM_SHUFFLE(1, 1, 1, 1));
1308  term0 = _mm_mul_ps(term0, q0);
1309  term1 = _mm_mul_ps(term1, q1);
1310  term1 = _mm_mul_ps(term1, sign);
1311  __m128 slerp = _mm_add_ps(term0, term1);
1312  return slerp;
1313 }
1314 
1315 
1316 
1317 // Function evaluations (generally slow, function call per component).
1318 
1319 inline __m128 SIMD::Sin(__m128 const v)
1320 {
1321  __m128 result;
1322  result.m128_f32[0] = sin(v.m128_f32[0]);
1323  result.m128_f32[1] = sin(v.m128_f32[1]);
1324  result.m128_f32[2] = sin(v.m128_f32[2]);
1325  result.m128_f32[3] = sin(v.m128_f32[3]);
1326  return result;
1327 }
1328 
1329 inline __m128 SIMD::Cos(__m128 const v)
1330 {
1331  __m128 result;
1332  result.m128_f32[0] = cos(v.m128_f32[0]);
1333  result.m128_f32[1] = cos(v.m128_f32[1]);
1334  result.m128_f32[2] = cos(v.m128_f32[2]);
1335  result.m128_f32[3] = cos(v.m128_f32[3]);
1336  return result;
1337 }
1338 
1339 inline __m128 SIMD::Tan(__m128 const v)
1340 {
1341  __m128 result;
1342  result.m128_f32[0] = tan(v.m128_f32[0]);
1343  result.m128_f32[1] = tan(v.m128_f32[1]);
1344  result.m128_f32[2] = tan(v.m128_f32[2]);
1345  result.m128_f32[3] = tan(v.m128_f32[3]);
1346  return result;
1347 }
1348 
1349 inline __m128 SIMD::ASin(__m128 const v)
1350 {
1351  __m128 result;
1352  result.m128_f32[0] = asin(v.m128_f32[0]);
1353  result.m128_f32[1] = asin(v.m128_f32[1]);
1354  result.m128_f32[2] = asin(v.m128_f32[2]);
1355  result.m128_f32[3] = asin(v.m128_f32[3]);
1356  return result;
1357 }
1358 
1359 inline __m128 SIMD::ACos(__m128 const v)
1360 {
1361  __m128 result;
1362  result.m128_f32[0] = acos(v.m128_f32[0]);
1363  result.m128_f32[1] = acos(v.m128_f32[1]);
1364  result.m128_f32[2] = acos(v.m128_f32[2]);
1365  result.m128_f32[3] = acos(v.m128_f32[3]);
1366  return result;
1367 }
1368 
1369 inline __m128 SIMD::ATan(__m128 const v)
1370 {
1371  __m128 result;
1372  result.m128_f32[0] = atan(v.m128_f32[0]);
1373  result.m128_f32[1] = atan(v.m128_f32[1]);
1374  result.m128_f32[2] = atan(v.m128_f32[2]);
1375  result.m128_f32[3] = atan(v.m128_f32[3]);
1376  return result;
1377 }
1378 
1379 
1380 
1381 // Fast function approximations.
1382 
1383 inline void SIMD::ReduceAnglesSin(__m128 const x, __m128& y)
1384 {
1385  // Map x to y in [-pi,pi], x = 2*pi*quotient + remainder.
1386  __m128 quotient = _mm_mul_ps(x, INV_TWO_PI);
1387  quotient = Round(quotient);
1388  y = _mm_mul_ps(quotient, TWO_PI);
1389  y = _mm_sub_ps(x, y);
1390 
1391  // Map y to [-pi/2,pi/2] with sin(y) = sin(x).
1392  __m128 sign = _mm_and_ps(x, SIGN);
1393  __m128 c = _mm_or_ps(PI, sign); // pi when x >= 0, -pi when x < 0
1394  __m128 absx = _mm_andnot_ps(sign, x); // |x|
1395  __m128 rflx = _mm_sub_ps(c, x);
1396  __m128 comp = _mm_cmple_ps(absx, HALF_PI);
1397  __m128 select0 = _mm_and_ps(comp, x);
1398  __m128 select1 = _mm_andnot_ps(comp, rflx);
1399  y = _mm_or_ps(select0, select1);
1400 }
1401 
1402 inline __m128 SIMD::SinApprDeg11(__m128 const x)
1403 {
1404  __m128 xsqr = _mm_mul_ps(x, x);
1405  __m128 poly = _mm_mul_ps(C_SIN_APPR_DEG11_5, xsqr);
1406  poly = _mm_add_ps(poly, C_SIN_APPR_DEG11_4);
1407  poly = _mm_mul_ps(poly, xsqr);
1408  poly = _mm_add_ps(poly, C_SIN_APPR_DEG11_3);
1409  poly = _mm_mul_ps(poly, xsqr);
1410  poly = _mm_add_ps(poly, C_SIN_APPR_DEG11_2);
1411  poly = _mm_mul_ps(poly, xsqr);
1412  poly = _mm_add_ps(poly, C_SIN_APPR_DEG11_1);
1413  poly = _mm_mul_ps(poly, xsqr);
1414  poly = _mm_add_ps(poly, C_SIN_APPR_DEG11_0);
1415  poly = _mm_mul_ps(poly, x);
1416  return poly;
1417 }
1418 
1419 inline __m128 SIMD::SinApprDeg7(__m128 const x)
1420 {
1421  __m128 xsqr = _mm_mul_ps(x, x);
1422  __m128 poly = _mm_mul_ps(C_SIN_APPR_DEG7_3, xsqr);
1423  poly = _mm_add_ps(poly, C_SIN_APPR_DEG7_2);
1424  poly = _mm_mul_ps(poly, xsqr);
1425  poly = _mm_add_ps(poly, C_SIN_APPR_DEG7_1);
1426  poly = _mm_mul_ps(poly, xsqr);
1427  poly = _mm_add_ps(poly, C_SIN_APPR_DEG7_0);
1428  poly = _mm_mul_ps(poly, x);
1429  return poly;
1430 }
1431 
1432 inline void SIMD::ReduceAnglesCos(__m128 const x, __m128& y, __m128& sign)
1433 {
1434  // Map x to y in [-pi,pi], x = 2*pi*quotient + remainder.
1435  __m128 quotient = _mm_mul_ps(x, INV_TWO_PI);
1436  quotient = Round(quotient);
1437  y = _mm_mul_ps(quotient, TWO_PI);
1438  y = _mm_sub_ps(x, y);
1439 
1440  // Map x to y in [-pi/2,pi/2] with cos(y) = sign*cos(x).
1441  sign = _mm_and_ps(x, SIGN);
1442  __m128 c = _mm_or_ps(PI, sign); // pi when x >= 0, -pi when x < 0
1443  __m128 absx = _mm_andnot_ps(sign, x); // |x|
1444  __m128 rflx = _mm_sub_ps(c, x);
1445  __m128 comp = _mm_cmple_ps(absx, HALF_PI);
1446  __m128 select0 = _mm_and_ps(comp, x);
1447  __m128 select1 = _mm_andnot_ps(comp, rflx);
1448  y = _mm_or_ps(select0, select1);
1449  select0 = _mm_and_ps(comp, PPPP);
1450  select1 = _mm_andnot_ps(comp, MMMM);
1451  sign = _mm_or_ps(select0, select1);
1452 }
1453 
1454 inline __m128 SIMD::CosApprDeg10(__m128 const x, __m128 const sign)
1455 {
1456  __m128 xsqr = _mm_mul_ps(x, x);
1457  __m128 poly = _mm_mul_ps(C_COS_APPR_DEG10_5, xsqr);
1458  poly = _mm_add_ps(poly, C_COS_APPR_DEG10_4);
1459  poly = _mm_mul_ps(poly, xsqr);
1460  poly = _mm_add_ps(poly, C_COS_APPR_DEG10_3);
1461  poly = _mm_mul_ps(poly, xsqr);
1462  poly = _mm_add_ps(poly, C_COS_APPR_DEG10_2);
1463  poly = _mm_mul_ps(poly, xsqr);
1464  poly = _mm_add_ps(poly, C_COS_APPR_DEG10_1);
1465  poly = _mm_mul_ps(poly, xsqr);
1466  poly = _mm_add_ps(poly, C_COS_APPR_DEG10_0);
1467  poly = _mm_mul_ps(poly, sign);
1468  return poly;
1469 }
1470 
1471 inline __m128 SIMD::CosApprDeg6(__m128 const x, __m128 const sign)
1472 {
1473  __m128 xsqr = _mm_mul_ps(x, x);
1474  __m128 poly = _mm_mul_ps(C_COS_APPR_DEG6_3, xsqr);
1475  poly = _mm_add_ps(poly, C_COS_APPR_DEG6_2);
1476  poly = _mm_mul_ps(poly, xsqr);
1477  poly = _mm_add_ps(poly, C_COS_APPR_DEG6_1);
1478  poly = _mm_mul_ps(poly, xsqr);
1479  poly = _mm_add_ps(poly, C_COS_APPR_DEG6_0);
1480  poly = _mm_mul_ps(poly, sign);
1481  return poly;
1482 }
1483 
// Compute the adjugate matrix (transpose of the cofactor matrix) and/or
// the determinant of the 4x4 matrix stored in 'mat' as four __m128 slices.
// Either output pointer may be null, in which case that quantity is not
// computed. When only the determinant is requested, a cheaper path using
// the a- and b-coefficients is taken. The determinant is replicated in
// all four lanes of *det.
inline void SIMD::GetAdjDet(__m128 const* mat, __m128* adj, __m128* det)
{
    // [GTE_USE_MAT_VEC]
    // a0 = m00*m11 - m01*m10, b0 = m20*m31 - m21*m30
    // a1 = m00*m12 - m02*m10, b1 = m20*m32 - m22*m30
    // a2 = m00*m13 - m03*m10, b2 = m20*m33 - m23*m30
    // a3 = m01*m12 - m02*m11, b3 = m21*m32 - m22*m31
    // a4 = m01*m13 - m03*m11, b4 = m21*m33 - m23*m31
    // a5 = m02*m13 - m03*m12, b5 = m22*m33 - m23*m32
    // +c00 = m11*b5 - m12*b4 + m13*b3
    // -c10 = m10*b5 - m12*b2 + m13*b1
    // +c20 = m10*b4 - m11*b2 + m13*b0
    // -c30 = m10*b3 - m11*b1 + m12*b0
    // -c01 = m01*b5 - m02*b4 + m03*b3
    // +c11 = m00*b5 - m02*b2 + m03*b1
    // -c21 = m00*b4 - m01*b2 + m03*b0
    // +c31 = m00*b3 - m01*b1 + m02*b0
    // +c02 = m31*a5 - m32*a4 + m33*a3
    // -c12 = m30*a5 - m32*a2 + m33*a1
    // +c22 = m30*a4 - m31*a2 + m33*a0
    // -c32 = m30*a3 - m31*a1 + m32*a0
    // -c03 = m21*a5 - m22*a4 + m23*a3
    // +c13 = m20*a5 - m22*a2 + m23*a1
    // -c23 = m20*a4 - m21*a2 + m23*a0
    // +c33 = m20*a3 - m21*a1 + m22*a0
    //
    // [GTE_USE_VEC_MAT]
    // a0 = m00*m11 - m01*m10, b0 = m02*m13 - m03*m12
    // a1 = m00*m21 - m01*m20, b1 = m02*m23 - m03*m22
    // a2 = m00*m31 - m01*m30, b2 = m02*m33 - m03*m32
    // a3 = m10*m21 - m11*m20, b3 = m12*m23 - m13*m22
    // a4 = m10*m31 - m11*m30, b4 = m12*m33 - m13*m32
    // a5 = m20*m31 - m21*m30, b5 = m22*m33 - m23*m32
    // +c00 = m11*b5 - m21*b4 + m31*b3
    // -c01 = m01*b5 - m21*b2 + m31*b1
    // +c02 = m01*b4 - m11*b2 + m31*b0
    // -c03 = m01*b3 - m11*b1 + m21*b0
    // -c10 = m10*b5 - m20*b4 + m30*b3
    // +c11 = m00*b5 - m20*b2 + m30*b1
    // -c12 = m00*b4 - m10*b2 + m30*b0
    // +c13 = m00*b3 - m10*b1 + m20*b0
    // +c20 = m13*a5 - m23*a4 + m33*a3
    // -c21 = m03*a5 - m23*a2 + m33*a1
    // +c22 = m03*a4 - m13*a2 + m33*a0
    // -c23 = m03*a3 - m13*a1 + m23*a0
    // -c30 = m12*a5 - m22*a4 + m32*a3
    // +c31 = m02*a5 - m22*a2 + m32*a1
    // -c32 = m02*a4 - m12*a2 + m32*a0
    // +c33 = m02*a3 - m12*a1 + m22*a0
    //
    // det = a0*b5 - a1*b4 + a2*b3 + a3*b2 - a4*b1 + a5*b0
    // inverse[i][j] = c[i][j]/det
    __m128 t0, t1;

    // Compute a1, a2, a3, a4.
    __m128 a1a2a3a4;
    {
        // MV:(m00, m00, m01, m01), VM:(m00, m00, m10, m10)
        t0 = _mm_shuffle_ps(mat[0], mat[1], _MM_SHUFFLE(0, 0, 0, 0));
        // MV:(m12, m12, m13, m13), VM:(m21, m21, m31, m31)
        t1 = _mm_shuffle_ps(mat[2], mat[3], _MM_SHUFFLE(1, 1, 1, 1));
        // MV:(m12, m13, m12, m13), VM:(m21, m31, m21, m31)
        t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 0, 2, 0));
        // MV:(m00*m12, m00*m13, m01*m12, m01*m13)
        // VM:(m00*m21, m00*m31, m10*m21, m10*m31)
        a1a2a3a4 = _mm_mul_ps(t0, t1);
        // MV:(m10, m10, m11, m11), VM:(m01, m01, m11, m11)
        t0 = _mm_shuffle_ps(mat[0], mat[1], _MM_SHUFFLE(1, 1, 1, 1));
        // MV:(m02, m02, m03, m03), VM:(m20, m20, m30, m30)
        t1 = _mm_shuffle_ps(mat[2], mat[3], _MM_SHUFFLE(0, 0, 0, 0));
        // MV:(m02, m03, m02, m03), VM:(m20, m30, m20, m30)
        t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 0, 2, 0));
        // MV:(m10*m02, m10*m03, m11*m02, m11*m03)
        // VM:(m01*m20, m01*m30, m11*m20, m11*m30)
        t0 = _mm_mul_ps(t0, t1);
        // MV:(m00*m12-m10*m02,m00*m13-m10*m03,m01*m12-m11*m02,m01*m13-m11*m03)
        // VM:(m00*m21-m01*m20,m00*m31-m01*m30,m10*m21-m11*m20,m10*m31-m11*m30)
        a1a2a3a4 = _mm_sub_ps(a1a2a3a4, t0);
    }

    // Compute b1, b2, b3, b4.
    __m128 b1b2b3b4;
    {
        // MV:(m20, m20, m21, m21), VM:(m02, m02, m12, m12)
        t0 = _mm_shuffle_ps(mat[0], mat[1], _MM_SHUFFLE(2, 2, 2, 2));
        // MV:(m32, m32, m33, m33), VM:(m23, m23, m33, m33)
        t1 = _mm_shuffle_ps(mat[2], mat[3], _MM_SHUFFLE(3, 3, 3, 3));
        // MV:(m32, m33, m32, m33), VM:(m23, m33, m23, m33)
        t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 0, 2, 0));
        // MV:(m20*m32, m20*m33, m21*m32, m21*m33)
        // VM:(m02*m23, m02*m33, m12*m23, m12*m33)
        b1b2b3b4 = _mm_mul_ps(t0, t1);
        // MV:(m30, m30, m31, m31), VM:(m03, m03, m13, m13)
        t0 = _mm_shuffle_ps(mat[0], mat[1], _MM_SHUFFLE(3, 3, 3, 3));
        // MV:(m22, m22, m23, m23), VM:(m22, m22, m32, m32)
        t1 = _mm_shuffle_ps(mat[2], mat[3], _MM_SHUFFLE(2, 2, 2, 2));
        // MV:(m22, m23, m22, m23), VM:(m22, m32, m22, m32)
        t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 0, 2, 0));
        // MV:(m30*m22, m30*m23, m31*m22, m31*m23)
        // VM:(m03*m22, m03*m32, m13*m22, m13*m32)
        t0 = _mm_mul_ps(t0, t1);
        // MV:(m20*m32-m30*m22,m20*m33-m30*m23,m21*m32-m31*m22,m21*m33-m31*m23)
        // VM:(m02*m23-m03*m22,m02*m33-m03*m32,m12*m23-m13*m22,m12*m33-m13*m32)
        b1b2b3b4 = _mm_sub_ps(b1b2b3b4, t0);
    }

    // Compute a0, b0, a5, b5.
    __m128 a0b0a5b5;
    {
        // MV:(m00, m20, m02, m22), VM:(m00, m02, m20, m22)
        t0 = _mm_shuffle_ps(mat[0], mat[2], _MM_SHUFFLE(2, 0, 2, 0));
        // MV:(m11, m31, m13, m33), VM:(m11, m13, m31, m33)
        t1 = _mm_shuffle_ps(mat[1], mat[3], _MM_SHUFFLE(3, 1, 3, 1));
        // MV:(m00*m11, m20*m31, m02*m13, m22*m33)
        // VM:(m00*m11, m02*m13, m20*m31, m22*m33)
        a0b0a5b5 = _mm_mul_ps(t0, t1);
        // MV:(m10, m30, m12, m32), VM:(m01, m03, m21, m23)
        t0 = _mm_shuffle_ps(mat[0], mat[2], _MM_SHUFFLE(3, 1, 3, 1));
        // MV:(m01, m21, m03, m23), VM:(m10, m12, m30, m32)
        t1 = _mm_shuffle_ps(mat[1], mat[3], _MM_SHUFFLE(2, 0, 2, 0));
        // MV:(m10*m01, m30*m21, m12*m03, m32*m23)
        // VM:(m01*m10, m03*m12, m21*m30, m23*m32)
        t0 = _mm_mul_ps(t0, t1);
        // MV:(m00*m11-m10*m01,m20*m31-m30*m21,m02*m13-m12*m03,m22*m33-m32*m23)
        // VM:(m00*m11-m01*m10,m02*m13-m03*m12,m20*m31-m21*m30,m22*m33-m23*m32)
        a0b0a5b5 = _mm_sub_ps(a0b0a5b5, t0);
    }

    if (adj)
    {
        // Compute slices 0 and 1 of the adjoint matrix.  An MV slice is a
        // column and a VM slice is a row.
        __m128 slice0, slice1;
        {
            __m128 b5b5b4b3 = _mm_shuffle_ps(a0b0a5b5, b1b2b3b4, _MM_SHUFFLE(2, 3, 3, 3));
            __m128 b4b2b2b1 = _mm_shuffle_ps(b1b2b3b4, b1b2b3b4, _MM_SHUFFLE(0, 1, 1, 3));
            __m128 b3b1b0b0 = _mm_shuffle_ps(b1b2b3b4, a0b0a5b5, _MM_SHUFFLE(1, 1, 0, 2));

            // Compute slice 0 of the adjoint matrix.
            {
                // MV:(m11, m11, m10, m10), VM:(m11, m11, m01, m01)
                t0 = _mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(1, 1, 1, 1));
                // MV:(m11, m10, m10, m10), VM:(m11, m01, m01, m01)
                t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 0));
                // MV:(m11*b5, m10*b5, m10*b4, m10*b3)
                // VM:(m11*b5, m01*b5, m01*b4, m01*b3)
                slice0 = _mm_mul_ps(t1, b5b5b4b3);

                // MV:(m12, m12, m11, m11), VM:(m21, m21, m11, m11)
                t0 = _mm_shuffle_ps(mat[2], mat[1], _MM_SHUFFLE(1, 1, 1, 1));
                // MV:(m12*b4, m12*b2, m11*b2, m11*b1)
                // VM:(m21*b4, m21*b2, m11*b2, m11*b1)
                t1 = _mm_mul_ps(t0, b4b2b2b1);
                slice0 = _mm_sub_ps(slice0, t1);

                // MV:(m13, m13, m12, m12), VM:(m31, m31, m21, m21)
                t0 = _mm_shuffle_ps(mat[3], mat[2], _MM_SHUFFLE(1, 1, 1, 1));
                // MV:(m13, m13, m13, m12), VM:(m31, m31, m31, m21)
                t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 0, 0, 0));
                // MV:(m13*b3, m13*b1, m13*b0, m12*b0)
                // VM:(m31*b3, m31*b1, m31*b0, m21*b0)
                t0 = _mm_mul_ps(t1, b3b1b0b0);
                slice0 = _mm_add_ps(slice0, t0);

                // Apply the alternating cofactor signs.
                // MV:(c00, c10, c20, c30), VM:(c00, c01, c02, c03)
                slice0 = _mm_mul_ps(slice0, PMPM);
            }

            // Compute slice 1 of the adjoint matrix.
            {
                // MV:(m01, m01, m00, m00), VM:(m10, m10, m00, m00)
                t0 = _mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(0, 0, 0, 0));
                // MV:(m01, m00, m00, m00), VM:(m10, m00, m00, m00)
                t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 0));
                // MV:(m01*b5, m00*b5, m00*b4, m00*b3)
                // VM:(m10*b5, m00*b5, m00*b4, m00*b3)
                slice1 = _mm_mul_ps(t1, b5b5b4b3);

                // MV:(m02, m02, m01, m01), VM:(m20, m20, m10, m10)
                t0 = _mm_shuffle_ps(mat[2], mat[1], _MM_SHUFFLE(0, 0, 0, 0));
                // MV:(m02*b4, m02*b2, m01*b2, m01*b1)
                // VM:(m20*b4, m20*b2, m10*b2, m10*b1)
                t1 = _mm_mul_ps(t0, b4b2b2b1);
                slice1 = _mm_sub_ps(slice1, t1);

                // MV:(m03, m03, m02, m02), VM:(m30, m30, m20, m20)
                t0 = _mm_shuffle_ps(mat[3], mat[2], _MM_SHUFFLE(0, 0, 0, 0));
                // MV:(m03, m03, m03, m02), VM:(m30, m30, m30, m20)
                t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 0, 0, 0));
                // MV:(m03*b3, m03*b1, m03*b0, m02*b0)
                // VM:(m30*b3, m30*b1, m30*b0, m20*b0)
                t0 = _mm_mul_ps(t1, b3b1b0b0);
                slice1 = _mm_add_ps(slice1, t0);

                // Apply the alternating cofactor signs.
                // MV:(c01, c11, c21, c31), VM:(c10, c11, c12, c13)
                slice1 = _mm_mul_ps(slice1, MPMP);
            }
        }

        // Compute slices 2 and 3 of the adjoint matrix.  An MV slice is a
        // column and a VM slice is a row.
        __m128 slice2, slice3;
        {
            __m128 a5a5a4a3 = _mm_shuffle_ps(a0b0a5b5, a1a2a3a4, _MM_SHUFFLE(2, 3, 2, 2));
            __m128 a4a2a2a1 = _mm_shuffle_ps(a1a2a3a4, a1a2a3a4, _MM_SHUFFLE(0, 1, 1, 3));
            __m128 a3a1a0a0 = _mm_shuffle_ps(a1a2a3a4, a0b0a5b5, _MM_SHUFFLE(0, 0, 0, 2));

            // Compute slice 2 of the adjoint matrix.
            {
                // MV:(m31, m31, m30, m30), VM:(m13, m13, m03, m03)
                t0 = _mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(3, 3, 3, 3));
                // MV:(m31, m30, m30, m30), VM:(m13, m03, m03, m03)
                t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 0));
                // MV:(m31*a5, m30*a5, m30*a4, m30*a3)
                // VM:(m13*a5, m03*a5, m03*a4, m03*a3)
                slice2 = _mm_mul_ps(t1, a5a5a4a3);

                // MV:(m32, m32, m31, m31), VM:(m23, m23, m13, m13)
                t0 = _mm_shuffle_ps(mat[2], mat[1], _MM_SHUFFLE(3, 3, 3, 3));
                // MV:(m32*a4, m32*a2, m31*a2, m31*a1)
                // VM:(m23*a4, m23*a2, m13*a2, m13*a1)
                t1 = _mm_mul_ps(t0, a4a2a2a1);
                slice2 = _mm_sub_ps(slice2, t1);

                // MV:(m33, m33, m32, m32), VM:(m33, m33, m23, m23)
                t0 = _mm_shuffle_ps(mat[3], mat[2], _MM_SHUFFLE(3, 3, 3, 3));
                // MV:(m33, m33, m33, m32), VM:(m33, m33, m33, m23)
                t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 0, 0, 0));
                // MV:(m33*a3, m33*a1, m33*a0, m32*a0)
                // VM:(m33*a3, m33*a1, m33*a0, m23*a0)
                t0 = _mm_mul_ps(t1, a3a1a0a0);
                slice2 = _mm_add_ps(slice2, t0);

                // Apply the alternating cofactor signs.
                // MV:(c02, c12, c22, c32), VM:(c20, c21, c22, c23)
                slice2 = _mm_mul_ps(slice2, PMPM);
            }

            // Compute slice 3 of the adjoint matrix.
            {
                // MV:(m21, m21, m20, m20), VM:(m12, m12, m02, m02)
                t0 = _mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(2, 2, 2, 2));
                // MV:(m21, m20, m20, m20), VM:(m12, m02, m02, m02)
                t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 0));
                // MV:(m21*a5, m20*a5, m20*a4, m20*a3)
                // VM:(m12*a5, m02*a5, m02*a4, m02*a3)
                slice3 = _mm_mul_ps(t1, a5a5a4a3);

                // MV:(m22, m22, m21, m21), VM:(m22, m22, m12, m12)
                t0 = _mm_shuffle_ps(mat[2], mat[1], _MM_SHUFFLE(2, 2, 2, 2));
                // MV:(m22*a4, m22*a2, m21*a2, m21*a1)
                // VM:(m22*a4, m22*a2, m12*a2, m12*a1)
                t1 = _mm_mul_ps(t0, a4a2a2a1);
                slice3 = _mm_sub_ps(slice3, t1);

                // MV:(m23, m23, m22, m22), VM:(m32, m32, m22, m22)
                t0 = _mm_shuffle_ps(mat[3], mat[2], _MM_SHUFFLE(2, 2, 2, 2));
                // MV:(m23, m23, m23, m22), VM:(m32, m32, m32, m22)
                t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 0, 0, 0));
                // MV:(m23*a3, m23*a1, m23*a0, m22*a0)
                // VM:(m32*a3, m32*a1, m32*a0, m22*a0)
                t0 = _mm_mul_ps(t1, a3a1a0a0);
                slice3 = _mm_add_ps(slice3, t0);

                // Apply the alternating cofactor signs.
                // MV:(c03, c13, c23, c33), VM:(c30, c31, c32, c33)
                slice3 = _mm_mul_ps(slice3, MPMP);
            }
        }

        adj[0] = slice0;
        adj[1] = slice1;
        adj[2] = slice2;
        adj[3] = slice3;

        if (det)
        {
            // Compute the determinant using the cofactors.
            // det = m00*c00 + m01*c10 + m02*c20 + m03*c30
            {
                // MV:(c00, c00, c01, c01), VM:(c00, c00, c10, c10)
                t0 = _mm_shuffle_ps(slice0, slice1, _MM_SHUFFLE(0, 0, 0, 0));
                // MV:(c02, c02, c03, c03), VM:(c20, c20, c30, c30)
                t1 = _mm_shuffle_ps(slice2, slice3, _MM_SHUFFLE(0, 0, 0, 0));
                // MV:(c00, c01, c02, c03), VM:(c00, c10, c20, c30)
                t1 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2, 0, 2, 0));
                // MV:(m00*c00, m10*c01, m20*c02, m30*c03)
                // VM:(m00*c00, m01*c10, m02*c20, m03*c30)
                t0 = _mm_mul_ps(mat[0], t1);
                // MV:(m10*c01, m00*c00, m30*c03, m20*c02)
                // VM:(m01*c10, m00*c00, m03*c30, m02*c20)
                t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 3, 0, 1));
                // MV:(m00*c00+m10*c01,m00*c00+m10*c01,m20*c02+m30*c03,m20*c02+m30*c03)
                // VM:(m00*c00+m01*c10,m00*c00+m01*c10,m02*c20+m03*c30,m02*c20+m03*c30)
                t0 = _mm_add_ps(t0, t1);
                // MV:(m20*c02+m30*c03,m20*c02+m30*c03,m00*c00+m10*c01,m00*c00+m10*c01)
                // VM:(m02*c20+m03*c30,m02*c20+m03*c30,m00*c00+m01*c10,m00*c00+m01*c10)
                t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 2, 2));
                // (det, det, det, det)
                *det = _mm_add_ps(t0, t1);
            }
        }
    }
    else if (det)
    {
        // Compute the determinant using the a- and b-coefficients.
        // det = (a0*b5+a5*b0+a2*b3+a3*b2) - (a1*b4+a4*b1) = dot0 - dot1

        // (a0, a5, a2, a3)
        t0 = _mm_shuffle_ps(a0b0a5b5, a1a2a3a4, _MM_SHUFFLE(2, 1, 2, 0));
        // (b5, b0, b3, b2)
        t1 = _mm_shuffle_ps(a0b0a5b5, b1b2b3b4, _MM_SHUFFLE(1, 2, 1, 3));
        // (a0*b5, a5*b0, a2*b3, a3*b2)
        t0 = _mm_mul_ps(t0, t1);
        // (a5*b0, a0*b5, a3*b2, a2*b3)
        t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 3, 0, 1));
        // (a0*b5+a5*b0, a0*b5+a5*b0, a2*b3+a3*b2, a2*b3+a3*b2)
        t0 = _mm_add_ps(t0, t1);
        // (a2*b3+a3*b2, a2*b3+a3*b2, a0*b5+a5*b0, a0*b5+a5*b0)
        t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 2, 2));
        // (dot0, dot0, dot0, dot0)
        __m128 dot0 = _mm_add_ps(t0, t1);

        // (a1, a4, a1, a4)
        t0 = _mm_shuffle_ps(a1a2a3a4, a1a2a3a4, _MM_SHUFFLE(3, 0, 3, 0));
        // (b4, b1, b4, b1)
        t1 = _mm_shuffle_ps(b1b2b3b4, b1b2b3b4, _MM_SHUFFLE(0, 3, 0, 3));
        // (a1*b4, a4*b1, a1*b4, a4*b1)
        t0 = _mm_mul_ps(t0, t1);
        // (a4*b1, a1*b4, a4*b1, a1*b4)
        t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 1, 0, 1));
        // (a1*b4+a4*b1, a1*b4+a4*b1, a1*b4+a4*b1, a1*b4+a4*b1)
        __m128 dot1 = _mm_add_ps(t0, t1);
        *det = _mm_sub_ps(dot0, dot1);
    }
}
1818 
1819 
1820 }
static __m128 SinApprDeg11(__m128 const x)
Definition: GteIntelSSE.h:1402
static Vector const ZZZF
Definition: GteIntelSSE.h:202
static Vector const C_COS_APPR_DEG10_1
Definition: GteIntelSSE.h:273
static Vector const ZZPZ
Definition: GteIntelSSE.h:224
static void MultiplyATB(__m128 const *A, __m128 const *B, __m128 *ATB)
Definition: GteIntelSSE.h:988
static Vector const C_COS_APPR_DEG10_0
Definition: GteIntelSSE.h:272
static __m128 Sin(__m128 const v)
Definition: GteIntelSSE.h:1319
static Vector const ZFFZ
Definition: GteIntelSSE.h:207
static Vector const FFZF
Definition: GteIntelSSE.h:214
static Vector const ZFFF
Definition: GteIntelSSE.h:208
static Vector const FFFF
Definition: GteIntelSSE.h:216
static void GetAdjDet(__m128 const *mat, __m128 *adj, __m128 *det)
Definition: GteIntelSSE.h:1484
static __m128 ATan(__m128 const v)
Definition: GteIntelSSE.h:1369
static __m128 And(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:459
static __m128 ACos(__m128 const v)
Definition: GteIntelSSE.h:1359
static Vector const PPPM
Definition: GteIntelSSE.h:244
static __m128 Add(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:527
static Vector const PI
Definition: GteIntelSSE.h:249
static Vector const PPMP
Definition: GteIntelSSE.h:243
Matrix2x2< Real > Adjoint(Matrix2x2< Real > const &M)
Definition: GteMatrix2x2.h:108
static Vector const FZFZ
Definition: GteIntelSSE.h:211
static __m128 Subtract(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:532
static Vector const C_SIN_APPR_DEG11_4
Definition: GteIntelSSE.h:264
static Vector const MMMM
Definition: GteIntelSSE.h:230
Matrix & operator=(Matrix const &mat)
Definition: GteIntelSSE.h:412
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t0
Definition: glext.h:9013
static __m128 Determinant(__m128 const *mat)
Definition: GteIntelSSE.h:836
__m128 mTable[4]
Definition: GteIntelSSE.h:99
static Vector const FFFZ
Definition: GteIntelSSE.h:215
static Vector const C_SIN_APPR_DEG11_2
Definition: GteIntelSSE.h:262
static __m128 ASin(__m128 const v)
Definition: GteIntelSSE.h:1349
static void MultiplyAB(__m128 const *A, __m128 const *B, __m128 *AB)
Definition: GteIntelSSE.h:885
static Vector const FZZF
Definition: GteIntelSSE.h:210
static __m128 Greater(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:508
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat s0
Definition: glext.h:9013
static void ReduceAnglesCos(__m128 const x, __m128 &y, __m128 &sign)
Definition: GteIntelSSE.h:1432
static Vector const FZFF
Definition: GteIntelSSE.h:212
static __m128 Normalize(__m128 const v)
Definition: GteIntelSSE.h:620
static Vector const MPPM
Definition: GteIntelSSE.h:236
static __m128 Equal(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:488
static Vector const C_COS_APPR_DEG6_3
Definition: GteIntelSSE.h:281
GLfloat GLfloat v1
Definition: glcorearb.h:812
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t1
Definition: glext.h:9013
static Vector const MMPM
Definition: GteIntelSSE.h:232
static __m128 Less(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:498
static Vector const C_COS_APPR_DEG10_2
Definition: GteIntelSSE.h:274
static Vector const MPMM
Definition: GteIntelSSE.h:234
static Vector const FFZZ
Definition: GteIntelSSE.h:213
static Vector const ZZZM
Definition: GteIntelSSE.h:229
static __m128 LessEqual(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:503
GMatrix< Real > MultiplyMD(GMatrix< Real > const &M, GVector< Real > const &D)
Definition: GteGMatrix.h:781
static Vector const C_COS_APPR_DEG6_1
Definition: GteIntelSSE.h:279
static Vector const PMPP
Definition: GteIntelSSE.h:241
static Vector const C_SIN_APPR_DEG11_0
Definition: GteIntelSSE.h:260
static __m128 NotEqual(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:493
static Vector const C_SIN_APPR_DEG11_1
Definition: GteIntelSSE.h:261
static __m128 Negate(__m128 const v)
Definition: GteIntelSSE.h:522
static Vector const C_COS_APPR_DEG10_4
Definition: GteIntelSSE.h:276
static Vector const PZZZ
Definition: GteIntelSSE.h:222
static Vector const MMMP
Definition: GteIntelSSE.h:231
static __m128 CosApprDeg6(__m128 const x, __m128 const sign)
Definition: GteIntelSSE.h:1471
GLint GLenum GLint x
Definition: glcorearb.h:404
GLint GLuint mask
Definition: glcorearb.h:119
static Vector const C_SIN_APPR_DEG11_3
Definition: GteIntelSSE.h:263
static __m128 GreaterEqual(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:513
__m128 const & operator[](int i) const
Definition: GteIntelSSE.h:440
static __m128 SinApprDeg7(__m128 const x)
Definition: GteIntelSSE.h:1419
static Vector const PMMP
Definition: GteIntelSSE.h:239
GMatrix< Real > MultiplyATB(GMatrix< Real > const &A, GMatrix< Real > const &B)
Definition: GteGMatrix.h:737
static Vector const ZZFZ
Definition: GteIntelSSE.h:203
static void MultiplyDM(__m128 const D, __m128 const *M, __m128 *DM)
Definition: GteIntelSSE.h:1009
static __m128 NormalizeRobustGetLength(__m128 const v, __m128 &length)
Definition: GteIntelSSE.h:686
static Vector const MPMP
Definition: GteIntelSSE.h:235
static __m128 QMultiply(__m128 const q0, __m128 const q1)
Definition: GteIntelSSE.h:1137
GMatrix< Real > Transpose(GMatrix< Real > const &M)
Definition: GteGMatrix.h:637
const GLubyte * c
Definition: glext.h:11671
GLint GLenum GLboolean normalized
Definition: glcorearb.h:867
static void MultiplyABT(__m128 const *A, __m128 const *B, __m128 *ABT)
Definition: GteIntelSSE.h:995
static __m128 MaximumAbsoluteComponent(__m128 const v)
Definition: GteIntelSSE.h:559
static __m128 Xor(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:474
static Vector const C_SIN_APPR_DEG7_3
Definition: GteIntelSSE.h:269
static Vector const MMPP
Definition: GteIntelSSE.h:233
static Vector const ZZZP
Definition: GteIntelSSE.h:225
static Vector const PPPP
Definition: GteIntelSSE.h:245
static Vector const SIGN
Definition: GteIntelSSE.h:217
static Vector const INV_TWO_PI
Definition: GteIntelSSE.h:253
static __m128 Or(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:469
static __m128 L1Norm(__m128 const *mat)
Definition: GteIntelSSE.h:843
DualQuaternion< Real > Dot(DualQuaternion< Real > const &d0, DualQuaternion< Real > const &d1)
static __m128 Tan(__m128 const v)
Definition: GteIntelSSE.h:1339
static __m128 MultiplyVM(__m128 const V, __m128 const *M)
Definition: GteIntelSSE.h:1092
static Vector const PMPM
Definition: GteIntelSSE.h:240
static Vector const ZFZZ
Definition: GteIntelSSE.h:205
static __m128 Dot(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:576
static void MultiplyMD(__m128 const *M, __m128 const D, __m128 *MD)
Definition: GteIntelSSE.h:1028
static Vector const NSIGN
Definition: GteIntelSSE.h:218
static Vector const ZZFF
Definition: GteIntelSSE.h:204
Real Normalize(GVector< Real > &v, bool robust=false)
Definition: GteGVector.h:454
static __m128 CosApprDeg10(__m128 const x, __m128 const sign)
Definition: GteIntelSSE.h:1454
static __m128 Not(__m128 const v)
Definition: GteIntelSSE.h:454
GLfloat v0
Definition: glcorearb.h:811
GLdouble GLdouble t
Definition: glext.h:239
static __m128 QConjugate(__m128 const q)
Definition: GteIntelSSE.h:1190
DualQuaternion< Real > Cross(DualQuaternion< Real > const &d0, DualQuaternion< Real > const &d1)
static void MultiplyATBT(__m128 const *A, __m128 const *B, __m128 *ATBT)
Definition: GteIntelSSE.h:1002
static Vector const C_COS_APPR_DEG10_5
Definition: GteIntelSSE.h:277
GMatrix< Real > MultiplyAB(GMatrix< Real > const &A, GMatrix< Real > const &B)
Definition: GteGMatrix.h:693
static Vector const C_SIN_APPR_DEG7_2
Definition: GteIntelSSE.h:268
static __m128 Cross(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:717
GLuint GLsizei GLsizei * length
Definition: glcorearb.h:790
static void Adjoint(__m128 const *mat, __m128 *adj)
Definition: GteIntelSSE.h:831
static Vector const C_COS_APPR_DEG6_2
Definition: GteIntelSSE.h:280
static Vector const HALF_PI
Definition: GteIntelSSE.h:250
static Vector const ZFZF
Definition: GteIntelSSE.h:206
static void Transpose(__m128 const *mat, __m128 *trn)
Definition: GteIntelSSE.h:793
static Vector const INV_PI
Definition: GteIntelSSE.h:252
static __m128 Length(__m128 const v)
Definition: GteIntelSSE.h:595
DualQuaternion< Real > Length(DualQuaternion< Real > const &d, bool robust=false)
const GLdouble * v
Definition: glcorearb.h:832
static Vector const MPPP
Definition: GteIntelSSE.h:237
static __m128 NormalizeRobust(__m128 const v)
Definition: GteIntelSSE.h:657
GMatrix< Real > MultiplyATBT(GMatrix< Real > const &A, GMatrix< Real > const &B)
Definition: GteGMatrix.h:759
Real LInfinityNorm(GMatrix< Real > const &M)
Definition: GteGMatrix.h:575
static Vector const FZZZ
Definition: GteIntelSSE.h:209
static Vector const PMMM
Definition: GteIntelSSE.h:238
static __m128 Round(__m128 const v)
Definition: GteIntelSSE.h:547
static Vector const C_COS_APPR_DEG10_3
Definition: GteIntelSSE.h:275
static Vector const ZZZZ
Definition: GteIntelSSE.h:201
static Vector const TWO_PI
Definition: GteIntelSSE.h:251
static __m128 LengthRobust(__m128 const v)
Definition: GteIntelSSE.h:601
GMatrix< Real > MultiplyABT(GMatrix< Real > const &A, GMatrix< Real > const &B)
Definition: GteGMatrix.h:715
static __m128 QInverse(__m128 const q)
Definition: GteIntelSSE.h:1196
GLfloat f
Definition: glcorearb.h:1921
GLdouble GLdouble GLdouble GLdouble q
Definition: glext.h:255
static Vector const C_SIN_APPR_DEG7_1
Definition: GteIntelSSE.h:267
static Vector const ZPZZ
Definition: GteIntelSSE.h:223
static __m128 Cos(__m128 const v)
Definition: GteIntelSSE.h:1329
Quaternion< Real > Inverse(Quaternion< Real > const &d)
static __m128 MultiplyMV(__m128 const *M, __m128 const V)
Definition: GteIntelSSE.h:1051
static Vector const C_SIN_APPR_DEG7_0
Definition: GteIntelSSE.h:266
static void Inverse(__m128 const *mat, __m128 *inv)
Definition: GteIntelSSE.h:814
Real L2Norm(GMatrix< Real > const &M)
Definition: GteGMatrix.h:564
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat s1
Definition: glext.h:9013
static __m128 NormalizeGetLength(__m128 const v, __m128 &length)
Definition: GteIntelSSE.h:638
static Vector const C_SIN_APPR_DEG11_5
Definition: GteIntelSSE.h:265
static __m128 L2Norm(__m128 const *mat)
Definition: GteIntelSSE.h:856
Real Determinant(GMatrix< Real > const &M)
Definition: GteGMatrix.h:618
static Vector const MZZZ
Definition: GteIntelSSE.h:226
static __m128 Select(__m128 const c, __m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:479
static Vector const ZZMZ
Definition: GteIntelSSE.h:228
static void ReduceAnglesSin(__m128 const x, __m128 &y)
Definition: GteIntelSSE.h:1383
static Vector const C_COS_APPR_DEG6_0
Definition: GteIntelSSE.h:278
Vector & operator=(Vector const &vec)
Definition: GteIntelSSE.h:330
static __m128 QSlerp(__m128 const t, __m128 const q0, __m128 const q1)
Definition: GteIntelSSE.h:1214
GLuint64EXT * result
Definition: glext.h:10003
GLint y
Definition: glcorearb.h:98
static __m128 Multiply(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:537
static Vector const ZMZZ
Definition: GteIntelSSE.h:227
#define GTE_IMPEXP
Definition: GTEngineDEF.h:63
static __m128 AndNot(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:464
static Vector const NOFRC
Definition: GteIntelSSE.h:219
Real L1Norm(GMatrix< Real > const &M)
Definition: GteGMatrix.h:553
static __m128 LInfinityNorm(__m128 const *mat)
Definition: GteIntelSSE.h:869
GMatrix< Real > MultiplyDM(GVector< Real > const &D, GMatrix< Real > const &M)
Definition: GteGMatrix.h:798
static __m128 Divide(__m128 const v0, __m128 const v1)
Definition: GteIntelSSE.h:542
static Vector const PPMM
Definition: GteIntelSSE.h:242


geometric_tools_engine
Author(s): Yijiang Huang
autogenerated on Thu Jul 18 2019 04:00:00