00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
00011 #define EIGEN_PACKET_MATH_ALTIVEC_H
00012
00013 namespace Eigen {
00014
00015 namespace internal {
00016
// AltiVec defaults for a few Eigen tuning macros. Each one is defined here
// only when nothing earlier in the translation unit has already defined it.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#  define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

#if !defined(EIGEN_HAS_FUSE_CJMADD)
#  define EIGEN_HAS_FUSE_CJMADD 1
#endif

// NOTE(review): presumably the number of vector registers assumed by the
// product kernels -- confirm against the code that consumes this macro.
#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#  define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
00029
// Packet (SIMD vector) aliases built on the AltiVec `__vector` keyword.
// The digit in each name is the lane count the rest of this file assumes
// (see unpacket_traits below for Packet4f / Packet4i).
typedef __vector float Packet4f;          // float lanes
typedef __vector int Packet4i;            // signed int lanes
typedef __vector unsigned int Packet4ui;  // unsigned int lanes
typedef __vector __bool int Packet4bi;    // boolean (mask) int lanes
typedef __vector short int Packet8i;      // short int lanes
typedef __vector unsigned char Packet16uc; // byte lanes; used for permute controls
00036
00037
00038
00039
// Declares `Packet4f p4f_<NAME>` initialised from vec_splat_s32(VALUE) cast to
// Packet4f. The cast is a vector cast, not a numeric int->float conversion.
#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, VALUE) \
  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(VALUE)

// Declares `Packet4i p4i_<NAME>` with every lane set by vec_splat_s32(VALUE).
#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, VALUE) \
  Packet4i p4i_##NAME = vec_splat_s32(VALUE)

// Declares `Packet4f p4f_<NAME>` with every lane set to VALUE via pset1.
#define _EIGEN_DECLARE_CONST_Packet4f(NAME, VALUE) \
  Packet4f p4f_##NAME = pset1<Packet4f>(VALUE)
00048
// Declares `Packet4f p4f_<NAME>` whose lanes carry the raw bit pattern of the
// integer X (vector cast, no numeric conversion).
// The previous expansion called vreinterpretq_f32_u32, which is an ARM NEON
// intrinsic, and pset1<int>, which is not a packet type; neither is usable in
// this AltiVec header. The integer is now broadcast with pset1<Packet4i> and
// cast to Packet4f, so the macro must be used after pset1<Packet4i> is declared.
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  Packet4f p4f_##NAME = (Packet4f) pset1<Packet4i>(X)
00051
// Declares `Packet4i p4i_<NAME>` with every lane set to VALUE via pset1.
#define _EIGEN_DECLARE_CONST_Packet4i(NAME, VALUE) \
  Packet4i p4i_##NAME = pset1<Packet4i>(VALUE)
00054
// Channel argument passed to vec_dstt by the prefetch() specialisations.
#define DST_CHAN 1
// Packs the three fields of a data-stream control word:
// block size in bits 24 and up, block count in bits 16 and up, stride below.
#define DST_CTRL(block_size, block_count, block_stride) \
  (((block_size) << 24) | ((block_count) << 16) | (block_stride))
00057
00058
// File-scope packet constants. The order matters: p4f_ONE and p4f_ZERO_ are
// computed from integer constants declared just above them.

// Lane offsets {3,2,1,0}; added to a broadcast scalar by plset().
static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
// Permute control that reverses the order of the four 4-byte lanes (see preverse()).
static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
// Permute control produced by vec_lvsl for a null address.
// NOTE(review): presumably the identity permutation {0..15} -- confirm.
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
// Permute control that repeats lane 0 twice, then lane 1 twice (see ploaddup()).
static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};

// Splat constants: p4f_ZERO, p4i_ZERO, p4i_ONE, p4i_MINUS16, p4i_MINUS1.
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
// 1.0f in every lane: integer 1 converted to float with a scale exponent of 0.
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
// All-ones lanes shifted left by an all-ones shift count.
// NOTE(review): presumably leaves only the sign bit set (0x80000000, i.e.
// -0.0f) since the shift count is taken modulo the lane width -- confirm.
static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
00072
00073 template<> struct packet_traits<float> : default_packet_traits
00074 {
00075 typedef Packet4f type;
00076 enum {
00077 Vectorizable = 1,
00078 AlignedOnScalar = 1,
00079 size=4,
00080
00081
00082 HasSin = 0,
00083 HasCos = 0,
00084 HasLog = 0,
00085 HasExp = 0,
00086 HasSqrt = 0
00087 };
00088 };
00089 template<> struct packet_traits<int> : default_packet_traits
00090 {
00091 typedef Packet4i type;
00092 enum {
00093
00094 Vectorizable = 1,
00095 AlignedOnScalar = 1,
00096 size=4
00097 };
00098 };
00099
00100 template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
00101 template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
00148
00149 float EIGEN_ALIGN16 af[4];
00150 af[0] = from;
00151 Packet4f vc = vec_ld(0, af);
00152 vc = vec_splat(vc, 0);
00153 return vc;
00154 }
00155
00156 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
00157 int EIGEN_ALIGN16 ai[4];
00158 ai[0] = from;
00159 Packet4i vc = vec_ld(0, ai);
00160 vc = vec_splat(vc, 0);
00161 return vc;
00162 }
00163
00164 template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
00165 template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
00166
00167 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
00168 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
00169
00170 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
00171 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
00172
00173 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
00174 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
00175
00176 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
00177 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
00178
00179 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
00217 {
00218 Packet4f t, y_0, y_1, res;
00219
00220
00221 y_0 = vec_re(b);
00222
00223
00224 t = vec_nmsub(y_0, b, p4f_ONE);
00225 y_1 = vec_madd(y_0, t, y_0);
00226
00227 res = vec_madd(a, y_1, p4f_ZERO);
00228 return res;
00229 }
00230
00231 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& , const Packet4i& )
00232 { eigen_assert(false && "packet integer division are not supported by AltiVec");
00233 return pset1<Packet4i>(0);
00234 }
00235
00236
00237 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
00238 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
00239
00240 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
00241 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
00242
00243 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
00244 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
00245
00246
00247 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
00248 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
00249
00250 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
00251 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
00252
00253 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
00254 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
00255
00256 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
00257 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
00258
00259 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
00260 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
00261
00262 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
00263 {
00264 EIGEN_DEBUG_ALIGNED_LOAD
00265
00266 Packet16uc MSQ, LSQ;
00267 Packet16uc mask;
00268 MSQ = vec_ld(0, (unsigned char *)from);
00269 LSQ = vec_ld(15, (unsigned char *)from);
00270 mask = vec_lvsl(0, from);
00271 return (Packet4f) vec_perm(MSQ, LSQ, mask);
00272
00273 }
00274 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
00275 {
00276 EIGEN_DEBUG_ALIGNED_LOAD
00277
00278 Packet16uc MSQ, LSQ;
00279 Packet16uc mask;
00280 MSQ = vec_ld(0, (unsigned char *)from);
00281 LSQ = vec_ld(15, (unsigned char *)from);
00282 mask = vec_lvsl(0, from);
00283 return (Packet4i) vec_perm(MSQ, LSQ, mask);
00284 }
00285
00286 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
00287 {
00288 Packet4f p;
00289 if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4f>(from);
00290 else p = ploadu<Packet4f>(from);
00291 return vec_perm(p, p, p16uc_DUPLICATE);
00292 }
00293 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
00294 {
00295 Packet4i p;
00296 if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4i>(from);
00297 else p = ploadu<Packet4i>(from);
00298 return vec_perm(p, p, p16uc_DUPLICATE);
00299 }
00300
00301 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
00302 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
00303
00304 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
00305 {
00306 EIGEN_DEBUG_UNALIGNED_STORE
00307
00308
00309 Packet16uc MSQ, LSQ, edges;
00310 Packet16uc edgeAlign, align;
00311
00312 MSQ = vec_ld(0, (unsigned char *)to);
00313 LSQ = vec_ld(15, (unsigned char *)to);
00314 edgeAlign = vec_lvsl(0, to);
00315 edges=vec_perm(LSQ,MSQ,edgeAlign);
00316 align = vec_lvsr( 0, to );
00317 MSQ = vec_perm(edges,(Packet16uc)from,align);
00318 LSQ = vec_perm((Packet16uc)from,edges,align);
00319 vec_st( LSQ, 15, (unsigned char *)to );
00320 vec_st( MSQ, 0, (unsigned char *)to );
00321 }
00322 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
00323 {
00324 EIGEN_DEBUG_UNALIGNED_STORE
00325
00326
00327 Packet16uc MSQ, LSQ, edges;
00328 Packet16uc edgeAlign, align;
00329
00330 MSQ = vec_ld(0, (unsigned char *)to);
00331 LSQ = vec_ld(15, (unsigned char *)to);
00332 edgeAlign = vec_lvsl(0, to);
00333 edges=vec_perm(LSQ, MSQ, edgeAlign);
00334 align = vec_lvsr( 0, to );
00335 MSQ = vec_perm(edges, (Packet16uc) from, align);
00336 LSQ = vec_perm((Packet16uc) from, edges, align);
00337 vec_st( LSQ, 15, (unsigned char *)to );
00338 vec_st( MSQ, 0, (unsigned char *)to );
00339 }
00340
00341 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
00342 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
00343
00344 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
00345 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
00346
00347 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
00348 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
00349
00350 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
00351 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
00352
00353 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
00354 {
00355 Packet4f b, sum;
00356 b = (Packet4f) vec_sld(a, a, 8);
00357 sum = vec_add(a, b);
00358 b = (Packet4f) vec_sld(sum, sum, 4);
00359 sum = vec_add(sum, b);
00360 return pfirst(sum);
00361 }
00362
00363 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
00364 {
00365 Packet4f v[4], sum[4];
00366
00367
00368
00369
00370 v[0] = vec_mergeh(vecs[0], vecs[2]);
00371 v[1] = vec_mergel(vecs[0], vecs[2]);
00372 v[2] = vec_mergeh(vecs[1], vecs[3]);
00373 v[3] = vec_mergel(vecs[1], vecs[3]);
00374
00375 sum[0] = vec_mergeh(v[0], v[2]);
00376 sum[1] = vec_mergel(v[0], v[2]);
00377 sum[2] = vec_mergeh(v[1], v[3]);
00378 sum[3] = vec_mergel(v[1], v[3]);
00379
00380
00381
00382 sum[0] = vec_add(sum[0], sum[1]);
00383
00384 sum[1] = vec_add(sum[2], sum[3]);
00385
00386 sum[0] = vec_add(sum[0], sum[1]);
00387
00388 return sum[0];
00389 }
00390
00391 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
00392 {
00393 Packet4i sum;
00394 sum = vec_sums(a, p4i_ZERO);
00395 sum = vec_sld(sum, p4i_ZERO, 12);
00396 return pfirst(sum);
00397 }
00398
00399 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
00400 {
00401 Packet4i v[4], sum[4];
00402
00403
00404
00405
00406 v[0] = vec_mergeh(vecs[0], vecs[2]);
00407 v[1] = vec_mergel(vecs[0], vecs[2]);
00408 v[2] = vec_mergeh(vecs[1], vecs[3]);
00409 v[3] = vec_mergel(vecs[1], vecs[3]);
00410
00411 sum[0] = vec_mergeh(v[0], v[2]);
00412 sum[1] = vec_mergel(v[0], v[2]);
00413 sum[2] = vec_mergeh(v[1], v[3]);
00414 sum[3] = vec_mergel(v[1], v[3]);
00415
00416
00417
00418 sum[0] = vec_add(sum[0], sum[1]);
00419
00420 sum[1] = vec_add(sum[2], sum[3]);
00421
00422 sum[0] = vec_add(sum[0], sum[1]);
00423
00424 return sum[0];
00425 }
00426
00427
00428
00429 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
00430 {
00431 Packet4f prod;
00432 prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
00433 return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
00434 }
00435
00436 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
00437 {
00438 EIGEN_ALIGN16 int aux[4];
00439 pstore(aux, a);
00440 return aux[0] * aux[1] * aux[2] * aux[3];
00441 }
00442
00443
00444 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
00445 {
00446 Packet4f b, res;
00447 b = vec_min(a, vec_sld(a, a, 8));
00448 res = vec_min(b, vec_sld(b, b, 4));
00449 return pfirst(res);
00450 }
00451
00452 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
00453 {
00454 Packet4i b, res;
00455 b = vec_min(a, vec_sld(a, a, 8));
00456 res = vec_min(b, vec_sld(b, b, 4));
00457 return pfirst(res);
00458 }
00459
00460
00461 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
00462 {
00463 Packet4f b, res;
00464 b = vec_max(a, vec_sld(a, a, 8));
00465 res = vec_max(b, vec_sld(b, b, 4));
00466 return pfirst(res);
00467 }
00468
00469 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
00470 {
00471 Packet4i b, res;
00472 b = vec_max(a, vec_sld(a, a, 8));
00473 res = vec_max(b, vec_sld(b, b, 4));
00474 return pfirst(res);
00475 }
00476
00477 template<int Offset>
00478 struct palign_impl<Offset,Packet4f>
00479 {
00480 static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
00481 {
00482 if (Offset!=0)
00483 first = vec_sld(first, second, Offset*4);
00484 }
00485 };
00486
00487 template<int Offset>
00488 struct palign_impl<Offset,Packet4i>
00489 {
00490 static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
00491 {
00492 if (Offset!=0)
00493 first = vec_sld(first, second, Offset*4);
00494 }
00495 };
00496
00497 }
00498
00499 }
00500
00501 #endif // EIGEN_PACKET_MATH_ALTIVEC_H