00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
00026 #define EIGEN_PACKET_MATH_ALTIVEC_H
00027
00028 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
00029 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
00030 #endif
00031
00032 typedef __vector float v4f;
00033 typedef __vector int v4i;
00034 typedef __vector unsigned int v4ui;
00035 typedef __vector __bool int v4bi;
00036
00037
00038
00039
// Helper macros that declare frequently needed constant packets as local
// variables. Each macro expands to one or more declarations WITHOUT the final
// semicolon, so it is used as a statement:  USE_CONST_v0f;
// Some macros pull in the macro they are built from, so two macros that
// declare the same name must not be used in the same scope.

// v0i: every lane is the integer 0.
#define USE_CONST_v0i const v4i v0i = vec_splat_s32(0)
// v1i: every lane is the integer 1.
#define USE_CONST_v1i const v4i v1i = vec_splat_s32(1)
// v16i_: every lane is -16; used in this file as a shift/rotate count.
#define USE_CONST_v16i_ const v4i v16i_ = vec_splat_s32(-16)
// v0f: the bits of v0i reinterpreted as floats, i.e. +0.0f (also declares v0i).
#define USE_CONST_v0f USE_CONST_v0i; const v4f v0f = (v4f) v0i
// v1f: v1i converted to float, i.e. 1.0f (also declares v1i).
#define USE_CONST_v1f USE_CONST_v1i; const v4f v1f = vec_ctf(v1i, 0)
// v1i_: vec_splat_u32(-1) in every lane (all bits set).
#define USE_CONST_v1i_ const v4ui v1i_ = vec_splat_u32(-1)
// v0f_: v1i_ shifted left by itself and reinterpreted as float; this leaves
// only the sign bit set, i.e. -0.0f (also declares v1i_).
#define USE_CONST_v0f_ USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_)
00047
00048 template<> struct ei_packet_traits<float> { typedef v4f type; enum {size=4}; };
00049 template<> struct ei_packet_traits<int> { typedef v4i type; enum {size=4}; };
00050
00051 template<> struct ei_unpacket_traits<v4f> { typedef float type; enum {size=4}; };
00052 template<> struct ei_unpacket_traits<v4i> { typedef int type; enum {size=4}; };
00053
00054 inline std::ostream & operator <<(std::ostream & s, const v4f & v)
00055 {
00056 union {
00057 v4f v;
00058 float n[4];
00059 } vt;
00060 vt.v = v;
00061 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00062 return s;
00063 }
00064
00065 inline std::ostream & operator <<(std::ostream & s, const v4i & v)
00066 {
00067 union {
00068 v4i v;
00069 int n[4];
00070 } vt;
00071 vt.v = v;
00072 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00073 return s;
00074 }
00075
00076 inline std::ostream & operator <<(std::ostream & s, const v4ui & v)
00077 {
00078 union {
00079 v4ui v;
00080 unsigned int n[4];
00081 } vt;
00082 vt.v = v;
00083 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00084 return s;
00085 }
00086
00087 inline std::ostream & operator <<(std::ostream & s, const v4bi & v)
00088 {
00089 union {
00090 __vector __bool int v;
00091 unsigned int n[4];
00092 } vt;
00093 vt.v = v;
00094 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00095 return s;
00096 }
00097
00098 template<> inline v4f ei_padd(const v4f& a, const v4f& b) { return vec_add(a,b); }
00099 template<> inline v4i ei_padd(const v4i& a, const v4i& b) { return vec_add(a,b); }
00100
00101 template<> inline v4f ei_psub(const v4f& a, const v4f& b) { return vec_sub(a,b); }
00102 template<> inline v4i ei_psub(const v4i& a, const v4i& b) { return vec_sub(a,b); }
00103
00104 template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { USE_CONST_v0f; return vec_madd(a,b, v0f); }
00105 template<> inline v4i ei_pmul(const v4i& a, const v4i& b)
00106 {
00107
00108
00109 v4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
00110 USE_CONST_v0i;
00111 USE_CONST_v1i;
00112 USE_CONST_v16i_;
00113
00114
00115 a1 = vec_abs(a);
00116 b1 = vec_abs(b);
00117
00118
00119 v4bi sgn = (v4bi) vec_cmplt(vec_xor(a, b), v0i);
00120
00121
00122 bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ );
00123 low_prod = vec_mulo((__vector short)a1, (__vector short)b1);
00124 high_prod = vec_msum((__vector short)a1, (__vector short)bswap, v0i);
00125 high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_);
00126 prod = vec_add( low_prod, high_prod );
00127
00128
00129 prod_ = vec_nor(prod, prod);
00130 prod_ = vec_sel(v0i, prod_, sgn);
00131
00132
00133 v1sel = vec_sel(v0i, v1i, sgn);
00134 prod_ = vec_add(prod_, v1sel);
00135
00136
00137 prod = vec_sel(prod, prod_, sgn);
00138
00139 return prod;
00140 }
00141
00142 template<> inline v4f ei_pdiv(const v4f& a, const v4f& b) {
00143 v4f t, y_0, y_1, res;
00144 USE_CONST_v0f;
00145 USE_CONST_v1f;
00146
00147
00148 y_0 = vec_re(b);
00149
00150
00151 t = vec_nmsub(y_0, b, v1f);
00152 y_1 = vec_madd(y_0, t, y_0);
00153
00154 res = vec_madd(a, y_1, v0f);
00155 return res;
00156 }
00157
00158 template<> inline v4f ei_pmadd(const v4f& a, const v4f& b, const v4f& c) { return vec_madd(a, b, c); }
00159
00160 template<> inline v4f ei_pmin(const v4f& a, const v4f& b) { return vec_min(a,b); }
00161 template<> inline v4i ei_pmin(const v4i& a, const v4i& b) { return vec_min(a,b); }
00162
00163 template<> inline v4f ei_pmax(const v4f& a, const v4f& b) { return vec_max(a,b); }
00164 template<> inline v4i ei_pmax(const v4i& a, const v4i& b) { return vec_max(a,b); }
00165
00166 template<> inline v4f ei_pload(const float* from) { return vec_ld(0, from); }
00167 template<> inline v4i ei_pload(const int* from) { return vec_ld(0, from); }
00168
00169 template<> inline v4f ei_ploadu(const float* from)
00170 {
00171
00172 __vector unsigned char MSQ, LSQ;
00173 __vector unsigned char mask;
00174 MSQ = vec_ld(0, (unsigned char *)from);
00175 LSQ = vec_ld(15, (unsigned char *)from);
00176 mask = vec_lvsl(0, from);
00177 return (v4f) vec_perm(MSQ, LSQ, mask);
00178 }
00179
00180 template<> inline v4i ei_ploadu(const int* from)
00181 {
00182
00183 __vector unsigned char MSQ, LSQ;
00184 __vector unsigned char mask;
00185 MSQ = vec_ld(0, (unsigned char *)from);
00186 LSQ = vec_ld(15, (unsigned char *)from);
00187 mask = vec_lvsl(0, from);
00188 return (v4i) vec_perm(MSQ, LSQ, mask);
00189 }
00190
00191 template<> inline v4f ei_pset1(const float& from)
00192 {
00193
00194 float __attribute__(aligned(16)) af[4];
00195 af[0] = from;
00196 v4f vc = vec_ld(0, af);
00197 vc = vec_splat(vc, 0);
00198 return vc;
00199 }
00200
00201 template<> inline v4i ei_pset1(const int& from)
00202 {
00203 int __attribute__(aligned(16)) ai[4];
00204 ai[0] = from;
00205 v4i vc = vec_ld(0, ai);
00206 vc = vec_splat(vc, 0);
00207 return vc;
00208 }
00209
00210 template<> inline void ei_pstore(float* to, const v4f& from) { vec_st(from, 0, to); }
00211 template<> inline void ei_pstore(int* to, const v4i& from) { vec_st(from, 0, to); }
00212
00213 template<> inline void ei_pstoreu(float* to, const v4f& from)
00214 {
00215
00216
00217 __vector unsigned char MSQ, LSQ, edges;
00218 __vector unsigned char edgeAlign, align;
00219
00220 MSQ = vec_ld(0, (unsigned char *)to);
00221 LSQ = vec_ld(15, (unsigned char *)to);
00222 edgeAlign = vec_lvsl(0, to);
00223 edges=vec_perm(LSQ,MSQ,edgeAlign);
00224 align = vec_lvsr( 0, to );
00225 MSQ = vec_perm(edges,(__vector unsigned char)from,align);
00226 LSQ = vec_perm((__vector unsigned char)from,edges,align);
00227 vec_st( LSQ, 15, (unsigned char *)to );
00228 vec_st( MSQ, 0, (unsigned char *)to );
00229 }
00230
00231 template<> inline void ei_pstoreu(int* to , const v4i& from )
00232 {
00233
00234
00235 __vector unsigned char MSQ, LSQ, edges;
00236 __vector unsigned char edgeAlign, align;
00237
00238 MSQ = vec_ld(0, (unsigned char *)to);
00239 LSQ = vec_ld(15, (unsigned char *)to);
00240 edgeAlign = vec_lvsl(0, to);
00241 edges=vec_perm(LSQ,MSQ,edgeAlign);
00242 align = vec_lvsr( 0, to );
00243 MSQ = vec_perm(edges,(__vector unsigned char)from,align);
00244 LSQ = vec_perm((__vector unsigned char)from,edges,align);
00245 vec_st( LSQ, 15, (unsigned char *)to );
00246 vec_st( MSQ, 0, (unsigned char *)to );
00247 }
00248
00249 template<> inline float ei_pfirst(const v4f& a)
00250 {
00251 float __attribute__(aligned(16)) af[4];
00252 vec_st(a, 0, af);
00253 return af[0];
00254 }
00255
00256 template<> inline int ei_pfirst(const v4i& a)
00257 {
00258 int __attribute__(aligned(16)) ai[4];
00259 vec_st(a, 0, ai);
00260 return ai[0];
00261 }
00262
00263 inline v4f ei_preduxp(const v4f* vecs)
00264 {
00265 v4f v[4], sum[4];
00266
00267
00268
00269
00270 v[0] = vec_mergeh(vecs[0], vecs[2]);
00271 v[1] = vec_mergel(vecs[0], vecs[2]);
00272 v[2] = vec_mergeh(vecs[1], vecs[3]);
00273 v[3] = vec_mergel(vecs[1], vecs[3]);
00274
00275 sum[0] = vec_mergeh(v[0], v[2]);
00276 sum[1] = vec_mergel(v[0], v[2]);
00277 sum[2] = vec_mergeh(v[1], v[3]);
00278 sum[3] = vec_mergel(v[1], v[3]);
00279
00280
00281
00282 sum[0] = vec_add(sum[0], sum[1]);
00283
00284 sum[1] = vec_add(sum[2], sum[3]);
00285
00286 sum[0] = vec_add(sum[0], sum[1]);
00287 return sum[0];
00288 }
00289
00290 inline float ei_predux(const v4f& a)
00291 {
00292 v4f b, sum;
00293 b = (v4f)vec_sld(a, a, 8);
00294 sum = vec_add(a, b);
00295 b = (v4f)vec_sld(sum, sum, 4);
00296 sum = vec_add(sum, b);
00297 return ei_pfirst(sum);
00298 }
00299
00300 inline v4i ei_preduxp(const v4i* vecs)
00301 {
00302 v4i v[4], sum[4];
00303
00304
00305
00306
00307 v[0] = vec_mergeh(vecs[0], vecs[2]);
00308 v[1] = vec_mergel(vecs[0], vecs[2]);
00309 v[2] = vec_mergeh(vecs[1], vecs[3]);
00310 v[3] = vec_mergel(vecs[1], vecs[3]);
00311
00312 sum[0] = vec_mergeh(v[0], v[2]);
00313 sum[1] = vec_mergel(v[0], v[2]);
00314 sum[2] = vec_mergeh(v[1], v[3]);
00315 sum[3] = vec_mergel(v[1], v[3]);
00316
00317
00318
00319 sum[0] = vec_add(sum[0], sum[1]);
00320
00321 sum[1] = vec_add(sum[2], sum[3]);
00322
00323 sum[0] = vec_add(sum[0], sum[1]);
00324 return sum[0];
00325 }
00326
00327 inline int ei_predux(const v4i& a)
00328 {
00329 USE_CONST_v0i;
00330 v4i sum;
00331 sum = vec_sums(a, v0i);
00332 sum = vec_sld(sum, v0i, 12);
00333 return ei_pfirst(sum);
00334 }
00335
00336 template<int Offset>
00337 struct ei_palign_impl<Offset, v4f>
00338 {
00339 inline static void run(v4f& first, const v4f& second)
00340 {
00341 first = vec_sld(first, second, Offset*4);
00342 }
00343 };
00344
00345 template<int Offset>
00346 struct ei_palign_impl<Offset, v4i>
00347 {
00348 inline static void run(v4i& first, const v4i& second)
00349 {
00350 first = vec_sld(first, second, Offset*4);
00351 }
00352 };
00353
00354 #endif // EIGEN_PACKET_MATH_ALTIVEC_H