vcglib: PacketMath.h Source File

00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra. Eigen itself is part of the KDE project.
00003 //
00004 // Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
00005 //
00006 // Eigen is free software; you can redistribute it and/or
00007 // modify it under the terms of the GNU Lesser General Public
00008 // License as published by the Free Software Foundation; either
00009 // version 3 of the License, or (at your option) any later version.
00010 //
00011 // Alternatively, you can redistribute it and/or
00012 // modify it under the terms of the GNU General Public License as
00013 // published by the Free Software Foundation; either version 2 of
00014 // the License, or (at your option) any later version.
00015 //
00016 // Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
00017 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00018 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
00019 // GNU General Public License for more details.
00020 //
00021 // You should have received a copy of the GNU Lesser General Public
00022 // License and a copy of the GNU General Public License along with
00023 // Eigen. If not, see <http://www.gnu.org/licenses/>.
00024 
00025 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
00026 #define EIGEN_PACKET_MATH_ALTIVEC_H
00027 
00028 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
00029 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
00030 #endif
00031 
00032 typedef __vector float          v4f;
00033 typedef __vector int            v4i;
00034 typedef __vector unsigned int   v4ui;
00035 typedef __vector __bool int     v4bi;
00036 
// Vector constants are needed in several functions, but declaring them as globals
// does not really work. Each USE_CONST_xxx macro therefore declares the constant(s)
// as const locals of the function that invokes it. A macro that builds on another
// constant declares that one as well (e.g. USE_CONST_v0f also declares v0i).

// Integer constants. vec_splat_s32/vec_splat_u32 only take a 5-bit literal.
#define USE_CONST_v0i     const v4i   v0i   = vec_splat_s32(0)
#define USE_CONST_v1i     const v4i   v1i   = vec_splat_s32(1)
// -16 is used as a shift/rotate count
#define USE_CONST_v16i_   const v4i   v16i_ = vec_splat_s32(-16)
// every bit set
#define USE_CONST_v1i_    const v4ui  v1i_  = vec_splat_u32(-1)

// Float constants derived from the integer ones.
// v0f: the all-zero bit pattern reinterpreted as float
#define USE_CONST_v0f     USE_CONST_v0i; const v4f v0f = (v4f) v0i
// v1f: integer 1 converted to float
#define USE_CONST_v1f     USE_CONST_v1i; const v4f v1f = vec_ctf(v1i, 0)
// v0f_: all-ones shifted left by itself, reinterpreted as float
#define USE_CONST_v0f_    USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_)
00047 
00048 template<> struct ei_packet_traits<float>  { typedef v4f type; enum {size=4}; };
00049 template<> struct ei_packet_traits<int>    { typedef v4i type; enum {size=4}; };
00050 
00051 template<> struct ei_unpacket_traits<v4f>  { typedef float  type; enum {size=4}; };
00052 template<> struct ei_unpacket_traits<v4i>  { typedef int    type; enum {size=4}; };
00053 
00054 inline std::ostream & operator <<(std::ostream & s, const v4f & v)
00055 {
00056   union {
00057     v4f   v;
00058     float n[4];
00059   } vt;
00060   vt.v = v;
00061   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00062   return s;
00063 }
00064 
00065 inline std::ostream & operator <<(std::ostream & s, const v4i & v)
00066 {
00067   union {
00068     v4i   v;
00069     int n[4];
00070   } vt;
00071   vt.v = v;
00072   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00073   return s;
00074 }
00075 
00076 inline std::ostream & operator <<(std::ostream & s, const v4ui & v)
00077 {
00078   union {
00079     v4ui   v;
00080     unsigned int n[4];
00081   } vt;
00082   vt.v = v;
00083   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00084   return s;
00085 }
00086 
00087 inline std::ostream & operator <<(std::ostream & s, const v4bi & v)
00088 {
00089   union {
00090     __vector __bool int v;
00091     unsigned int n[4];
00092   } vt;
00093   vt.v = v;
00094   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
00095   return s;
00096 }
00097 
00098 template<> inline v4f  ei_padd(const v4f&   a, const v4f&   b) { return vec_add(a,b); }
00099 template<> inline v4i  ei_padd(const v4i&   a, const v4i&   b) { return vec_add(a,b); }
00100 
00101 template<> inline v4f  ei_psub(const v4f&   a, const v4f&   b) { return vec_sub(a,b); }
00102 template<> inline v4i  ei_psub(const v4i&   a, const v4i&   b) { return vec_sub(a,b); }
00103 
00104 template<> inline v4f  ei_pmul(const v4f&   a, const v4f&   b) { USE_CONST_v0f; return vec_madd(a,b, v0f); }
00105 template<> inline v4i  ei_pmul(const v4i&   a, const v4i&   b)
00106 {
00107   // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
00108   //Set up constants, variables
00109   v4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
00110   USE_CONST_v0i;
00111   USE_CONST_v1i;
00112   USE_CONST_v16i_;
00113 
00114   // Get the absolute values 
00115   a1  = vec_abs(a);
00116   b1  = vec_abs(b);
00117 
00118   // Get the signs using xor
00119   v4bi sgn = (v4bi) vec_cmplt(vec_xor(a, b), v0i);
00120 
00121   // Do the multiplication for the asbolute values.
00122   bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ );
00123   low_prod = vec_mulo((__vector short)a1, (__vector short)b1);
00124   high_prod = vec_msum((__vector short)a1, (__vector short)bswap, v0i);
00125   high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_);
00126   prod = vec_add( low_prod, high_prod );
00127 
00128   // NOR the product and select only the negative elements according to the sign mask
00129   prod_ = vec_nor(prod, prod);
00130   prod_ = vec_sel(v0i, prod_, sgn);
00131 
00132   // Add 1 to the result to get the negative numbers
00133   v1sel = vec_sel(v0i, v1i, sgn);
00134   prod_ = vec_add(prod_, v1sel);
00135 
00136   // Merge the results back to the final vector.
00137   prod = vec_sel(prod, prod_, sgn);
00138 
00139   return prod;
00140 }
00141 
00142 template<> inline v4f  ei_pdiv(const v4f&   a, const v4f&   b) {
00143   v4f t, y_0, y_1, res;
00144   USE_CONST_v0f;
00145   USE_CONST_v1f;
00146 
00147   // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
00148   y_0 = vec_re(b);
00149   
00150   // Do one Newton-Raphson iteration to get the needed accuracy
00151   t = vec_nmsub(y_0, b, v1f);
00152   y_1 = vec_madd(y_0, t, y_0);
00153 
00154   res = vec_madd(a, y_1, v0f);
00155   return res;
00156 }
00157 
00158 template<> inline v4f  ei_pmadd(const v4f&  a, const v4f&   b, const v4f&  c) { return vec_madd(a, b, c); }
00159 
00160 template<> inline v4f  ei_pmin(const v4f&   a, const v4f&   b) { return vec_min(a,b); }
00161 template<> inline v4i  ei_pmin(const v4i&   a, const v4i&   b) { return vec_min(a,b); }
00162 
00163 template<> inline v4f  ei_pmax(const v4f&   a, const v4f&   b) { return vec_max(a,b); }
00164 template<> inline v4i  ei_pmax(const v4i&   a, const v4i&   b) { return vec_max(a,b); }
00165 
00166 template<> inline v4f  ei_pload(const float* from) { return vec_ld(0, from); }
00167 template<> inline v4i  ei_pload(const int*   from) { return vec_ld(0, from); }
00168 
00169 template<> inline v4f  ei_ploadu(const float*  from)
00170 {
00171   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
00172   __vector unsigned char MSQ, LSQ;
00173   __vector unsigned char mask;
00174   MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
00175   LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
00176   mask = vec_lvsl(0, from);                        // create the permute mask
00177   return (v4f) vec_perm(MSQ, LSQ, mask);           // align the data
00178 }
00179 
00180 template<> inline v4i    ei_ploadu(const int*    from)
00181 {
00182   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
00183   __vector unsigned char MSQ, LSQ;
00184   __vector unsigned char mask;
00185   MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
00186   LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
00187   mask = vec_lvsl(0, from);                        // create the permute mask
00188   return (v4i) vec_perm(MSQ, LSQ, mask);    // align the data
00189 }
00190 
00191 template<> inline v4f  ei_pset1(const float&  from)
00192 {
00193   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
00194   float __attribute__(aligned(16)) af[4];
00195   af[0] = from;
00196   v4f vc = vec_ld(0, af);
00197   vc = vec_splat(vc, 0);
00198   return vc;
00199 }
00200 
00201 template<> inline v4i    ei_pset1(const int&    from)
00202 {
00203   int __attribute__(aligned(16)) ai[4];
00204   ai[0] = from;
00205   v4i vc = vec_ld(0, ai);
00206   vc = vec_splat(vc, 0);
00207   return vc;
00208 }
00209 
00210 template<> inline void ei_pstore(float*   to, const v4f&   from) { vec_st(from, 0, to); }
00211 template<> inline void ei_pstore(int*     to, const v4i&   from) { vec_st(from, 0, to); }
00212 
00213 template<> inline void ei_pstoreu(float*  to, const v4f&   from)
00214 {
00215   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
00216   // Warning: not thread safe!
00217   __vector unsigned char MSQ, LSQ, edges;
00218   __vector unsigned char edgeAlign, align;
00219 
00220   MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
00221   LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
00222   edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
00223   edges=vec_perm(LSQ,MSQ,edgeAlign);                        // extract the edges
00224   align = vec_lvsr( 0, to );                                // permute map to misalign data
00225   MSQ = vec_perm(edges,(__vector unsigned char)from,align);   // misalign the data (MSQ)
00226   LSQ = vec_perm((__vector unsigned char)from,edges,align);   // misalign the data (LSQ)
00227   vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
00228   vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
00229 }
00230 
00231 template<> inline void ei_pstoreu(int*    to , const v4i&    from )
00232 {
00233   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
00234   // Warning: not thread safe!
00235   __vector unsigned char MSQ, LSQ, edges;
00236   __vector unsigned char edgeAlign, align;
00237 
00238   MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
00239   LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
00240   edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
00241   edges=vec_perm(LSQ,MSQ,edgeAlign);                        // extract the edges
00242   align = vec_lvsr( 0, to );                                // permute map to misalign data
00243   MSQ = vec_perm(edges,(__vector unsigned char)from,align);   // misalign the data (MSQ)
00244   LSQ = vec_perm((__vector unsigned char)from,edges,align);   // misalign the data (LSQ)
00245   vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
00246   vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
00247 }
00248 
00249 template<> inline float  ei_pfirst(const v4f&  a)
00250 {
00251   float __attribute__(aligned(16)) af[4];
00252   vec_st(a, 0, af);
00253   return af[0];
00254 }
00255 
00256 template<> inline int    ei_pfirst(const v4i&  a)
00257 {
00258   int __attribute__(aligned(16)) ai[4];
00259   vec_st(a, 0, ai);
00260   return ai[0];
00261 }
00262 
00263 inline v4f ei_preduxp(const v4f* vecs)
00264 {
00265   v4f v[4], sum[4];
00266 
00267   // It's easier and faster to transpose then add as columns
00268   // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
00269   // Do the transpose, first set of moves
00270   v[0] = vec_mergeh(vecs[0], vecs[2]);
00271   v[1] = vec_mergel(vecs[0], vecs[2]);
00272   v[2] = vec_mergeh(vecs[1], vecs[3]);
00273   v[3] = vec_mergel(vecs[1], vecs[3]);
00274   // Get the resulting vectors
00275   sum[0] = vec_mergeh(v[0], v[2]);
00276   sum[1] = vec_mergel(v[0], v[2]);
00277   sum[2] = vec_mergeh(v[1], v[3]);
00278   sum[3] = vec_mergel(v[1], v[3]);
00279 
00280   // Now do the summation:
00281   // Lines 0+1
00282   sum[0] = vec_add(sum[0], sum[1]);
00283   // Lines 2+3
00284   sum[1] = vec_add(sum[2], sum[3]);
00285   // Add the results
00286   sum[0] = vec_add(sum[0], sum[1]);
00287   return sum[0];
00288 }
00289 
00290 inline float ei_predux(const v4f& a)
00291 {
00292   v4f b, sum;
00293   b = (v4f)vec_sld(a, a, 8);
00294   sum = vec_add(a, b);
00295   b = (v4f)vec_sld(sum, sum, 4);
00296   sum = vec_add(sum, b);
00297   return ei_pfirst(sum);
00298 }
00299 
00300 inline v4i  ei_preduxp(const v4i* vecs)
00301 {
00302   v4i v[4], sum[4];
00303 
00304   // It's easier and faster to transpose then add as columns
00305   // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
00306   // Do the transpose, first set of moves
00307   v[0] = vec_mergeh(vecs[0], vecs[2]);
00308   v[1] = vec_mergel(vecs[0], vecs[2]);
00309   v[2] = vec_mergeh(vecs[1], vecs[3]);
00310   v[3] = vec_mergel(vecs[1], vecs[3]);
00311   // Get the resulting vectors
00312   sum[0] = vec_mergeh(v[0], v[2]);
00313   sum[1] = vec_mergel(v[0], v[2]);
00314   sum[2] = vec_mergeh(v[1], v[3]);
00315   sum[3] = vec_mergel(v[1], v[3]);
00316 
00317   // Now do the summation:
00318   // Lines 0+1
00319   sum[0] = vec_add(sum[0], sum[1]);
00320   // Lines 2+3
00321   sum[1] = vec_add(sum[2], sum[3]);
00322   // Add the results
00323   sum[0] = vec_add(sum[0], sum[1]);
00324   return sum[0];
00325 }
00326 
00327 inline int ei_predux(const v4i& a)
00328 {
00329   USE_CONST_v0i;
00330   v4i sum;
00331   sum = vec_sums(a, v0i);
00332   sum = vec_sld(sum, v0i, 12);
00333   return ei_pfirst(sum);
00334 }
00335 
00336 template<int Offset>
00337 struct ei_palign_impl<Offset, v4f>
00338 {
00339   inline static void run(v4f& first, const v4f& second)
00340   {
00341     first = vec_sld(first, second, Offset*4);
00342   }
00343 };
00344 
00345 template<int Offset>
00346 struct ei_palign_impl<Offset, v4i>
00347 {
00348   inline static void run(v4i& first, const v4i& second)
00349   {
00350     first = vec_sld(first, second, Offset*4);
00351   }
00352 };
00353 
00354 #endif // EIGEN_PACKET_MATH_ALTIVEC_H