GeneralBlockPanelKernel.h — Eigen internal header: traits, blocking-size computation, and the optimized general block-panel (GEBP) matrix-product kernel.
00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra.
00003 //
00004 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
00005 //
00006 // Eigen is free software; you can redistribute it and/or
00007 // modify it under the terms of the GNU Lesser General Public
00008 // License as published by the Free Software Foundation; either
00009 // version 3 of the License, or (at your option) any later version.
00010 //
00011 // Alternatively, you can redistribute it and/or
00012 // modify it under the terms of the GNU General Public License as
00013 // published by the Free Software Foundation; either version 2 of
00014 // the License, or (at your option) any later version.
00015 //
00016 // Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
00017 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00018 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
00019 // GNU General Public License for more details.
00020 //
00021 // You should have received a copy of the GNU Lesser General Public
00022 // License and a copy of the GNU General Public License along with
00023 // Eigen. If not, see <http://www.gnu.org/licenses/>.
00024 
00025 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
00026 #define EIGEN_GENERAL_BLOCK_PANEL_H
00027 
00028 namespace internal {
00029 
00030 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
00031 class gebp_traits;
00032 
00034 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
00035 {
00036   static std::ptrdiff_t m_l1CacheSize = 0;
00037   static std::ptrdiff_t m_l2CacheSize = 0;
00038   if(m_l1CacheSize==0)
00039   {
00040     m_l1CacheSize = queryL1CacheSize();
00041     m_l2CacheSize = queryTopLevelCacheSize();
00042 
00043     if(m_l1CacheSize<=0) m_l1CacheSize = 8 * 1024;
00044     if(m_l2CacheSize<=0) m_l2CacheSize = 1 * 1024 * 1024;
00045   }
00046 
00047   if(action==SetAction)
00048   {
00049     // set the cpu cache size and cache all block sizes from a global cache size in byte
00050     eigen_internal_assert(l1!=0 && l2!=0);
00051     m_l1CacheSize = *l1;
00052     m_l2CacheSize = *l2;
00053   }
00054   else if(action==GetAction)
00055   {
00056     eigen_internal_assert(l1!=0 && l2!=0);
00057     *l1 = m_l1CacheSize;
00058     *l2 = m_l2CacheSize;
00059   }
00060   else
00061   {
00062     eigen_internal_assert(false);
00063   }
00064 }
00065 
00081 template<typename LhsScalar, typename RhsScalar, int KcFactor>
00082 void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
00083 {
00084   // Explanations:
00085   // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
00086   // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
00087   // per kc x nr vertical small panels where nr is the blocking size along the n dimension
00088   // at the register level. For vectorization purpose, these small vertical panels are unpacked,
00089   // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
00090   // stay in L1 cache.
00091   std::ptrdiff_t l1, l2;
00092 
00093   typedef gebp_traits<LhsScalar,RhsScalar> Traits;
00094   enum {
00095     kdiv = KcFactor * 2 * Traits::nr
00096          * Traits::RhsProgress * sizeof(RhsScalar),
00097     mr = gebp_traits<LhsScalar,RhsScalar>::mr,
00098     mr_mask = (0xffffffff/mr)*mr
00099   };
00100 
00101   manage_caching_sizes(GetAction, &l1, &l2);
00102   k = std::min<std::ptrdiff_t>(k, l1/kdiv);
00103   std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
00104   if(_m<m) m = _m & mr_mask;
00105   n = n;
00106 }
00107 
/** \internal Convenience overload using the default KcFactor of 1. */
template<typename LhsScalar, typename RhsScalar>
inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  // Forward to the general version with no extra shrinking of kc.
  computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(k, m, n);
}
00113 
00114 #ifdef EIGEN_HAS_FUSE_CJMADD
00115   #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
00116 #else
00117 
00118   // FIXME (a bit overkill maybe ?)
00119 
00120   template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
00121     EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
00122     {
00123       c = cj.pmadd(a,b,c);
00124     }
00125   };
00126 
00127   template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
00128     EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t)
00129     {
00130       t = b; t = cj.pmul(a,t); c = padd(c,t);
00131     }
00132   };
00133 
00134   template<typename CJ, typename A, typename B, typename C, typename T>
00135   EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
00136   {
00137     gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
00138   }
00139 
00140   #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
00141 //   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
00142 #endif
00143 
00144 /* Vectorization logic
00145  *  real*real: unpack rhs to constant packets, ...
00146  * 
00147  *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
00148  *          storing each res packet into two packets (2x2),
00149  *          at the end combine them: swap the second and addsub them 
00150  *  cf*cf : same but with 2x4 blocks
00151  *  cplx*real : unpack rhs to constant packets, ...
00152  *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
00153  */
00154 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
00155 class gebp_traits
00156 {
00157 public:
00158   typedef _LhsScalar LhsScalar;
00159   typedef _RhsScalar RhsScalar;
00160   typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00161 
00162   enum {
00163     ConjLhs = _ConjLhs,
00164     ConjRhs = _ConjRhs,
00165     Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
00166     LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00167     RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00168     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
00169     
00170     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
00171 
00172     // register block size along the N direction (must be either 2 or 4)
00173     nr = NumberOfRegisters/4,
00174 
00175     // register block size along the M direction (currently, this one cannot be modified)
00176     mr = 2 * LhsPacketSize,
00177     
00178     WorkSpaceFactor = nr * RhsPacketSize,
00179 
00180     LhsProgress = LhsPacketSize,
00181     RhsProgress = RhsPacketSize
00182   };
00183 
00184   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
00185   typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
00186   typedef typename packet_traits<ResScalar>::type  _ResPacket;
00187 
00188   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00189   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00190   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00191 
00192   typedef ResPacket AccPacket;
00193   
00194   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
00195   {
00196     p = pset1<ResPacket>(ResScalar(0));
00197   }
00198 
00199   EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
00200   {
00201     for(DenseIndex k=0; k<n; k++)
00202       pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
00203   }
00204 
00205   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
00206   {
00207     dest = pload<RhsPacket>(b);
00208   }
00209 
00210   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
00211   {
00212     dest = pload<LhsPacket>(a);
00213   }
00214 
00215   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
00216   {
00217     tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
00218   }
00219 
00220   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
00221   {
00222     r = pmadd(c,alpha,r);
00223   }
00224 
00225 protected:
00226 //   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
00227 //   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
00228 };
00229 
00230 template<typename RealScalar, bool _ConjLhs>
00231 class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
00232 {
00233 public:
00234   typedef std::complex<RealScalar> LhsScalar;
00235   typedef RealScalar RhsScalar;
00236   typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00237 
00238   enum {
00239     ConjLhs = _ConjLhs,
00240     ConjRhs = false,
00241     Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
00242     LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00243     RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00244     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
00245     
00246     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
00247     nr = NumberOfRegisters/4,
00248     mr = 2 * LhsPacketSize,
00249     WorkSpaceFactor = nr*RhsPacketSize,
00250 
00251     LhsProgress = LhsPacketSize,
00252     RhsProgress = RhsPacketSize
00253   };
00254 
00255   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
00256   typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
00257   typedef typename packet_traits<ResScalar>::type  _ResPacket;
00258 
00259   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00260   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00261   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00262 
00263   typedef ResPacket AccPacket;
00264 
00265   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
00266   {
00267     p = pset1<ResPacket>(ResScalar(0));
00268   }
00269 
00270   EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
00271   {
00272     for(DenseIndex k=0; k<n; k++)
00273       pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
00274   }
00275 
00276   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
00277   {
00278     dest = pload<RhsPacket>(b);
00279   }
00280 
00281   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
00282   {
00283     dest = pload<LhsPacket>(a);
00284   }
00285 
00286   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
00287   {
00288     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
00289   }
00290 
00291   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
00292   {
00293     tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
00294   }
00295 
00296   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
00297   {
00298     c += a * b;
00299   }
00300 
00301   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
00302   {
00303     r = cj.pmadd(c,alpha,r);
00304   }
00305 
00306 protected:
00307   conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
00308 };
00309 
00310 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
00311 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
00312 {
00313 public:
00314   typedef std::complex<RealScalar>  Scalar;
00315   typedef std::complex<RealScalar>  LhsScalar;
00316   typedef std::complex<RealScalar>  RhsScalar;
00317   typedef std::complex<RealScalar>  ResScalar;
00318   
00319   enum {
00320     ConjLhs = _ConjLhs,
00321     ConjRhs = _ConjRhs,
00322     Vectorizable = packet_traits<RealScalar>::Vectorizable
00323                 && packet_traits<Scalar>::Vectorizable,
00324     RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
00325     ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
00326     
00327     nr = 2,
00328     mr = 2 * ResPacketSize,
00329     WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
00330 
00331     LhsProgress = ResPacketSize,
00332     RhsProgress = Vectorizable ? 2*ResPacketSize : 1
00333   };
00334   
00335   typedef typename packet_traits<RealScalar>::type RealPacket;
00336   typedef typename packet_traits<Scalar>::type     ScalarPacket;
00337   struct DoublePacket
00338   {
00339     RealPacket first;
00340     RealPacket second;
00341   };
00342 
00343   typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
00344   typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
00345   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
00346   typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;
00347   
00348   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
00349 
00350   EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
00351   {
00352     p.first   = pset1<RealPacket>(RealScalar(0));
00353     p.second  = pset1<RealPacket>(RealScalar(0));
00354   }
00355 
00356   /* Unpack the rhs coeff such that each complex coefficient is spread into
00357    * two packects containing respectively the real and imaginary coefficient
00358    * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y]
00359    */
00360   EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
00361   {
00362     for(DenseIndex k=0; k<n; k++)
00363     {
00364       if(Vectorizable)
00365       {
00366         pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
00367         pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
00368       }
00369       else
00370         b[k] = rhs[k];
00371     }
00372   }
00373 
00374   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }
00375 
00376   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
00377   {
00378     dest.first  = pload<RealPacket>((const RealScalar*)b);
00379     dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
00380   }
00381 
00382   // nothing special here
00383   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
00384   {
00385     dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
00386   }
00387 
00388   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
00389   {
00390     c.first   = padd(pmul(a,b.first), c.first);
00391     c.second  = padd(pmul(a,b.second),c.second);
00392   }
00393 
00394   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
00395   {
00396     c = cj.pmadd(a,b,c);
00397   }
00398   
00399   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
00400   
00401   EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
00402   {
00403     // assemble c
00404     ResPacket tmp;
00405     if((!ConjLhs)&&(!ConjRhs))
00406     {
00407       tmp = pcplxflip(pconj(ResPacket(c.second)));
00408       tmp = padd(ResPacket(c.first),tmp);
00409     }
00410     else if((!ConjLhs)&&(ConjRhs))
00411     {
00412       tmp = pconj(pcplxflip(ResPacket(c.second)));
00413       tmp = padd(ResPacket(c.first),tmp);
00414     }
00415     else if((ConjLhs)&&(!ConjRhs))
00416     {
00417       tmp = pcplxflip(ResPacket(c.second));
00418       tmp = padd(pconj(ResPacket(c.first)),tmp);
00419     }
00420     else if((ConjLhs)&&(ConjRhs))
00421     {
00422       tmp = pcplxflip(ResPacket(c.second));
00423       tmp = psub(pconj(ResPacket(c.first)),tmp);
00424     }
00425     
00426     r = pmadd(tmp,alpha,r);
00427   }
00428 
00429 protected:
00430   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
00431 };
00432 
00433 template<typename RealScalar, bool _ConjRhs>
00434 class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
00435 {
00436 public:
00437   typedef std::complex<RealScalar>  Scalar;
00438   typedef RealScalar  LhsScalar;
00439   typedef Scalar      RhsScalar;
00440   typedef Scalar      ResScalar;
00441 
00442   enum {
00443     ConjLhs = false,
00444     ConjRhs = _ConjRhs,
00445     Vectorizable = packet_traits<RealScalar>::Vectorizable
00446                 && packet_traits<Scalar>::Vectorizable,
00447     LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00448     RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00449     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
00450     
00451     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
00452     nr = 4,
00453     mr = 2*ResPacketSize,
00454     WorkSpaceFactor = nr*RhsPacketSize,
00455 
00456     LhsProgress = ResPacketSize,
00457     RhsProgress = ResPacketSize
00458   };
00459 
00460   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
00461   typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
00462   typedef typename packet_traits<ResScalar>::type  _ResPacket;
00463 
00464   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00465   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00466   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00467 
00468   typedef ResPacket AccPacket;
00469 
00470   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
00471   {
00472     p = pset1<ResPacket>(ResScalar(0));
00473   }
00474 
00475   EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
00476   {
00477     for(DenseIndex k=0; k<n; k++)
00478       pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
00479   }
00480 
00481   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
00482   {
00483     dest = pload<RhsPacket>(b);
00484   }
00485 
00486   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
00487   {
00488     dest = ploaddup<LhsPacket>(a);
00489   }
00490 
00491   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
00492   {
00493     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
00494   }
00495 
00496   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
00497   {
00498     tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
00499   }
00500 
00501   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
00502   {
00503     c += a * b;
00504   }
00505 
00506   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
00507   {
00508     r = cj.pmadd(alpha,c,r);
00509   }
00510 
00511 protected:
00512   conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
00513 };
00514 
00515 /* optimized GEneral packed Block * packed Panel product kernel
00516  *
00517  * Mixing type logic: C += A * B
00518  *  |  A  |  B  | comments
00519  *  |real |cplx | no vectorization yet, would require to pack A with duplication
00520  *  |cplx |real | easy vectorization
00521  */
00522 template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
00523 struct gebp_kernel
00524 {
00525   typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
00526   typedef typename Traits::ResScalar ResScalar;
00527   typedef typename Traits::LhsPacket LhsPacket;
00528   typedef typename Traits::RhsPacket RhsPacket;
00529   typedef typename Traits::ResPacket ResPacket;
00530   typedef typename Traits::AccPacket AccPacket;
00531 
00532   enum {
00533     Vectorizable  = Traits::Vectorizable,
00534     LhsProgress   = Traits::LhsProgress,
00535     RhsProgress   = Traits::RhsProgress,
00536     ResPacketSize = Traits::ResPacketSize
00537   };
00538 
00539   EIGEN_FLATTEN_ATTRIB
00540   void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
00541                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
00542   {
00543     Traits traits;
00544     
00545     if(strideA==-1) strideA = depth;
00546     if(strideB==-1) strideB = depth;
00547     conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00548 //     conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00549     Index packet_cols = (cols/nr) * nr;
00550     const Index peeled_mc = (rows/mr)*mr;
00551     // FIXME:
00552     const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
00553     const Index peeled_kc = (depth/4)*4;
00554 
00555     if(unpackedB==0)
00556       unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
00557 
00558     // loops on each micro vertical panel of rhs (depth x nr)
00559     for(Index j2=0; j2<packet_cols; j2+=nr)
00560     {
00561       traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); 
00562 
00563       // loops on each largest micro horizontal panel of lhs (mr x depth)
00564       // => we select a mr x nr micro block of res which is entirely
00565       //    stored into mr/packet_size x nr registers.
00566       for(Index i=0; i<peeled_mc; i+=mr)
00567       {
00568         const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
00569         prefetch(&blA[0]);
00570 
00571         // gets res block as register
00572         AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
00573                   traits.initAcc(C0);
00574                   traits.initAcc(C1);
00575         if(nr==4) traits.initAcc(C2);
00576         if(nr==4) traits.initAcc(C3);
00577                   traits.initAcc(C4);
00578                   traits.initAcc(C5);
00579         if(nr==4) traits.initAcc(C6);
00580         if(nr==4) traits.initAcc(C7);
00581 
00582         ResScalar* r0 = &res[(j2+0)*resStride + i];
00583         ResScalar* r1 = r0 + resStride;
00584         ResScalar* r2 = r1 + resStride;
00585         ResScalar* r3 = r2 + resStride;
00586 
00587         prefetch(r0+16);
00588         prefetch(r1+16);
00589         prefetch(r2+16);
00590         prefetch(r3+16);
00591 
00592         // performs "inner" product
00593         // TODO let's check wether the folowing peeled loop could not be
00594         //      optimized via optimal prefetching from one loop to the other
00595         const RhsScalar* blB = unpackedB;
00596         for(Index k=0; k<peeled_kc; k+=4)
00597         {
00598           if(nr==2)
00599           {
00600             LhsPacket A0, A1;
00601             RhsPacket B0;
00602             RhsPacket T0;
00603             
00604 EIGEN_ASM_COMMENT("mybegin2");
00605             traits.loadLhs(&blA[0*LhsProgress], A0);
00606             traits.loadLhs(&blA[1*LhsProgress], A1);
00607             traits.loadRhs(&blB[0*RhsProgress], B0);
00608             traits.madd(A0,B0,C0,T0);
00609             traits.madd(A1,B0,C4,B0);
00610             traits.loadRhs(&blB[1*RhsProgress], B0);
00611             traits.madd(A0,B0,C1,T0);
00612             traits.madd(A1,B0,C5,B0);
00613 
00614             traits.loadLhs(&blA[2*LhsProgress], A0);
00615             traits.loadLhs(&blA[3*LhsProgress], A1);
00616             traits.loadRhs(&blB[2*RhsProgress], B0);
00617             traits.madd(A0,B0,C0,T0);
00618             traits.madd(A1,B0,C4,B0);
00619             traits.loadRhs(&blB[3*RhsProgress], B0);
00620             traits.madd(A0,B0,C1,T0);
00621             traits.madd(A1,B0,C5,B0);
00622 
00623             traits.loadLhs(&blA[4*LhsProgress], A0);
00624             traits.loadLhs(&blA[5*LhsProgress], A1);
00625             traits.loadRhs(&blB[4*RhsProgress], B0);
00626             traits.madd(A0,B0,C0,T0);
00627             traits.madd(A1,B0,C4,B0);
00628             traits.loadRhs(&blB[5*RhsProgress], B0);
00629             traits.madd(A0,B0,C1,T0);
00630             traits.madd(A1,B0,C5,B0);
00631 
00632             traits.loadLhs(&blA[6*LhsProgress], A0);
00633             traits.loadLhs(&blA[7*LhsProgress], A1);
00634             traits.loadRhs(&blB[6*RhsProgress], B0);
00635             traits.madd(A0,B0,C0,T0);
00636             traits.madd(A1,B0,C4,B0);
00637             traits.loadRhs(&blB[7*RhsProgress], B0);
00638             traits.madd(A0,B0,C1,T0);
00639             traits.madd(A1,B0,C5,B0);
00640 EIGEN_ASM_COMMENT("myend");
00641           }
00642           else
00643           {
00644 EIGEN_ASM_COMMENT("mybegin4");
00645             LhsPacket A0, A1;
00646             RhsPacket B0, B1, B2, B3;
00647             RhsPacket T0;
00648             
00649             traits.loadLhs(&blA[0*LhsProgress], A0);
00650             traits.loadLhs(&blA[1*LhsProgress], A1);
00651             traits.loadRhs(&blB[0*RhsProgress], B0);
00652             traits.loadRhs(&blB[1*RhsProgress], B1);
00653 
00654             traits.madd(A0,B0,C0,T0);
00655             traits.loadRhs(&blB[2*RhsProgress], B2);
00656             traits.madd(A1,B0,C4,B0);
00657             traits.loadRhs(&blB[3*RhsProgress], B3);
00658             traits.loadRhs(&blB[4*RhsProgress], B0);
00659             traits.madd(A0,B1,C1,T0);
00660             traits.madd(A1,B1,C5,B1);
00661             traits.loadRhs(&blB[5*RhsProgress], B1);
00662             traits.madd(A0,B2,C2,T0);
00663             traits.madd(A1,B2,C6,B2);
00664             traits.loadRhs(&blB[6*RhsProgress], B2);
00665             traits.madd(A0,B3,C3,T0);
00666             traits.loadLhs(&blA[2*LhsProgress], A0);
00667             traits.madd(A1,B3,C7,B3);
00668             traits.loadLhs(&blA[3*LhsProgress], A1);
00669             traits.loadRhs(&blB[7*RhsProgress], B3);
00670             traits.madd(A0,B0,C0,T0);
00671             traits.madd(A1,B0,C4,B0);
00672             traits.loadRhs(&blB[8*RhsProgress], B0);
00673             traits.madd(A0,B1,C1,T0);
00674             traits.madd(A1,B1,C5,B1);
00675             traits.loadRhs(&blB[9*RhsProgress], B1);
00676             traits.madd(A0,B2,C2,T0);
00677             traits.madd(A1,B2,C6,B2);
00678             traits.loadRhs(&blB[10*RhsProgress], B2);
00679             traits.madd(A0,B3,C3,T0);
00680             traits.loadLhs(&blA[4*LhsProgress], A0);
00681             traits.madd(A1,B3,C7,B3);
00682             traits.loadLhs(&blA[5*LhsProgress], A1);
00683             traits.loadRhs(&blB[11*RhsProgress], B3);
00684 
00685             traits.madd(A0,B0,C0,T0);
00686             traits.madd(A1,B0,C4,B0);
00687             traits.loadRhs(&blB[12*RhsProgress], B0);
00688             traits.madd(A0,B1,C1,T0);
00689             traits.madd(A1,B1,C5,B1);
00690             traits.loadRhs(&blB[13*RhsProgress], B1);
00691             traits.madd(A0,B2,C2,T0);
00692             traits.madd(A1,B2,C6,B2);
00693             traits.loadRhs(&blB[14*RhsProgress], B2);
00694             traits.madd(A0,B3,C3,T0);
00695             traits.loadLhs(&blA[6*LhsProgress], A0);
00696             traits.madd(A1,B3,C7,B3);
00697             traits.loadLhs(&blA[7*LhsProgress], A1);
00698             traits.loadRhs(&blB[15*RhsProgress], B3);
00699             traits.madd(A0,B0,C0,T0);
00700             traits.madd(A1,B0,C4,B0);
00701             traits.madd(A0,B1,C1,T0);
00702             traits.madd(A1,B1,C5,B1);
00703             traits.madd(A0,B2,C2,T0);
00704             traits.madd(A1,B2,C6,B2);
00705             traits.madd(A0,B3,C3,T0);
00706             traits.madd(A1,B3,C7,B3);
00707           }
00708 
00709           blB += 4*nr*RhsProgress;
00710           blA += 4*mr;
00711         }
00712         // process remaining peeled loop
00713         for(Index k=peeled_kc; k<depth; k++)
00714         {
00715           if(nr==2)
00716           {
00717             LhsPacket A0, A1;
00718             RhsPacket B0;
00719             RhsPacket T0;
00720 
00721             traits.loadLhs(&blA[0*LhsProgress], A0);
00722             traits.loadLhs(&blA[1*LhsProgress], A1);
00723             traits.loadRhs(&blB[0*RhsProgress], B0);
00724             traits.madd(A0,B0,C0,T0);
00725             traits.madd(A1,B0,C4,B0);
00726             traits.loadRhs(&blB[1*RhsProgress], B0);
00727             traits.madd(A0,B0,C1,T0);
00728             traits.madd(A1,B0,C5,B0);
00729           }
00730           else
00731           {
00732             LhsPacket A0, A1;
00733             RhsPacket B0, B1, B2, B3;
00734             RhsPacket T0;
00735 
00736             traits.loadLhs(&blA[0*LhsProgress], A0);
00737             traits.loadLhs(&blA[1*LhsProgress], A1);
00738             traits.loadRhs(&blB[0*RhsProgress], B0);
00739             traits.loadRhs(&blB[1*RhsProgress], B1);
00740 
00741             traits.madd(A0,B0,C0,T0);
00742             traits.loadRhs(&blB[2*RhsProgress], B2);
00743             traits.madd(A1,B0,C4,B0);
00744             traits.loadRhs(&blB[3*RhsProgress], B3);
00745             traits.madd(A0,B1,C1,T0);
00746             traits.madd(A1,B1,C5,B1);
00747             traits.madd(A0,B2,C2,T0);
00748             traits.madd(A1,B2,C6,B2);
00749             traits.madd(A0,B3,C3,T0);
00750             traits.madd(A1,B3,C7,B3);
00751           }
00752 
00753           blB += nr*RhsProgress;
00754           blA += mr;
00755         }
00756 
00757         if(nr==4)
00758         {
00759           ResPacket R0, R1, R2, R3, R4, R5, R6;
00760           ResPacket alphav = pset1<ResPacket>(alpha);
00761 
00762           R0 = ploadu<ResPacket>(r0);
00763           R1 = ploadu<ResPacket>(r1);
00764           R2 = ploadu<ResPacket>(r2);
00765           R3 = ploadu<ResPacket>(r3);
00766           R4 = ploadu<ResPacket>(r0 + ResPacketSize);
00767           R5 = ploadu<ResPacket>(r1 + ResPacketSize);
00768           R6 = ploadu<ResPacket>(r2 + ResPacketSize);
00769           traits.acc(C0, alphav, R0);
00770           pstoreu(r0, R0);
00771           R0 = ploadu<ResPacket>(r3 + ResPacketSize);
00772 
00773           traits.acc(C1, alphav, R1);
00774           traits.acc(C2, alphav, R2);
00775           traits.acc(C3, alphav, R3);
00776           traits.acc(C4, alphav, R4);
00777           traits.acc(C5, alphav, R5);
00778           traits.acc(C6, alphav, R6);
00779           traits.acc(C7, alphav, R0);
00780           
00781           pstoreu(r1, R1);
00782           pstoreu(r2, R2);
00783           pstoreu(r3, R3);
00784           pstoreu(r0 + ResPacketSize, R4);
00785           pstoreu(r1 + ResPacketSize, R5);
00786           pstoreu(r2 + ResPacketSize, R6);
00787           pstoreu(r3 + ResPacketSize, R0);
00788         }
00789         else
00790         {
00791           ResPacket R0, R1, R4;
00792           ResPacket alphav = pset1<ResPacket>(alpha);
00793 
00794           R0 = ploadu<ResPacket>(r0);
00795           R1 = ploadu<ResPacket>(r1);
00796           R4 = ploadu<ResPacket>(r0 + ResPacketSize);
00797           traits.acc(C0, alphav, R0);
00798           pstoreu(r0, R0);
00799           R0 = ploadu<ResPacket>(r1 + ResPacketSize);
00800           traits.acc(C1, alphav, R1);
00801           traits.acc(C4, alphav, R4);
00802           traits.acc(C5, alphav, R0);
00803           pstoreu(r1, R1);
00804           pstoreu(r0 + ResPacketSize, R4);
00805           pstoreu(r1 + ResPacketSize, R0);
00806         }
00807         
00808       }
00809       
00810       if(rows-peeled_mc>=LhsProgress)
00811       {
00812         Index i = peeled_mc;
00813         const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
00814         prefetch(&blA[0]);
00815 
00816         // gets res block as register
00817         AccPacket C0, C1, C2, C3;
00818                   traits.initAcc(C0);
00819                   traits.initAcc(C1);
00820         if(nr==4) traits.initAcc(C2);
00821         if(nr==4) traits.initAcc(C3);
00822 
00823         // performs "inner" product
00824         const RhsScalar* blB = unpackedB;
00825         for(Index k=0; k<peeled_kc; k+=4)
00826         {
00827           if(nr==2)
00828           {
00829             LhsPacket A0;
00830             RhsPacket B0, B1;
00831 
00832             traits.loadLhs(&blA[0*LhsProgress], A0);
00833             traits.loadRhs(&blB[0*RhsProgress], B0);
00834             traits.loadRhs(&blB[1*RhsProgress], B1);
00835             traits.madd(A0,B0,C0,B0);
00836             traits.loadRhs(&blB[2*RhsProgress], B0);
00837             traits.madd(A0,B1,C1,B1);
00838             traits.loadLhs(&blA[1*LhsProgress], A0);
00839             traits.loadRhs(&blB[3*RhsProgress], B1);
00840             traits.madd(A0,B0,C0,B0);
00841             traits.loadRhs(&blB[4*RhsProgress], B0);
00842             traits.madd(A0,B1,C1,B1);
00843             traits.loadLhs(&blA[2*LhsProgress], A0);
00844             traits.loadRhs(&blB[5*RhsProgress], B1);
00845             traits.madd(A0,B0,C0,B0);
00846             traits.loadRhs(&blB[6*RhsProgress], B0);
00847             traits.madd(A0,B1,C1,B1);
00848             traits.loadLhs(&blA[3*LhsProgress], A0);
00849             traits.loadRhs(&blB[7*RhsProgress], B1);
00850             traits.madd(A0,B0,C0,B0);
00851             traits.madd(A0,B1,C1,B1);
00852           }
00853           else
00854           {
00855             LhsPacket A0;
00856             RhsPacket B0, B1, B2, B3;
00857 
00858             traits.loadLhs(&blA[0*LhsProgress], A0);
00859             traits.loadRhs(&blB[0*RhsProgress], B0);
00860             traits.loadRhs(&blB[1*RhsProgress], B1);
00861 
00862             traits.madd(A0,B0,C0,B0);
00863             traits.loadRhs(&blB[2*RhsProgress], B2);
00864             traits.loadRhs(&blB[3*RhsProgress], B3);
00865             traits.loadRhs(&blB[4*RhsProgress], B0);
00866             traits.madd(A0,B1,C1,B1);
00867             traits.loadRhs(&blB[5*RhsProgress], B1);
00868             traits.madd(A0,B2,C2,B2);
00869             traits.loadRhs(&blB[6*RhsProgress], B2);
00870             traits.madd(A0,B3,C3,B3);
00871             traits.loadLhs(&blA[1*LhsProgress], A0);
00872             traits.loadRhs(&blB[7*RhsProgress], B3);
00873             traits.madd(A0,B0,C0,B0);
00874             traits.loadRhs(&blB[8*RhsProgress], B0);
00875             traits.madd(A0,B1,C1,B1);
00876             traits.loadRhs(&blB[9*RhsProgress], B1);
00877             traits.madd(A0,B2,C2,B2);
00878             traits.loadRhs(&blB[10*RhsProgress], B2);
00879             traits.madd(A0,B3,C3,B3);
00880             traits.loadLhs(&blA[2*LhsProgress], A0);
00881             traits.loadRhs(&blB[11*RhsProgress], B3);
00882 
00883             traits.madd(A0,B0,C0,B0);
00884             traits.loadRhs(&blB[12*RhsProgress], B0);
00885             traits.madd(A0,B1,C1,B1);
00886             traits.loadRhs(&blB[13*RhsProgress], B1);
00887             traits.madd(A0,B2,C2,B2);
00888             traits.loadRhs(&blB[14*RhsProgress], B2);
00889             traits.madd(A0,B3,C3,B3);
00890 
00891             traits.loadLhs(&blA[3*LhsProgress], A0);
00892             traits.loadRhs(&blB[15*RhsProgress], B3);
00893             traits.madd(A0,B0,C0,B0);
00894             traits.madd(A0,B1,C1,B1);
00895             traits.madd(A0,B2,C2,B2);
00896             traits.madd(A0,B3,C3,B3);
00897           }
00898 
00899           blB += nr*4*RhsProgress;
00900           blA += 4*LhsProgress;
00901         }
00902         // process remaining peeled loop
00903         for(Index k=peeled_kc; k<depth; k++)
00904         {
00905           if(nr==2)
00906           {
00907             LhsPacket A0;
00908             RhsPacket B0, B1;
00909 
00910             traits.loadLhs(&blA[0*LhsProgress], A0);
00911             traits.loadRhs(&blB[0*RhsProgress], B0);
00912             traits.loadRhs(&blB[1*RhsProgress], B1);
00913             traits.madd(A0,B0,C0,B0);
00914             traits.madd(A0,B1,C1,B1);
00915           }
00916           else
00917           {
00918             LhsPacket A0;
00919             RhsPacket B0, B1, B2, B3;
00920 
00921             traits.loadLhs(&blA[0*LhsProgress], A0);
00922             traits.loadRhs(&blB[0*RhsProgress], B0);
00923             traits.loadRhs(&blB[1*RhsProgress], B1);
00924             traits.loadRhs(&blB[2*RhsProgress], B2);
00925             traits.loadRhs(&blB[3*RhsProgress], B3);
00926 
00927             traits.madd(A0,B0,C0,B0);
00928             traits.madd(A0,B1,C1,B1);
00929             traits.madd(A0,B2,C2,B2);
00930             traits.madd(A0,B3,C3,B3);
00931           }
00932 
00933           blB += nr*RhsProgress;
00934           blA += LhsProgress;
00935         }
00936 
00937         ResPacket R0, R1, R2, R3;
00938         ResPacket alphav = pset1<ResPacket>(alpha);
00939 
00940         ResScalar* r0 = &res[(j2+0)*resStride + i];
00941         ResScalar* r1 = r0 + resStride;
00942         ResScalar* r2 = r1 + resStride;
00943         ResScalar* r3 = r2 + resStride;
00944 
00945                   R0 = ploadu<ResPacket>(r0);
00946                   R1 = ploadu<ResPacket>(r1);
00947         if(nr==4) R2 = ploadu<ResPacket>(r2);
00948         if(nr==4) R3 = ploadu<ResPacket>(r3);
00949 
00950                   traits.acc(C0, alphav, R0);
00951                   traits.acc(C1, alphav, R1);
00952         if(nr==4) traits.acc(C2, alphav, R2);
00953         if(nr==4) traits.acc(C3, alphav, R3);
00954 
00955                   pstoreu(r0, R0);
00956                   pstoreu(r1, R1);
00957         if(nr==4) pstoreu(r2, R2);
00958         if(nr==4) pstoreu(r3, R3);
00959       }
00960       for(Index i=peeled_mc2; i<rows; i++)
00961       {
00962         const LhsScalar* blA = &blockA[i*strideA+offsetA];
00963         prefetch(&blA[0]);
00964 
00965         // gets a 1 x nr res block as registers
00966         ResScalar C0(0), C1(0), C2(0), C3(0);
00967         // TODO directly use blockB ???
00968         const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
00969         for(Index k=0; k<depth; k++)
00970         {
00971           if(nr==2)
00972           {
00973             LhsScalar A0;
00974             RhsScalar B0, B1;
00975 
00976             A0 = blA[k];
00977             B0 = blB[0];
00978             B1 = blB[1];
00979             MADD(cj,A0,B0,C0,B0);
00980             MADD(cj,A0,B1,C1,B1);
00981           }
00982           else
00983           {
00984             LhsScalar A0;
00985             RhsScalar B0, B1, B2, B3;
00986 
00987             A0 = blA[k];
00988             B0 = blB[0];
00989             B1 = blB[1];
00990             B2 = blB[2];
00991             B3 = blB[3];
00992 
00993             MADD(cj,A0,B0,C0,B0);
00994             MADD(cj,A0,B1,C1,B1);
00995             MADD(cj,A0,B2,C2,B2);
00996             MADD(cj,A0,B3,C3,B3);
00997           }
00998 
00999           blB += nr;
01000         }
01001                   res[(j2+0)*resStride + i] += alpha*C0;
01002                   res[(j2+1)*resStride + i] += alpha*C1;
01003         if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
01004         if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
01005       }
01006     }
01007     // process remaining rhs/res columns one at a time
01008     // => do the same but with nr==1
01009     for(Index j2=packet_cols; j2<cols; j2++)
01010     {
01011       // unpack B
01012       traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
01013 
01014       for(Index i=0; i<peeled_mc; i+=mr)
01015       {
01016         const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
01017         prefetch(&blA[0]);
01018 
01019         // TODO move the res loads to the stores
01020 
01021         // get res block as registers
01022         AccPacket C0, C4;
01023         traits.initAcc(C0);
01024         traits.initAcc(C4);
01025 
01026         const RhsScalar* blB = unpackedB;
01027         for(Index k=0; k<depth; k++)
01028         {
01029           LhsPacket A0, A1;
01030           RhsPacket B0;
01031           RhsPacket T0;
01032 
01033           traits.loadLhs(&blA[0*LhsProgress], A0);
01034           traits.loadLhs(&blA[1*LhsProgress], A1);
01035           traits.loadRhs(&blB[0*RhsProgress], B0);
01036           traits.madd(A0,B0,C0,T0);
01037           traits.madd(A1,B0,C4,B0);
01038 
01039           blB += RhsProgress;
01040           blA += 2*LhsProgress;
01041         }
01042         ResPacket R0, R4;
01043         ResPacket alphav = pset1<ResPacket>(alpha);
01044 
01045         ResScalar* r0 = &res[(j2+0)*resStride + i];
01046 
01047         R0 = ploadu<ResPacket>(r0);
01048         R4 = ploadu<ResPacket>(r0+ResPacketSize);
01049 
01050         traits.acc(C0, alphav, R0);
01051         traits.acc(C4, alphav, R4);
01052 
01053         pstoreu(r0,               R0);
01054         pstoreu(r0+ResPacketSize, R4);
01055       }
01056       if(rows-peeled_mc>=LhsProgress)
01057       {
01058         Index i = peeled_mc;
01059         const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
01060         prefetch(&blA[0]);
01061 
01062         AccPacket C0;
01063         traits.initAcc(C0);
01064 
01065         const RhsScalar* blB = unpackedB;
01066         for(Index k=0; k<depth; k++)
01067         {
01068           LhsPacket A0;
01069           RhsPacket B0;
01070           traits.loadLhs(blA, A0);
01071           traits.loadRhs(blB, B0);
01072           traits.madd(A0, B0, C0, B0);
01073           blB += RhsProgress;
01074           blA += LhsProgress;
01075         }
01076 
01077         ResPacket alphav = pset1<ResPacket>(alpha);
01078         ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
01079         traits.acc(C0, alphav, R0);
01080         pstoreu(&res[(j2+0)*resStride + i], R0);
01081       }
01082       for(Index i=peeled_mc2; i<rows; i++)
01083       {
01084         const LhsScalar* blA = &blockA[i*strideA+offsetA];
01085         prefetch(&blA[0]);
01086 
01087         // gets a 1 x 1 res block as registers
01088         ResScalar C0(0);
01089         // FIXME directly use blockB ??
01090         const RhsScalar* blB = &blockB[j2*strideB+offsetB];
01091         for(Index k=0; k<depth; k++)
01092         {
01093           LhsScalar A0 = blA[k];
01094           RhsScalar B0 = blB[k];
01095           MADD(cj, A0, B0, C0, B0);
01096         }
01097         res[(j2+0)*resStride + i] += alpha*C0;
01098       }
01099     }
01100   }
01101 };
01102 
01103 #undef CJMADD
01104 
01105 // pack a block of the lhs
01106 // The traversal is as follows (mr==4):
01107 //   0  4  8 12 ...
01108 //   1  5  9 13 ...
01109 //   2  6 10 14 ...
01110 //   3  7 11 15 ...
01111 //
01112 //  16 20 24 28 ...
01113 //  17 21 25 29 ...
01114 //  18 22 26 30 ...
01115 //  19 23 27 31 ...
01116 //
01117 //  32 33 34 35 ...
01118 //  36 37 38 39 ...
01119 template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
01120 struct gemm_pack_lhs
01121 {
01122   void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
01123                   Index stride=0, Index offset=0)
01124   {
01125 //     enum { PacketSize = packet_traits<Scalar>::size };
01126     eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
01127     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
01128     const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
01129     Index count = 0;
01130     Index peeled_mc = (rows/Pack1)*Pack1;
01131     for(Index i=0; i<peeled_mc; i+=Pack1)
01132     {
01133       if(PanelMode) count += Pack1 * offset;
01134       for(Index k=0; k<depth; k++)
01135         for(Index w=0; w<Pack1; w++)
01136           blockA[count++] = cj(lhs(i+w, k));
01137       if(PanelMode) count += Pack1 * (stride-offset-depth);
01138     }
01139     if(rows-peeled_mc>=Pack2)
01140     {
01141       if(PanelMode) count += Pack2*offset;
01142       for(Index k=0; k<depth; k++)
01143         for(Index w=0; w<Pack2; w++)
01144           blockA[count++] = cj(lhs(peeled_mc+w, k));
01145       if(PanelMode) count += Pack2 * (stride-offset-depth);
01146       peeled_mc += Pack2;
01147     }
01148     for(Index i=peeled_mc; i<rows; i++)
01149     {
01150       if(PanelMode) count += offset;
01151       for(Index k=0; k<depth; k++)
01152         blockA[count++] = cj(lhs(i, k));
01153       if(PanelMode) count += (stride-offset-depth);
01154     }
01155   }
01156 };
01157 
01158 // copy a complete panel of the rhs
01159 // this version is optimized for column major matrices
01160 // The traversal order is as follows (nr==4):
01161 //  0  1  2  3   12 13 14 15   24 27
01162 //  4  5  6  7   16 17 18 19   25 28
01163 //  8  9 10 11   20 21 22 23   26 29
01164 //  .  .  .  .    .  .  .  .    .  .
01165 template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
01166 struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
01167 {
01168   typedef typename packet_traits<Scalar>::type Packet;
01169   enum { PacketSize = packet_traits<Scalar>::size };
01170   void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
01171                   Index stride=0, Index offset=0)
01172   {
01173     eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
01174     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
01175     Index packet_cols = (cols/nr) * nr;
01176     Index count = 0;
01177     for(Index j2=0; j2<packet_cols; j2+=nr)
01178     {
01179       // skip what we have before
01180       if(PanelMode) count += nr * offset;
01181       const Scalar* b0 = &rhs[(j2+0)*rhsStride];
01182       const Scalar* b1 = &rhs[(j2+1)*rhsStride];
01183       const Scalar* b2 = &rhs[(j2+2)*rhsStride];
01184       const Scalar* b3 = &rhs[(j2+3)*rhsStride];
01185       for(Index k=0; k<depth; k++)
01186       {
01187                   blockB[count+0] = cj(b0[k]);
01188                   blockB[count+1] = cj(b1[k]);
01189         if(nr==4) blockB[count+2] = cj(b2[k]);
01190         if(nr==4) blockB[count+3] = cj(b3[k]);
01191         count += nr;
01192       }
01193       // skip what we have after
01194       if(PanelMode) count += nr * (stride-offset-depth);
01195     }
01196 
01197     // copy the remaining columns one at a time (nr==1)
01198     for(Index j2=packet_cols; j2<cols; ++j2)
01199     {
01200       if(PanelMode) count += offset;
01201       const Scalar* b0 = &rhs[(j2+0)*rhsStride];
01202       for(Index k=0; k<depth; k++)
01203       {
01204         blockB[count] = cj(b0[k]);
01205         count += 1;
01206       }
01207       if(PanelMode) count += (stride-offset-depth);
01208     }
01209   }
01210 };
01211 
01212 // this version is optimized for row major matrices
01213 template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
01214 struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
01215 {
01216   enum { PacketSize = packet_traits<Scalar>::size };
01217   void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
01218                   Index stride=0, Index offset=0)
01219   {
01220     eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
01221     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
01222     Index packet_cols = (cols/nr) * nr;
01223     Index count = 0;
01224     for(Index j2=0; j2<packet_cols; j2+=nr)
01225     {
01226       // skip what we have before
01227       if(PanelMode) count += nr * offset;
01228       for(Index k=0; k<depth; k++)
01229       {
01230         const Scalar* b0 = &rhs[k*rhsStride + j2];
01231                   blockB[count+0] = cj(b0[0]);
01232                   blockB[count+1] = cj(b0[1]);
01233         if(nr==4) blockB[count+2] = cj(b0[2]);
01234         if(nr==4) blockB[count+3] = cj(b0[3]);
01235         count += nr;
01236       }
01237       // skip what we have after
01238       if(PanelMode) count += nr * (stride-offset-depth);
01239     }
01240     // copy the remaining columns one at a time (nr==1)
01241     for(Index j2=packet_cols; j2<cols; ++j2)
01242     {
01243       if(PanelMode) count += offset;
01244       const Scalar* b0 = &rhs[j2];
01245       for(Index k=0; k<depth; k++)
01246       {
01247         blockB[count] = cj(b0[k*rhsStride]);
01248         count += 1;
01249       }
01250       if(PanelMode) count += stride-offset-depth;
01251     }
01252   }
01253 };
01254 
01255 } // end namespace internal
01256 
01259 inline std::ptrdiff_t l1CacheSize()
01260 {
01261   std::ptrdiff_t l1, l2;
01262   internal::manage_caching_sizes(GetAction, &l1, &l2);
01263   return l1;
01264 }
01265 
01268 inline std::ptrdiff_t l2CacheSize()
01269 {
01270   std::ptrdiff_t l1, l2;
01271   internal::manage_caching_sizes(GetAction, &l1, &l2);
01272   return l2;
01273 }
01274 
01280 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
01281 {
01282   internal::manage_caching_sizes(SetAction, &l1, &l2);
01283 }
01284 
01285 #endif // EIGEN_GENERAL_BLOCK_PANEL_H


re_vision
Author(s): Dorian Galvez-Lopez
autogenerated on Sun Jan 5 2014 11:31:13