#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

namespace Eigen {

namespace internal {

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;

// Returns `a` if it is a valid (positive) cache size, otherwise the fallback `b`.
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
  return a<=0 ? b : a;
}

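// Caches the CPU cache sizes used by the blocking heuristics below. The first
// call queries them from the platform (queryL1CacheSize / queryTopLevelCacheSize)
// with conservative fallbacks of 8 KB (L1) and 1 MB (last-level). SetAction
// overrides the cached values, GetAction copies them into *l1 and *l2.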
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
  static std::ptrdiff_t m_l1CacheSize = 0;
  static std::ptrdiff_t m_l2CacheSize = 0;
  if(m_l2CacheSize==0)
  {
    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
  }

  if(action==SetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    m_l1CacheSize = *l1;
    m_l2CacheSize = *l2;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_l1CacheSize;
    *l2 = m_l2CacheSize;
  }
  else
  {
    eigen_internal_assert(false);
  }
}

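/* Computes cache-friendly blocking sizes for a (m x k) * (k x n) product with
 * scalar types LhsScalar/RhsScalar. On input k, m, n hold the problem sizes;
 * on output k and m are clamped so that the packed panels fit in the L1/L2
 * cache sizes reported by manage_caching_sizes (n is currently left untouched).
 * KcFactor lets callers request an even smaller depth block. A usage sketch
 * follows the convenience overload below.
 */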
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
{
  EIGEN_UNUSED_VARIABLE(n);

  std::ptrdiff_t l1, l2;

  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  enum {
    kdiv = KcFactor * 2 * Traits::nr
         * Traits::RhsProgress * sizeof(RhsScalar),
    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
    mr_mask = (0xffffffff/mr)*mr
  };

  manage_caching_sizes(GetAction, &l1, &l2);
  // Bound k so that a depth-k, nr-wide packed rhs panel (scaled by KcFactor)
  // roughly fits in the L1 cache.
  k = std::min<SizeType>(k, l1/kdiv);
  // Bound m so that a k x m packed lhs block roughly fits in the L2 cache,
  // rounded down to a multiple of the register blocking size mr.
  SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
  if(_m<m) m = _m & mr_mask;
}

template<typename LhsScalar, typename RhsScalar, typename SizeType>
inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}

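// Illustrative only: a typical call sequence for the helpers above, with
// hypothetical sizes (rows/depth/cols are not defined in this file).
//
//   std::ptrdiff_t k = depth, m = rows, n = cols;
//   internal::computeProductBlockingSizes<float,float>(k, m, n);
//   // k and m now bound a block whose packed lhs/rhs panels are expected to
//   // fit in the L1/L2 caches; the caller packs blocks of at most that size.
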
#ifdef EIGEN_HAS_FUSE_CJMADD
  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  // Without fused conjugate-multiply-add, dispatch through a small helper:
  // when all operands share the same type an explicit temporary is used,
  // otherwise we fall back to cj.pmadd.
  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& )
    {
      c = cj.pmadd(a,b,c);
    }
  };

  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };

  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }

  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);

#endif
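/* gebp_traits defines, for a given pair of scalar types, the register-level
 * blocking used by the GEBP micro-kernel below: the mr x nr register block
 * sizes, the packet types used for the lhs/rhs/result, and the elementary
 * operations (initAcc, loadLhs/loadRhs, madd, acc) the kernel is written
 * against. Specializations cover the mixed real/complex cases.
 */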
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register blocking along the result columns (the kernel below supports nr==2 or nr==4)
    nr = NumberOfRegisters/4,

    // register blocking along the result rows (two lhs packets per iteration)
    mr = 2 * LhsPacketSize,

    WorkSpaceFactor = nr * RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
  {
    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

protected:
};

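/* Specialization for a complex lhs times a real rhs: the rhs coefficient is
 * broadcast as a real packet and multiplied against both the real and
 * imaginary parts of the lhs, and the lhs conjugation (the only one that can
 * apply here) is folded into the final accumulation.
 */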
template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = NumberOfRegisters/4,
    mr = 2 * LhsPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};

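/* Specialization for a complex lhs times a complex rhs. To avoid expensive
 * in-register shuffles, the rhs is unpacked into two real packets holding the
 * broadcast real and imaginary parts (DoublePacket), the products are
 * accumulated separately in c.first and c.second, and acc() recombines them
 * into a complex packet according to the conjugation flags.
 */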
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef std::complex<RealScalar> LhsScalar;
  typedef std::complex<RealScalar> RhsScalar;
  typedef std::complex<RealScalar> ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize  = Vectorizable ? packet_traits<ResScalar>::size : 1,

    nr = 2,
    mr = 2 * ResPacketSize,
    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,

    LhsProgress = ResPacketSize,
    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type ScalarPacket;
  struct DoublePacket
  {
    RealPacket first;
    RealPacket second;
  };

  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  // Unpack the packed rhs into pairs of broadcast packets: for each rhs
  // coefficient, one packet filled with its real part followed by one packet
  // filled with its imaginary part (plain scalar copy in the non-vectorized case).
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
    {
      if(Vectorizable)
      {
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
      }
      else
        b[k] = rhs[k];
    }
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
  {
    dest.first  = pload<RealPacket>((const RealScalar*)b);
    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& ) const
  {
    c.first  = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& ) const
  {
    c = cj.pmadd(a,b,c);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    // Reassemble a complex packet from the separately accumulated real-part
    // (c.first) and imaginary-part (c.second) products, taking the conjugation
    // flags into account, then scale by alpha and accumulate into r.
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};

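/* Specialization for a real lhs times a complex rhs: the lhs coefficients are
 * duplicated with ploaddup so that one real value faces both the real and
 * imaginary parts of each rhs coefficient; only the rhs may be conjugated.
 */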
template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef RealScalar LhsScalar;
  typedef Scalar RhsScalar;
  typedef Scalar ResScalar;

  enum {
    ConjLhs = false,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
    mr = 2*ResPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = ResPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};
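/* gebp_kernel: the optimized "general packed block * packed panel" product
 * kernel. It multiplies a packed block of the lhs (blockA) by a packed panel
 * of the rhs (blockB) and accumulates alpha * A * B into the result,
 * processing mr x nr register tiles at a time and peeling the depth loop by
 * four.
 */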
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
};

template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
{
  Traits traits;

  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;

  // Columns covered by nr-wide register tiles, rows covered by mr-wide tiles,
  // and the depth rounded down for the loop peeled by four.
  Index packet_cols = (cols/nr) * nr;
  const Index peeled_mc  = (rows/mr)*mr;
  const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
  const Index peeled_kc  = (depth/4)*4;

  // If no scratch buffer was provided for the unpacked rhs, fall back to the
  // memory just before blockB (callers passing unpackedB==0 are expected to
  // have reserved it).
  if(unpackedB==0)
    unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);

  // Loop over each nr-wide vertical panel of the rhs.
  for(Index j2=0; j2<packet_cols; j2+=nr)
  {
    // Broadcast the current rhs panel into unpackedB (one packet per coefficient).
    traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);

    // Compute a full mr x nr register tile of the result per iteration.
    for(Index i=0; i<peeled_mc; i+=mr)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
      prefetch(&blA[0]);

      // One accumulator register per entry of the mr x nr tile.
      AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
      traits.initAcc(C0);
      traits.initAcc(C1);
      if(nr==4) traits.initAcc(C2);
      if(nr==4) traits.initAcc(C3);
      traits.initAcc(C4);
      traits.initAcc(C5);
      if(nr==4) traits.initAcc(C6);
      if(nr==4) traits.initAcc(C7);

      ResScalar* r0 = &res[(j2+0)*resStride + i];
      ResScalar* r1 = r0 + resStride;
      ResScalar* r2 = r1 + resStride;
      ResScalar* r3 = r2 + resStride;

      prefetch(r0+16);
      prefetch(r1+16);
      prefetch(r2+16);
      prefetch(r3+16);

      // Inner product over the depth, peeled by four.
      const RhsScalar* blB = unpackedB;
      for(Index k=0; k<peeled_kc; k+=4)
      {
        if(nr==2)
        {
          LhsPacket A0, A1;
          RhsPacket B_0;
          RhsPacket T0;

          EIGEN_ASM_COMMENT("mybegin2");
          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[1*RhsProgress], B_0);
          traits.madd(A0,B_0,C1,T0);
          traits.madd(A1,B_0,C5,B_0);

          traits.loadLhs(&blA[2*LhsProgress], A0);
          traits.loadLhs(&blA[3*LhsProgress], A1);
          traits.loadRhs(&blB[2*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[3*RhsProgress], B_0);
          traits.madd(A0,B_0,C1,T0);
          traits.madd(A1,B_0,C5,B_0);

          traits.loadLhs(&blA[4*LhsProgress], A0);
          traits.loadLhs(&blA[5*LhsProgress], A1);
          traits.loadRhs(&blB[4*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[5*RhsProgress], B_0);
          traits.madd(A0,B_0,C1,T0);
          traits.madd(A1,B_0,C5,B_0);

          traits.loadLhs(&blA[6*LhsProgress], A0);
          traits.loadLhs(&blA[7*LhsProgress], A1);
          traits.loadRhs(&blB[6*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[7*RhsProgress], B_0);
          traits.madd(A0,B_0,C1,T0);
          traits.madd(A1,B_0,C5,B_0);
          EIGEN_ASM_COMMENT("myend");
        }
        else
        {
          EIGEN_ASM_COMMENT("mybegin4");
          LhsPacket A0, A1;
          RhsPacket B_0, B1, B2, B3;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);

          traits.madd(A0,B_0,C0,T0);
          traits.loadRhs(&blB[2*RhsProgress], B2);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[3*RhsProgress], B3);
          traits.loadRhs(&blB[4*RhsProgress], B_0);
          traits.madd(A0,B1,C1,T0);
          traits.madd(A1,B1,C5,B1);
          traits.loadRhs(&blB[5*RhsProgress], B1);
          traits.madd(A0,B2,C2,T0);
          traits.madd(A1,B2,C6,B2);
          traits.loadRhs(&blB[6*RhsProgress], B2);
          traits.madd(A0,B3,C3,T0);
          traits.loadLhs(&blA[2*LhsProgress], A0);
          traits.madd(A1,B3,C7,B3);
          traits.loadLhs(&blA[3*LhsProgress], A1);
          traits.loadRhs(&blB[7*RhsProgress], B3);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[8*RhsProgress], B_0);
          traits.madd(A0,B1,C1,T0);
          traits.madd(A1,B1,C5,B1);
          traits.loadRhs(&blB[9*RhsProgress], B1);
          traits.madd(A0,B2,C2,T0);
          traits.madd(A1,B2,C6,B2);
          traits.loadRhs(&blB[10*RhsProgress], B2);
          traits.madd(A0,B3,C3,T0);
          traits.loadLhs(&blA[4*LhsProgress], A0);
          traits.madd(A1,B3,C7,B3);
          traits.loadLhs(&blA[5*LhsProgress], A1);
          traits.loadRhs(&blB[11*RhsProgress], B3);

          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[12*RhsProgress], B_0);
          traits.madd(A0,B1,C1,T0);
          traits.madd(A1,B1,C5,B1);
          traits.loadRhs(&blB[13*RhsProgress], B1);
          traits.madd(A0,B2,C2,T0);
          traits.madd(A1,B2,C6,B2);
          traits.loadRhs(&blB[14*RhsProgress], B2);
          traits.madd(A0,B3,C3,T0);
          traits.loadLhs(&blA[6*LhsProgress], A0);
          traits.madd(A1,B3,C7,B3);
          traits.loadLhs(&blA[7*LhsProgress], A1);
          traits.loadRhs(&blB[15*RhsProgress], B3);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.madd(A0,B1,C1,T0);
          traits.madd(A1,B1,C5,B1);
          traits.madd(A0,B2,C2,T0);
          traits.madd(A1,B2,C6,B2);
          traits.madd(A0,B3,C3,T0);
          traits.madd(A1,B3,C7,B3);
        }

        blB += 4*nr*RhsProgress;
        blA += 4*mr;
      }

      // Remaining depth iterations (depth not a multiple of four).
      for(Index k=peeled_kc; k<depth; k++)
      {
        if(nr==2)
        {
          LhsPacket A0, A1;
          RhsPacket B_0;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[1*RhsProgress], B_0);
          traits.madd(A0,B_0,C1,T0);
          traits.madd(A1,B_0,C5,B_0);
        }
        else
        {
          LhsPacket A0, A1;
          RhsPacket B_0, B1, B2, B3;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);

          traits.madd(A0,B_0,C0,T0);
          traits.loadRhs(&blB[2*RhsProgress], B2);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[3*RhsProgress], B3);
          traits.madd(A0,B1,C1,T0);
          traits.madd(A1,B1,C5,B1);
          traits.madd(A0,B2,C2,T0);
          traits.madd(A1,B2,C6,B2);
          traits.madd(A0,B3,C3,T0);
          traits.madd(A1,B3,C7,B3);
        }

        blB += nr*RhsProgress;
        blA += mr;
      }

      // Scale the accumulators by alpha and add them into the result block.
      if(nr==4)
      {
        ResPacket R0, R1, R2, R3, R4, R5, R6;
        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = ploadu<ResPacket>(r0);
        R1 = ploadu<ResPacket>(r1);
        R2 = ploadu<ResPacket>(r2);
        R3 = ploadu<ResPacket>(r3);
        R4 = ploadu<ResPacket>(r0 + ResPacketSize);
        R5 = ploadu<ResPacket>(r1 + ResPacketSize);
        R6 = ploadu<ResPacket>(r2 + ResPacketSize);
        traits.acc(C0, alphav, R0);
        pstoreu(r0, R0);
        // R0 is recycled for the last load to save one register.
        R0 = ploadu<ResPacket>(r3 + ResPacketSize);

        traits.acc(C1, alphav, R1);
        traits.acc(C2, alphav, R2);
        traits.acc(C3, alphav, R3);
        traits.acc(C4, alphav, R4);
        traits.acc(C5, alphav, R5);
        traits.acc(C6, alphav, R6);
        traits.acc(C7, alphav, R0);

        pstoreu(r1, R1);
        pstoreu(r2, R2);
        pstoreu(r3, R3);
        pstoreu(r0 + ResPacketSize, R4);
        pstoreu(r1 + ResPacketSize, R5);
        pstoreu(r2 + ResPacketSize, R6);
        pstoreu(r3 + ResPacketSize, R0);
      }
      else
      {
        ResPacket R0, R1, R4;
        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = ploadu<ResPacket>(r0);
        R1 = ploadu<ResPacket>(r1);
        R4 = ploadu<ResPacket>(r0 + ResPacketSize);
        traits.acc(C0, alphav, R0);
        pstoreu(r0, R0);
        R0 = ploadu<ResPacket>(r1 + ResPacketSize);
        traits.acc(C1, alphav, R1);
        traits.acc(C4, alphav, R4);
        traits.acc(C5, alphav, R0);
        pstoreu(r1, R1);
        pstoreu(r0 + ResPacketSize, R4);
        pstoreu(r1 + ResPacketSize, R0);
      }
    }

    // If at least one full lhs packet of rows remains, process it with a
    // one-packet x nr tile.
    if(rows-peeled_mc>=LhsProgress)
    {
      Index i = peeled_mc;
      const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
      prefetch(&blA[0]);

      AccPacket C0, C1, C2, C3;
      traits.initAcc(C0);
      traits.initAcc(C1);
      if(nr==4) traits.initAcc(C2);
      if(nr==4) traits.initAcc(C3);

      const RhsScalar* blB = unpackedB;
      for(Index k=0; k<peeled_kc; k+=4)
      {
        if(nr==2)
        {
          LhsPacket A0;
          RhsPacket B_0, B1;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);
          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[2*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadLhs(&blA[1*LhsProgress], A0);
          traits.loadRhs(&blB[3*RhsProgress], B1);
          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[4*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadLhs(&blA[2*LhsProgress], A0);
          traits.loadRhs(&blB[5*RhsProgress], B1);
          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[6*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadLhs(&blA[3*LhsProgress], A0);
          traits.loadRhs(&blB[7*RhsProgress], B1);
          traits.madd(A0,B_0,C0,B_0);
          traits.madd(A0,B1,C1,B1);
        }
        else
        {
          LhsPacket A0;
          RhsPacket B_0, B1, B2, B3;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);

          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[2*RhsProgress], B2);
          traits.loadRhs(&blB[3*RhsProgress], B3);
          traits.loadRhs(&blB[4*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadRhs(&blB[5*RhsProgress], B1);
          traits.madd(A0,B2,C2,B2);
          traits.loadRhs(&blB[6*RhsProgress], B2);
          traits.madd(A0,B3,C3,B3);
          traits.loadLhs(&blA[1*LhsProgress], A0);
          traits.loadRhs(&blB[7*RhsProgress], B3);
          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[8*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadRhs(&blB[9*RhsProgress], B1);
          traits.madd(A0,B2,C2,B2);
          traits.loadRhs(&blB[10*RhsProgress], B2);
          traits.madd(A0,B3,C3,B3);
          traits.loadLhs(&blA[2*LhsProgress], A0);
          traits.loadRhs(&blB[11*RhsProgress], B3);

          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[12*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadRhs(&blB[13*RhsProgress], B1);
          traits.madd(A0,B2,C2,B2);
          traits.loadRhs(&blB[14*RhsProgress], B2);
          traits.madd(A0,B3,C3,B3);

          traits.loadLhs(&blA[3*LhsProgress], A0);
          traits.loadRhs(&blB[15*RhsProgress], B3);
          traits.madd(A0,B_0,C0,B_0);
          traits.madd(A0,B1,C1,B1);
          traits.madd(A0,B2,C2,B2);
          traits.madd(A0,B3,C3,B3);
        }

        blB += nr*4*RhsProgress;
        blA += 4*LhsProgress;
      }

      for(Index k=peeled_kc; k<depth; k++)
      {
        if(nr==2)
        {
          LhsPacket A0;
          RhsPacket B_0, B1;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);
          traits.madd(A0,B_0,C0,B_0);
          traits.madd(A0,B1,C1,B1);
        }
        else
        {
          LhsPacket A0;
          RhsPacket B_0, B1, B2, B3;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);
          traits.loadRhs(&blB[2*RhsProgress], B2);
          traits.loadRhs(&blB[3*RhsProgress], B3);

          traits.madd(A0,B_0,C0,B_0);
          traits.madd(A0,B1,C1,B1);
          traits.madd(A0,B2,C2,B2);
          traits.madd(A0,B3,C3,B3);
        }

        blB += nr*RhsProgress;
        blA += LhsProgress;
      }

      ResPacket R0, R1, R2, R3;
      ResPacket alphav = pset1<ResPacket>(alpha);

      ResScalar* r0 = &res[(j2+0)*resStride + i];
      ResScalar* r1 = r0 + resStride;
      ResScalar* r2 = r1 + resStride;
      ResScalar* r3 = r2 + resStride;

      R0 = ploadu<ResPacket>(r0);
      R1 = ploadu<ResPacket>(r1);
      if(nr==4) R2 = ploadu<ResPacket>(r2);
      if(nr==4) R3 = ploadu<ResPacket>(r3);

      traits.acc(C0, alphav, R0);
      traits.acc(C1, alphav, R1);
      if(nr==4) traits.acc(C2, alphav, R2);
      if(nr==4) traits.acc(C3, alphav, R3);

      pstoreu(r0, R0);
      pstoreu(r1, R1);
      if(nr==4) pstoreu(r2, R2);
      if(nr==4) pstoreu(r3, R3);
    }
    // Remaining rows handled one scalar row at a time.
    for(Index i=peeled_mc2; i<rows; i++)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA];
      prefetch(&blA[0]);

      ResScalar C0(0), C1(0), C2(0), C3(0);

      const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
      for(Index k=0; k<depth; k++)
      {
        if(nr==2)
        {
          LhsScalar A0;
          RhsScalar B_0, B1;

          A0 = blA[k];
          B_0 = blB[0];
          B1 = blB[1];
          MADD(cj,A0,B_0,C0,B_0);
          MADD(cj,A0,B1,C1,B1);
        }
        else
        {
          LhsScalar A0;
          RhsScalar B_0, B1, B2, B3;

          A0 = blA[k];
          B_0 = blB[0];
          B1 = blB[1];
          B2 = blB[2];
          B3 = blB[3];

          MADD(cj,A0,B_0,C0,B_0);
          MADD(cj,A0,B1,C1,B1);
          MADD(cj,A0,B2,C2,B2);
          MADD(cj,A0,B3,C3,B3);
        }

        blB += nr;
      }
      res[(j2+0)*resStride + i] += alpha*C0;
      res[(j2+1)*resStride + i] += alpha*C1;
      if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
      if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
    }
  }

  // Process the remaining columns one at a time.
  for(Index j2=packet_cols; j2<cols; j2++)
  {
    // Unpack the single rhs column.
    traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);

    for(Index i=0; i<peeled_mc; i+=mr)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
      prefetch(&blA[0]);

      // mr x 1 tile: two lhs packets against one rhs column.
      AccPacket C0, C4;
      traits.initAcc(C0);
      traits.initAcc(C4);

      const RhsScalar* blB = unpackedB;
      for(Index k=0; k<depth; k++)
      {
        LhsPacket A0, A1;
        RhsPacket B_0;
        RhsPacket T0;

        traits.loadLhs(&blA[0*LhsProgress], A0);
        traits.loadLhs(&blA[1*LhsProgress], A1);
        traits.loadRhs(&blB[0*RhsProgress], B_0);
        traits.madd(A0,B_0,C0,T0);
        traits.madd(A1,B_0,C4,B_0);

        blB += RhsProgress;
        blA += 2*LhsProgress;
      }
      ResPacket R0, R4;
      ResPacket alphav = pset1<ResPacket>(alpha);

      ResScalar* r0 = &res[(j2+0)*resStride + i];

      R0 = ploadu<ResPacket>(r0);
      R4 = ploadu<ResPacket>(r0+ResPacketSize);

      traits.acc(C0, alphav, R0);
      traits.acc(C4, alphav, R4);

      pstoreu(r0, R0);
      pstoreu(r0+ResPacketSize, R4);
    }
    if(rows-peeled_mc>=LhsProgress)
    {
      Index i = peeled_mc;
      const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
      prefetch(&blA[0]);

      AccPacket C0;
      traits.initAcc(C0);

      const RhsScalar* blB = unpackedB;
      for(Index k=0; k<depth; k++)
      {
        LhsPacket A0;
        RhsPacket B_0;
        traits.loadLhs(blA, A0);
        traits.loadRhs(blB, B_0);
        traits.madd(A0, B_0, C0, B_0);
        blB += RhsProgress;
        blA += LhsProgress;
      }

      ResPacket alphav = pset1<ResPacket>(alpha);
      ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
      traits.acc(C0, alphav, R0);
      pstoreu(&res[(j2+0)*resStride + i], R0);
    }
    for(Index i=peeled_mc2; i<rows; i++)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA];
      prefetch(&blA[0]);

      ResScalar C0(0);

      const RhsScalar* blB = &blockB[j2*strideB+offsetB];
      for(Index k=0; k<depth; k++)
      {
        LhsScalar A0 = blA[k];
        RhsScalar B_0 = blB[k];
        MADD(cj, A0, B_0, C0, B_0);
      }
      res[(j2+0)*resStride + i] += alpha*C0;
    }
  }
}

#undef MADD

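/* gemm_pack_lhs copies a rows x depth block of the lhs into blockA in the
 * storage order expected by gebp_kernel: vertical panels of Pack1 (then
 * Pack2, then single) rows, laid out so that the kernel can walk them with
 * aligned packet loads. In PanelMode, `stride` and `offset` allow packing
 * only a sub-panel while keeping the layout of the full panel.
 */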
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
{
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  EIGEN_UNUSED_VARIABLE(stride)
  EIGEN_UNUSED_VARIABLE(offset)
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
  Index count = 0;
  Index peeled_mc = (rows/Pack1)*Pack1;

  // Pack full panels of Pack1 rows.
  for(Index i=0; i<peeled_mc; i+=Pack1)
  {
    if(PanelMode) count += Pack1 * offset;

    if(StorageOrder==ColMajor)
    {
      for(Index k=0; k<depth; k++)
      {
        Packet A, B, C, D;
        if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
        if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
        if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
        if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
        if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
        if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
        if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
        if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
      }
    }
    else
    {
      for(Index k=0; k<depth; k++)
      {
        // Row-major lhs: gather Pack1 scalars per column, four at a time.
        Index w=0;
        for(; w<Pack1-3; w+=4)
        {
          Scalar a(cj(lhs(i+w+0, k))),
                 b(cj(lhs(i+w+1, k))),
                 c(cj(lhs(i+w+2, k))),
                 d(cj(lhs(i+w+3, k)));
          blockA[count++] = a;
          blockA[count++] = b;
          blockA[count++] = c;
          blockA[count++] = d;
        }
        if(Pack1%4)
          for(;w<Pack1;++w)
            blockA[count++] = cj(lhs(i+w, k));
      }
    }
    if(PanelMode) count += Pack1 * (stride-offset-depth);
  }
  // Pack a possible panel of Pack2 rows, then the remaining rows one by one.
  if(rows-peeled_mc>=Pack2)
  {
    if(PanelMode) count += Pack2*offset;
    for(Index k=0; k<depth; k++)
      for(Index w=0; w<Pack2; w++)
        blockA[count++] = cj(lhs(peeled_mc+w, k));
    if(PanelMode) count += Pack2 * (stride-offset-depth);
    peeled_mc += Pack2;
  }
  for(Index i=peeled_mc; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
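/* gemm_pack_rhs copies a depth x cols panel of the rhs into blockB as
 * contiguous groups of nr columns (plus leftover single columns), applying
 * conjugation on the fly when requested. The two specializations handle a
 * column-major and a row-major rhs.
 */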
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
  EIGEN_UNUSED_VARIABLE(stride)
  EIGEN_UNUSED_VARIABLE(offset)
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols = (cols/nr) * nr;
  Index count = 0;
  for(Index j2=0; j2<packet_cols; j2+=nr)
  {
    if(PanelMode) count += nr * offset;
    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
    const Scalar* b1 = &rhs[(j2+1)*rhsStride];
    const Scalar* b2 = &rhs[(j2+2)*rhsStride];
    const Scalar* b3 = &rhs[(j2+3)*rhsStride];
    for(Index k=0; k<depth; k++)
    {
      blockB[count+0] = cj(b0[k]);
      blockB[count+1] = cj(b1[k]);
      if(nr==4) blockB[count+2] = cj(b2[k]);
      if(nr==4) blockB[count+3] = cj(b3[k]);
      count += nr;
    }
    if(PanelMode) count += nr * (stride-offset-depth);
  }

  for(Index j2=packet_cols; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(b0[k]);
      count += 1;
    }
    if(PanelMode) count += (stride-offset-depth);
  }
}

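/* Same packing as above for a row-major rhs: the coefficients of one k-row
 * are contiguous in memory, so each of the nr destination slots is read from
 * the same row at consecutive column indices.
 */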
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
  EIGEN_UNUSED_VARIABLE(stride)
  EIGEN_UNUSED_VARIABLE(offset)
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols = (cols/nr) * nr;
  Index count = 0;
  for(Index j2=0; j2<packet_cols; j2+=nr)
  {
    if(PanelMode) count += nr * offset;
    for(Index k=0; k<depth; k++)
    {
      const Scalar* b0 = &rhs[k*rhsStride + j2];
      blockB[count+0] = cj(b0[0]);
      blockB[count+1] = cj(b0[1]);
      if(nr==4) blockB[count+2] = cj(b0[2]);
      if(nr==4) blockB[count+3] = cj(b0[3]);
      count += nr;
    }
    if(PanelMode) count += nr * (stride-offset-depth);
  }
  for(Index j2=packet_cols; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const Scalar* b0 = &rhs[j2];
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(b0[k*rhsStride]);
      count += 1;
    }
    if(PanelMode) count += stride-offset-depth;
  }
}

} // end namespace internal

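/** \returns the currently set level-1 CPU cache size (in bytes) used to
  * estimate the blocking size parameters.
  * \sa setCpuCacheSizes */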
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l1;
}

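/** \returns the currently set level-2 CPU cache size (in bytes) used to
  * estimate the blocking size parameters.
  * \sa setCpuCacheSizes */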
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l2;
}

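/** Sets the CPU L1 and L2 cache sizes (in bytes) used by the blocking
  * heuristics, i.e. by computeProductBlockingSizes() above.
  *
  * Illustrative usage (the sizes below are made up; query your CPU for real
  * values):
  * \code
  * Eigen::setCpuCacheSizes(32*1024, 256*1024);
  * std::ptrdiff_t l1 = Eigen::l1CacheSize(); // 32768
  * std::ptrdiff_t l2 = Eigen::l2CacheSize(); // 262144
  * \endcode
  * \sa l1CacheSize(), l2CacheSize() */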
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2);
}

} // end namespace Eigen

#endif // EIGEN_GENERAL_BLOCK_PANEL_H