#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

namespace Eigen {

namespace internal {

// forward declaration of the blocking traits defined below
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;

// Returns a when it is a meaningful (strictly positive) cache size, and the fallback b otherwise.
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
  return a<=0 ? b : a;
}

// Queries and caches the L1 and top-level cache sizes, and lets them be overridden:
// SetAction stores *l1/*l2 as the new sizes, GetAction returns the current ones.
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
  static std::ptrdiff_t m_l1CacheSize = 0;
  static std::ptrdiff_t m_l2CacheSize = 0;
  if(m_l2CacheSize==0)
  {
    // fall back to conservative defaults (8KB / 1MB) when the runtime queries return nothing useful
    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
  }

  if(action==SetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    m_l1CacheSize = *l1;
    m_l2CacheSize = *l2;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_l1CacheSize;
    *l2 = m_l2CacheSize;
  }
  else
  {
    eigen_internal_assert(false);
  }
}

/** \internal
  * Computes the blocking sizes to use for a m x k times k x n matrix product.
  * On entry, k, m and n hold the total problem sizes; on exit, k and m hold the block
  * sizes to use along the depth and row dimensions, chosen so that, roughly,
  * a k x nr slice of the rhs fits within the L1 cache and a m x k block of the lhs
  * fits within the L2 cache. n is currently left untouched.
  */
template<typename LhsScalar, typename RhsScalar, int KcFactor>
void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  EIGEN_UNUSED_VARIABLE(n);

  std::ptrdiff_t l1, l2;

  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  enum {
    kdiv = KcFactor * 2 * Traits::nr
         * Traits::RhsProgress * sizeof(RhsScalar),
    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
    mr_mask = (0xffffffff/mr)*mr
  };

  manage_caching_sizes(GetAction, &l1, &l2);
  k = std::min<std::ptrdiff_t>(k, l1/kdiv);
  std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
  if(_m<m) m = _m & mr_mask;
}

template<typename LhsScalar, typename RhsScalar>
inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}
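
// Illustrative only: how a caller might use the heuristic above. The starting values
// are hypothetical; the function shrinks k and m in place.
// \code
// std::ptrdiff_t k = depth, m = rows, n = cols;        // total product sizes
// internal::computeProductBlockingSizes<float,float>(k, m, n);
// // k and m now hold the depth and row block sizes to use; n is unchanged.
// \endcode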

// MADD(CJ,A,B,C,T) performs C += A*B through the conjugation helper CJ.
// When EIGEN_HAS_FUSE_CJMADD is defined the multiply-add is forwarded directly to
// CJ.pmadd; otherwise the multiply and the add are issued separately through the
// temporary T to help the compiler schedule the operations.
#ifdef EIGEN_HAS_FUSE_CJMADD
  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& )
    {
      c = cj.pmadd(a,b,c);
    }
  };

  // when all types are identical (the packet path), go through an explicit temporary
  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };

  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }

  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);

#endif

/* Vectorization logic of the general block-panel kernel (gebp).
 *
 * gebp_traits defines, for a given pair of scalar types, the packet types and the
 * register-level blocking sizes used by gebp_kernel, together with the elementary
 * operations (initAcc, loadLhs/loadRhs, madd, acc) the kernel is written in terms of.
 * The generic version below notably covers the real*real case; the specializations
 * that follow handle the mixed real/complex and complex*complex cases.
 */
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction (number of rhs columns handled at once)
    nr = NumberOfRegisters/4,

    // register block size along the M direction (number of lhs rows handled at once)
    mr = 2 * LhsPacketSize,

    WorkSpaceFactor = nr * RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
  {
    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

protected:
};
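
// Illustrative only: the contract gebp_kernel relies on, shown for one accumulation
// step. All names below (blA, blB, r0, alpha) are hypothetical; the real kernel
// unrolls this pattern heavily over mr rows and nr columns.
// \code
// typedef gebp_traits<float,float> Traits;
// Traits traits;
// Traits::AccPacket c;
// Traits::LhsPacket a;
// Traits::RhsPacket b, t;
// traits.initAcc(c);                          // c = 0
// traits.loadLhs(blA, a);                     // aligned load of LhsPacketSize packed lhs coefficients
// traits.loadRhs(blB, b);                     // aligned load of one unpacked (broadcast) rhs coefficient
// traits.madd(a, b, c, t);                    // c += a * b   (t is scratch)
// Traits::ResPacket alphav = pset1<Traits::ResPacket>(alpha);
// Traits::ResPacket r = ploadu<Traits::ResPacket>(r0);
// traits.acc(c, alphav, r);                   // r += alpha * c
// pstoreu(r0, r);                             // write the result back
// \endcode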

template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = NumberOfRegisters/4,
    mr = 2 * LhsPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};

template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef std::complex<RealScalar> LhsScalar;
  typedef std::complex<RealScalar> RhsScalar;
  typedef std::complex<RealScalar> ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize  = Vectorizable ? packet_traits<ResScalar>::size : 1,

    nr = 2,
    mr = 2 * ResPacketSize,
    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,

    LhsProgress = ResPacketSize,
    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type     ScalarPacket;
  struct DoublePacket
  {
    RealPacket first;
    RealPacket second;
  };

  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
    {
      if(Vectorizable)
      {
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
      }
      else
        b[k] = rhs[k];
    }
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
  {
    dest.first  = pload<RealPacket>((const RealScalar*)b);
    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& ) const
  {
    c.first  = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& ) const
  {
    c = cj.pmadd(a,b,c);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
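
  // How acc() below recombines the two real accumulators filled by madd() above
  // (illustrative, per complex coefficient): with a = ar + i*ai and b = br + i*bi,
  //   c.first  accumulates [ar*br, ai*br]   (a times the broadcast real part of b)
  //   c.second accumulates [ar*bi, ai*bi]   (a times the broadcast imaginary part of b)
  // and, in the non-conjugated case, pcplxflip(pconj(c.second)) = [-ai*bi, ar*bi], so
  //   c.first + pcplxflip(pconj(c.second)) = [ar*br - ai*bi, ai*br + ar*bi] = a*b.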
  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};

template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef RealScalar LhsScalar;
  typedef Scalar RhsScalar;
  typedef Scalar ResScalar;

  enum {
    ConjLhs = false,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
    mr = 2*ResPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = ResPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};

/* optimized GEneral packed Block times packed Panel product kernel
 *
 * Computes res += alpha * blockA * blockB, where blockA and blockB hold the operands
 * packed by gemm_pack_lhs and gemm_pack_rhs below, and res is a column-major result
 * block with leading dimension resStride. Mixed real/complex scalar types are handled
 * through the gebp_traits specializations above.
 */
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE EIGEN_FLATTEN_ATTRIB
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
  {
    Traits traits;

    if(strideA==-1) strideA = depth;
    if(strideB==-1) strideB = depth;
    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;

    // number of columns processed by full nr-wide panels, number of rows processed by
    // full mr-tall panels, the extra rows still handled one packet at a time, and the
    // depth rounded down to the 4x-peeled inner loop
    Index packet_cols = (cols/nr) * nr;
    const Index peeled_mc  = (rows/mr)*mr;
    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
    const Index peeled_kc  = (depth/4)*4;

    // if no workspace is provided for the unpacked rhs, reuse the room expected to be
    // reserved right before blockB (strideB * nr * RhsProgress scalars)
    if(unpackedB==0)
      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);

    // loops on each micro vertical panel of the rhs (depth x nr)
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // expand/broadcast each rhs coefficient of the panel into the unpackedB workspace
      // so that the inner loops can use aligned packet loads
      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);

      // loops on each largest micro horizontal panel of the lhs (mr x depth)
      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        // gets the res block as registers
        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
        traits.initAcc(C0);
        traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);
        traits.initAcc(C4);
        traits.initAcc(C5);
        if(nr==4) traits.initAcc(C6);
        if(nr==4) traits.initAcc(C7);

        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

        prefetch(r0+16);
        prefetch(r1+16);
        prefetch(r2+16);
        prefetch(r3+16);

        // performs the "inner" products, peeling the depth loop by 4
        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B_0;
            RhsPacket T0;

            EIGEN_ASM_COMMENT("mybegin2");
            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[1*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[2*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[5*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[6*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[7*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);
            EIGEN_ASM_COMMENT("myend");
          }
          else
          {
            EIGEN_ASM_COMMENT("mybegin4");
            LhsPacket A0, A1;
            RhsPacket B_0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B_0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[8*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[12*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }

          blB += 4*nr*RhsProgress;
          blA += 4*mr;
        }

        for(Index k=peeled_kc; k<depth; k++)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B_0;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[1*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);
          }
          else
          {
            LhsPacket A0, A1;
            RhsPacket B_0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B_0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }

          blB += nr*RhsProgress;
          blA += mr;
        }

        if(nr==4)
        {
          ResPacket R0, R1, R2, R3, R4, R5, R6;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = ploadu<ResPacket>(r0);
          R1 = ploadu<ResPacket>(r1);
          R2 = ploadu<ResPacket>(r2);
          R3 = ploadu<ResPacket>(r3);
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
          traits.acc(C0, alphav, R0);
          pstoreu(r0, R0);
          R0 = ploadu<ResPacket>(r3 + ResPacketSize);

          traits.acc(C1, alphav, R1);
          traits.acc(C2, alphav, R2);
          traits.acc(C3, alphav, R3);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R5);
          traits.acc(C6, alphav, R6);
          traits.acc(C7, alphav, R0);

          pstoreu(r1, R1);
          pstoreu(r2, R2);
          pstoreu(r3, R3);
          pstoreu(r0 + ResPacketSize, R4);
          pstoreu(r1 + ResPacketSize, R5);
          pstoreu(r2 + ResPacketSize, R6);
          pstoreu(r3 + ResPacketSize, R0);
        }
        else
        {
          ResPacket R0, R1, R4;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = ploadu<ResPacket>(r0);
          R1 = ploadu<ResPacket>(r1);
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
          traits.acc(C0, alphav, R0);
          pstoreu(r0, R0);
          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
          traits.acc(C1, alphav, R1);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R0);
          pstoreu(r1, R1);
          pstoreu(r0 + ResPacketSize, R4);
          pstoreu(r1 + ResPacketSize, R0);
        }

      }

      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
        prefetch(&blA[0]);

        AccPacket C0, C1, C2, C3;
        traits.initAcc(C0);
        traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        {
          if(nr==2)
          {
            LhsPacket A0;
            RhsPacket B_0, B1;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[2*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[1*LhsProgress], A0);
            traits.loadRhs(&blB[3*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[6*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[3*LhsProgress], A0);
            traits.loadRhs(&blB[7*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
          }
          else
          {
            LhsPacket A0;
            RhsPacket B_0, B1, B2, B3;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);
            traits.loadLhs(&blA[1*LhsProgress], A0);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[8*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[12*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);

            traits.loadLhs(&blA[3*LhsProgress], A0);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
            traits.madd(A0,B2,C2,B2);
            traits.madd(A0,B3,C3,B3);
          }

          blB += nr*4*RhsProgress;
          blA += 4*LhsProgress;
        }

        for(Index k=peeled_kc; k<depth; k++)
        {
          if(nr==2)
          {
            LhsPacket A0;
            RhsPacket B_0, B1;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
          }
          else
          {
            LhsPacket A0;
            RhsPacket B_0, B1, B2, B3;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.loadRhs(&blB[3*RhsProgress], B3);

            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
            traits.madd(A0,B2,C2,B2);
            traits.madd(A0,B3,C3,B3);
          }

          blB += nr*RhsProgress;
          blA += LhsProgress;
        }

        ResPacket R0, R1, R2, R3;
        ResPacket alphav = pset1<ResPacket>(alpha);

        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

        R0 = ploadu<ResPacket>(r0);
        R1 = ploadu<ResPacket>(r1);
        if(nr==4) R2 = ploadu<ResPacket>(r2);
        if(nr==4) R3 = ploadu<ResPacket>(r3);

        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        if(nr==4) traits.acc(C2, alphav, R2);
        if(nr==4) traits.acc(C3, alphav, R3);

        pstoreu(r0, R0);
        pstoreu(r1, R1);
        if(nr==4) pstoreu(r2, R2);
        if(nr==4) pstoreu(r3, R3);
      }
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

        ResScalar C0(0), C1(0), C2(0), C3(0);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        for(Index k=0; k<depth; k++)
        {
          if(nr==2)
          {
            LhsScalar A0;
            RhsScalar B_0, B1;

            A0 = blA[k];
            B_0 = blB[0];
            B1 = blB[1];
            MADD(cj,A0,B_0,C0,B_0);
            MADD(cj,A0,B1,C1,B1);
          }
          else
          {
            LhsScalar A0;
            RhsScalar B_0, B1, B2, B3;

            A0 = blA[k];
            B_0 = blB[0];
            B1 = blB[1];
            B2 = blB[2];
            B3 = blB[3];

            MADD(cj,A0,B_0,C0,B_0);
            MADD(cj,A0,B1,C1,B1);
            MADD(cj,A0,B2,C2,B2);
            MADD(cj,A0,B3,C3,B3);
          }

          blB += nr;
        }
        res[(j2+0)*resStride + i] += alpha*C0;
        res[(j2+1)*resStride + i] += alpha*C1;
        if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
        if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
      }
    }

    // process the remaining (cols - packet_cols) rhs columns one at a time
    for(Index j2=packet_cols; j2<cols; j2++)
    {
      traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);

      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        AccPacket C0, C4;
        traits.initAcc(C0);
        traits.initAcc(C4);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          LhsPacket A0, A1;
          RhsPacket B_0;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);

          blB += RhsProgress;
          blA += 2*LhsProgress;
        }
        ResPacket R0, R4;
        ResPacket alphav = pset1<ResPacket>(alpha);

        ResScalar* r0 = &res[(j2+0)*resStride + i];

        R0 = ploadu<ResPacket>(r0);
        R4 = ploadu<ResPacket>(r0+ResPacketSize);

        traits.acc(C0, alphav, R0);
        traits.acc(C4, alphav, R4);

        pstoreu(r0, R0);
        pstoreu(r0+ResPacketSize, R4);
      }
      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
        prefetch(&blA[0]);

        AccPacket C0;
        traits.initAcc(C0);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          LhsPacket A0;
          RhsPacket B_0;
          traits.loadLhs(blA, A0);
          traits.loadRhs(blB, B_0);
          traits.madd(A0, B_0, C0, B_0);
          blB += RhsProgress;
          blA += LhsProgress;
        }

        ResPacket alphav = pset1<ResPacket>(alpha);
        ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
        traits.acc(C0, alphav, R0);
        pstoreu(&res[(j2+0)*resStride + i], R0);
      }
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

        ResScalar C0(0);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<depth; k++)
        {
          LhsScalar A0 = blA[k];
          RhsScalar B_0 = blB[k];
          MADD(cj, A0, B_0, C0, B_0);
        }
        res[(j2+0)*resStride + i] += alpha*C0;
      }
    }
  }
};
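
// Illustrative only: how the packing routines below and the kernel above fit together
// for a single packed panel. All names (lhs, rhs, res, strides, blockA, blockB,
// workspace, alpha) are hypothetical, allocation/alignment and the usual blocking over
// mc/kc panels are omitted, and the buffer sizes reflect the conventions assumed here:
// blockA holds rows*depth scalars, blockB holds depth*cols scalars, and the workspace
// for the unpacked rhs holds depth*Traits::WorkSpaceFactor scalars.
// \code
// typedef gebp_traits<float,float> Traits;
// gemm_pack_lhs<float, DenseIndex, Traits::mr, Traits::LhsProgress, ColMajor, false, false> pack_lhs;
// gemm_pack_rhs<float, DenseIndex, Traits::nr, ColMajor, false, false>                      pack_rhs;
// gebp_kernel<float, float, DenseIndex, Traits::mr, Traits::nr, false, false>               gebp;
//
// pack_lhs(blockA, lhs, lhsStride, depth, rows);
// pack_rhs(blockB, rhs, rhsStride, depth, cols);
// gebp(res, resStride, blockA, blockB, rows, depth, cols, alpha,
//      -1, -1, 0, 0, workspace);               // res += alpha * lhs * rhs
// \endcode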

#undef CJMADD

/* Packs a block of the lhs into blockA.
 *
 * The block is written by panels of Pack1 consecutive rows: within a panel, the depth
 * columns are traversed in order and the Pack1 coefficients of each column are stored
 * contiguously, which is exactly the order in which gebp_kernel consumes them. The
 * remaining rows are packed by one panel of Pack2 rows and then one row at a time.
 * In PanelMode, stride and offset allow packing a sub-block located at 'offset' within
 * panels of length 'stride'.
 */
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
{
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
                                    Index stride=0, Index offset=0)
  {
    typedef typename packet_traits<Scalar>::type Packet;
    enum { PacketSize = packet_traits<Scalar>::size };

    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
    Index count = 0;
    Index peeled_mc = (rows/Pack1)*Pack1;
    for(Index i=0; i<peeled_mc; i+=Pack1)
    {
      if(PanelMode) count += Pack1 * offset;

      if(StorageOrder==ColMajor)
      {
        for(Index k=0; k<depth; k++)
        {
          Packet A, B, C, D;
          if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
          if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
          if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
          if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
          if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
          if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
          if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
          if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
        }
      }
      else
      {
        for(Index k=0; k<depth; k++)
        {
          Index w=0;
          for(; w<Pack1-3; w+=4)
          {
            Scalar a(cj(lhs(i+w+0, k))),
                   b(cj(lhs(i+w+1, k))),
                   c(cj(lhs(i+w+2, k))),
                   d(cj(lhs(i+w+3, k)));
            blockA[count++] = a;
            blockA[count++] = b;
            blockA[count++] = c;
            blockA[count++] = d;
          }
          if(Pack1%4)
            for(;w<Pack1;++w)
              blockA[count++] = cj(lhs(i+w, k));
        }
      }
      if(PanelMode) count += Pack1 * (stride-offset-depth);
    }
    if(rows-peeled_mc>=Pack2)
    {
      if(PanelMode) count += Pack2*offset;
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(peeled_mc+w, k));
      if(PanelMode) count += Pack2 * (stride-offset-depth);
      peeled_mc += Pack2;
    }
    for(Index i=peeled_mc; i<rows; i++)
    {
      if(PanelMode) count += offset;
      for(Index k=0; k<depth; k++)
        blockA[count++] = cj(lhs(i, k));
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};
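
// Packed lhs layout produced above (illustrative, with Pack1==2, rows==4, depth==3):
// the 4x3 column-major block
//     a00 a01 a02
//     a10 a11 a12
//     a20 a21 a22
//     a30 a31 a32
// is stored as
//     blockA = [ a00 a10 | a01 a11 | a02 a12 | a20 a30 | a21 a31 | a22 a32 ]
// i.e. panels of Pack1 consecutive rows, each panel traversed column by column,
// matching the sequential access order of gebp_kernel.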

/* Packs a panel of the rhs into blockB.
 *
 * Columns are processed by groups of nr: within a group, the nr column coefficients of
 * each row are stored interleaved (b0[k], b1[k], ..., then row k+1), and the leftover
 * columns are copied one by one. This version handles a column-major rhs; the
 * specialization below handles the row-major case.
 */
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
                                    Index stride=0, Index offset=0)
  {
    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
        blockB[count+0] = cj(b0[k]);
        blockB[count+1] = cj(b1[k]);
        if(nr==4) blockB[count+2] = cj(b2[k]);
        if(nr==4) blockB[count+3] = cj(b3[k]);
        count += nr;
      }
      // skip what we have after
      if(PanelMode) count += nr * (stride-offset-depth);
    }

    // copy the remaining columns one at a time
    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(b0[k]);
        count += 1;
      }
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};

// same as above, optimized for a row-major rhs
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
                                    Index stride=0, Index offset=0)
  {
    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += nr * offset;
      for(Index k=0; k<depth; k++)
      {
        const Scalar* b0 = &rhs[k*rhsStride + j2];
        blockB[count+0] = cj(b0[0]);
        blockB[count+1] = cj(b0[1]);
        if(nr==4) blockB[count+2] = cj(b0[2]);
        if(nr==4) blockB[count+3] = cj(b0[3]);
        count += nr;
      }
      // skip what we have after
      if(PanelMode) count += nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time
    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[j2];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(b0[k*rhsStride]);
        count += 1;
      }
      if(PanelMode) count += stride-offset-depth;
    }
  }
};

} // end namespace internal

/** \returns the currently set level-1 CPU cache size (in bytes) used to estimate the
  * ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l1;
}

/** \returns the currently set level-2 (top-level) CPU cache size (in bytes) used to
  * estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l2;
}

/** Sets the CPU L1 and L2 cache sizes (in bytes). These values are used to adjust the
  * block sizes of the algorithms that work per blocks.
  * \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2);
}
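
// Example (illustrative): querying the detected cache sizes and overriding them with
// hand-tuned values. Sizes are in bytes; the figures below are made up for the example.
// \code
// std::ptrdiff_t l1 = Eigen::l1CacheSize();
// std::ptrdiff_t l2 = Eigen::l2CacheSize();
// Eigen::setCpuCacheSizes(32*1024, 4*1024*1024);
// \endcode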

} // end namespace Eigen

#endif // EIGEN_GENERAL_BLOCK_PANEL_H