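// GEBP (GEneral Block times packed Panel) kernel and its helpers: the cache-size
// driven blocking heuristics, the register-level blocking traits (gebp_traits) for
// the real/complex scalar combinations, the product kernel itself (gebp_kernel),
// and the lhs/rhs packing routines (gemm_pack_lhs, gemm_pack_rhs) used by Eigen's
// dense matrix products.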
#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

namespace internal {

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;

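// Queries the CPU cache sizes once, falling back to conservative defaults when they
// cannot be detected, and caches them in local statics. GetAction copies the cached
// values into *l1/*l2; SetAction overrides the cached values with *l1/*l2.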
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
  static std::ptrdiff_t m_l1CacheSize = 0;
  static std::ptrdiff_t m_l2CacheSize = 0;
  if(m_l1CacheSize==0)
  {
    m_l1CacheSize = queryL1CacheSize();
    m_l2CacheSize = queryTopLevelCacheSize();

    // conservative fallbacks when the cache sizes cannot be queried
    if(m_l1CacheSize<=0) m_l1CacheSize = 8 * 1024;
    if(m_l2CacheSize<=0) m_l2CacheSize = 1 * 1024 * 1024;
  }

  if(action==SetAction)
  {
    // override the cached values
    eigen_internal_assert(l1!=0 && l2!=0);
    m_l1CacheSize = *l1;
    m_l2CacheSize = *l2;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_l1CacheSize;
    *l2 = m_l2CacheSize;
  }
  else
  {
    eigen_internal_assert(false);
  }
}

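/* Computes the blocking parameters for a m x k times k x n matrix product.
 *
 * On input, k, m and n are the problem dimensions; on output they hold the block
 * sizes used by the product kernels. The heuristic follows the cache sizes
 * returned by manage_caching_sizes(): k is reduced so that a kc x nr micro panel
 * of the unpacked rhs (each coefficient replicated into a packet, hence the
 * RhsProgress factor) fits in the L1 cache, and m is reduced so that a mc x kc
 * block of the lhs fits in a fraction of the L2 cache. n is left unchanged.
 */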
template<typename LhsScalar, typename RhsScalar, int KcFactor>
void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  std::ptrdiff_t l1, l2;

  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  enum {
    kdiv = KcFactor * 2 * Traits::nr
         * Traits::RhsProgress * sizeof(RhsScalar),
    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
    mr_mask = (0xffffffff/mr)*mr
  };

  manage_caching_sizes(GetAction, &l1, &l2);
  // a kc x nr panel of the unpacked rhs must fit in the L1 cache
  k = std::min<std::ptrdiff_t>(k, l1/kdiv);
  // a mc x kc block of the lhs must fit in the L2 cache
  std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
  if(_m<m) m = _m & mr_mask; // round the row blocking size down to a multiple of mr
  n = n;                     // n is not constrained by this heuristic
}

template<typename LhsScalar, typename RhsScalar>
inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}

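// The MADD macro performs a conjugate-aware multiply-accumulate c += a*b (with the
// conjugations applied by CJ). When the platform provides a fused operation
// (EIGEN_HAS_FUSE_CJMADD) it expands directly to cj.pmadd. Otherwise gebp_madd_selector
// is used: when all four operand types are the same packet type, the product is routed
// through the spare register t (t = b; t = a*t; c += t), which is apparently meant to
// help the compiler's register allocation in the kernel below.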
#ifdef EIGEN_HAS_FUSE_CJMADD
  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& )
    {
      c = cj.pmadd(a,b,c);
    }
  };

  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };

  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }

  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);

#endif
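
/* gebp_traits defines the register-level blocking (mr x nr), the packet types, and
 * the elementary operations (load, madd, accumulate) used by gebp_kernel for a given
 * pair of scalar types. The generic definition below covers matching real scalars;
 * partial specializations follow for complex*real, complex*complex and real*complex
 * products, which differ in how the rhs is unpacked and in how the conjugation flags
 * are folded into the multiply-accumulate.
 */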
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register blocking size along the N direction
    nr = NumberOfRegisters/4,

    // register blocking size along the M direction
    mr = 2 * LhsPacketSize,

    WorkSpaceFactor = nr * RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  // replicate each rhs coefficient into a full packet so that loadRhs can use aligned loads
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
  {
    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

protected:
};

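// Specialization for LhsScalar = std::complex<Real>, RhsScalar = Real.
// The rhs packets contain replicated real values, so the vectorized madd multiplies
// them directly against the underlying real packet of the lhs (a.v); the conjugation
// of the lhs is deferred to the final accumulation step acc().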
template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = NumberOfRegisters/4,
    mr = 2 * LhsPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};

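// Specialization for complex * complex products.
// The rhs is unpacked into DoublePackets: the replicated real parts of a coefficient
// in .first and its replicated imaginary parts in .second. The accumulator is also a
// DoublePacket of the two partial products; acc() recombines the halves into an
// ordinary complex packet (using pcplxflip/pconj/psub according to the conjugation
// flags) before scaling by alpha.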
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef std::complex<RealScalar>  LhsScalar;
  typedef std::complex<RealScalar>  RhsScalar;
  typedef std::complex<RealScalar>  ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize  = Vectorizable ? packet_traits<ResScalar>::size : 1,

    nr = 2,
    mr = 2 * ResPacketSize,
    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,

    LhsProgress = ResPacketSize,
    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type     ScalarPacket;
  struct DoublePacket
  {
    RealPacket first;
    RealPacket second;
  };

  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  // unpack the rhs: for each coefficient, store its real part replicated into one
  // packet, immediately followed by its imaginary part replicated into the next packet
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
    {
      if(Vectorizable)
      {
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
      }
      else
        b[k] = rhs[k];
    }
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
  {
    dest.first  = pload<RealPacket>((const RealScalar*)b);
    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& ) const
  {
    c.first  = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& ) const
  {
    c = cj.pmadd(a,b,c);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    // assemble the real and imaginary halves of c into a complex packet,
    // taking the conjugation flags into account
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};

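// Specialization for LhsScalar = Real, RhsScalar = std::complex<Real>.
// The real lhs coefficients are loaded duplicated (ploaddup) so that each value lines
// up with the interleaved real/imaginary lanes of a complex rhs packet; conjugation
// of the rhs is again handled in acc().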
template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef RealScalar  LhsScalar;
  typedef Scalar      RhsScalar;
  typedef Scalar      ResScalar;

  enum {
    ConjLhs = false,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
    mr = 2*ResPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = ResPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};

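/* Optimized GEneral packed Block times packed Panel product kernel (GEBP).
 *
 * blockA holds a packed rows x depth block of the lhs (see gemm_pack_lhs) and blockB
 * a packed depth x cols panel of the rhs (see gemm_pack_rhs); the kernel accumulates
 * alpha * blockA * blockB into res. Columns are processed nr at a time and rows mr
 * at a time, with the innermost loop unrolled by a factor of four along the depth;
 * leftover rows and columns are handled by progressively narrower fallback loops
 * (one packet wide, then scalar).
 */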
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_FLATTEN_ATTRIB
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
  {
    Traits traits;

    if(strideA==-1) strideA = depth;
    if(strideB==-1) strideB = depth;
    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;

    Index packet_cols = (cols/nr) * nr;    // columns handled nr at a time
    const Index peeled_mc = (rows/mr)*mr;  // rows handled mr at a time

    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
    const Index peeled_kc = (depth/4)*4;   // depth loop unrolled by 4

    // if no scratch buffer was provided, unpack the rhs in place right before blockB
    if(unpackedB==0)
      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);

    // loop on each micro vertical panel of the rhs (depth x nr)
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // unpack the current rhs panel
      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);

      // loop on each micro horizontal panel of the lhs (mr x depth)
      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        // the result block is kept in registers
        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
        traits.initAcc(C0);
        traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);
        traits.initAcc(C4);
        traits.initAcc(C5);
        if(nr==4) traits.initAcc(C6);
        if(nr==4) traits.initAcc(C7);

        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

        prefetch(r0+16);
        prefetch(r1+16);
        prefetch(r2+16);
        prefetch(r3+16);

        // perform the "inner" products, 4 depth steps at a time
        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B0;
            RhsPacket T0;

            EIGEN_ASM_COMMENT("mybegin2");
            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[1*RhsProgress], B0);
            traits.madd(A0,B0,C1,T0);
            traits.madd(A1,B0,C5,B0);

            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[2*RhsProgress], B0);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[3*RhsProgress], B0);
            traits.madd(A0,B0,C1,T0);
            traits.madd(A1,B0,C5,B0);

            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[4*RhsProgress], B0);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[5*RhsProgress], B0);
            traits.madd(A0,B0,C1,T0);
            traits.madd(A1,B0,C5,B0);

            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[6*RhsProgress], B0);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[7*RhsProgress], B0);
            traits.madd(A0,B0,C1,T0);
            traits.madd(A1,B0,C5,B0);
            EIGEN_ASM_COMMENT("myend");
          }
          else
          {
            EIGEN_ASM_COMMENT("mybegin4");
            LhsPacket A0, A1;
            RhsPacket B0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[8*RhsProgress], B0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[12*RhsProgress], B0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }

          blB += 4*nr*RhsProgress;
          blA += 4*mr;
        }

        // process the remaining depth steps (depth not a multiple of 4)
        for(Index k=peeled_kc; k<depth; k++)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B0;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[1*RhsProgress], B0);
            traits.madd(A0,B0,C1,T0);
            traits.madd(A1,B0,C5,B0);
          }
          else
          {
            LhsPacket A0, A1;
            RhsPacket B0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }

          blB += nr*RhsProgress;
          blA += mr;
        }

        // update the result block: res += alpha * C, using unaligned loads/stores
        if(nr==4)
        {
          ResPacket R0, R1, R2, R3, R4, R5, R6;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = ploadu<ResPacket>(r0);
          R1 = ploadu<ResPacket>(r1);
          R2 = ploadu<ResPacket>(r2);
          R3 = ploadu<ResPacket>(r3);
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
          traits.acc(C0, alphav, R0);
          pstoreu(r0, R0);
          R0 = ploadu<ResPacket>(r3 + ResPacketSize);

          traits.acc(C1, alphav, R1);
          traits.acc(C2, alphav, R2);
          traits.acc(C3, alphav, R3);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R5);
          traits.acc(C6, alphav, R6);
          traits.acc(C7, alphav, R0);

          pstoreu(r1, R1);
          pstoreu(r2, R2);
          pstoreu(r3, R3);
          pstoreu(r0 + ResPacketSize, R4);
          pstoreu(r1 + ResPacketSize, R5);
          pstoreu(r2 + ResPacketSize, R6);
          pstoreu(r3 + ResPacketSize, R0);
        }
        else
        {
          ResPacket R0, R1, R4;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = ploadu<ResPacket>(r0);
          R1 = ploadu<ResPacket>(r1);
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
          traits.acc(C0, alphav, R0);
          pstoreu(r0, R0);
          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
          traits.acc(C1, alphav, R1);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R0);
          pstoreu(r1, R1);
          pstoreu(r0 + ResPacketSize, R4);
          pstoreu(r1 + ResPacketSize, R0);
        }

      }

      // if at least LhsProgress rows remain, process one packet-wide row block
      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
        prefetch(&blA[0]);

        AccPacket C0, C1, C2, C3;
        traits.initAcc(C0);
        traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        {
          if(nr==2)
          {
            LhsPacket A0;
            RhsPacket B0, B1;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[2*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[1*LhsProgress], A0);
            traits.loadRhs(&blB[3*RhsProgress], B1);
            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[4*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[6*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[3*LhsProgress], A0);
            traits.loadRhs(&blB[7*RhsProgress], B1);
            traits.madd(A0,B0,C0,B0);
            traits.madd(A0,B1,C1,B1);
          }
          else
          {
            LhsPacket A0;
            RhsPacket B0, B1, B2, B3;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);
            traits.loadLhs(&blA[1*LhsProgress], A0);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[8*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[12*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);

            traits.loadLhs(&blA[3*LhsProgress], A0);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B0,C0,B0);
            traits.madd(A0,B1,C1,B1);
            traits.madd(A0,B2,C2,B2);
            traits.madd(A0,B3,C3,B3);
          }

          blB += nr*4*RhsProgress;
          blA += 4*LhsProgress;
        }

        for(Index k=peeled_kc; k<depth; k++)
        {
          if(nr==2)
          {
            LhsPacket A0;
            RhsPacket B0, B1;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B0,C0,B0);
            traits.madd(A0,B1,C1,B1);
          }
          else
          {
            LhsPacket A0;
            RhsPacket B0, B1, B2, B3;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.loadRhs(&blB[3*RhsProgress], B3);

            traits.madd(A0,B0,C0,B0);
            traits.madd(A0,B1,C1,B1);
            traits.madd(A0,B2,C2,B2);
            traits.madd(A0,B3,C3,B3);
          }

          blB += nr*RhsProgress;
          blA += LhsProgress;
        }

        ResPacket R0, R1, R2, R3;
        ResPacket alphav = pset1<ResPacket>(alpha);

        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

        R0 = ploadu<ResPacket>(r0);
        R1 = ploadu<ResPacket>(r1);
        if(nr==4) R2 = ploadu<ResPacket>(r2);
        if(nr==4) R3 = ploadu<ResPacket>(r3);

        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        if(nr==4) traits.acc(C2, alphav, R2);
        if(nr==4) traits.acc(C3, alphav, R3);

        pstoreu(r0, R0);
        pstoreu(r1, R1);
        if(nr==4) pstoreu(r2, R2);
        if(nr==4) pstoreu(r3, R3);
      }
      // process the remaining rows of the panel one scalar at a time
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

        ResScalar C0(0), C1(0), C2(0), C3(0);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        for(Index k=0; k<depth; k++)
        {
          if(nr==2)
          {
            LhsScalar A0;
            RhsScalar B0, B1;

            A0 = blA[k];
            B0 = blB[0];
            B1 = blB[1];
            MADD(cj,A0,B0,C0,B0);
            MADD(cj,A0,B1,C1,B1);
          }
          else
          {
            LhsScalar A0;
            RhsScalar B0, B1, B2, B3;

            A0 = blA[k];
            B0 = blB[0];
            B1 = blB[1];
            B2 = blB[2];
            B3 = blB[3];

            MADD(cj,A0,B0,C0,B0);
            MADD(cj,A0,B1,C1,B1);
            MADD(cj,A0,B2,C2,B2);
            MADD(cj,A0,B3,C3,B3);
          }

          blB += nr;
        }
        res[(j2+0)*resStride + i] += alpha*C0;
        res[(j2+1)*resStride + i] += alpha*C1;
        if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
        if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
      }
    }

    // process the remaining columns one by one (cols not a multiple of nr)
    for(Index j2=packet_cols; j2<cols; j2++)
    {
      // unpack the current rhs column
      traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);

      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        AccPacket C0, C4;
        traits.initAcc(C0);
        traits.initAcc(C4);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          LhsPacket A0, A1;
          RhsPacket B0;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B0);
          traits.madd(A0,B0,C0,T0);
          traits.madd(A1,B0,C4,B0);

          blB += RhsProgress;
          blA += 2*LhsProgress;
        }
        ResPacket R0, R4;
        ResPacket alphav = pset1<ResPacket>(alpha);

        ResScalar* r0 = &res[(j2+0)*resStride + i];

        R0 = ploadu<ResPacket>(r0);
        R4 = ploadu<ResPacket>(r0+ResPacketSize);

        traits.acc(C0, alphav, R0);
        traits.acc(C4, alphav, R4);

        pstoreu(r0, R0);
        pstoreu(r0+ResPacketSize, R4);
      }
      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
        prefetch(&blA[0]);

        AccPacket C0;
        traits.initAcc(C0);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          LhsPacket A0;
          RhsPacket B0;
          traits.loadLhs(blA, A0);
          traits.loadRhs(blB, B0);
          traits.madd(A0, B0, C0, B0);
          blB += RhsProgress;
          blA += LhsProgress;
        }

        ResPacket alphav = pset1<ResPacket>(alpha);
        ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
        traits.acc(C0, alphav, R0);
        pstoreu(&res[(j2+0)*resStride + i], R0);
      }
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

        ResScalar C0(0);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<depth; k++)
        {
          LhsScalar A0 = blA[k];
          RhsScalar B0 = blB[k];
          MADD(cj, A0, B0, C0, B0);
        }
        res[(j2+0)*resStride + i] += alpha*C0;
      }
    }
  }
};

#undef MADD  // the multiply-accumulate helper macro is only needed by the kernel above

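/* Packs a block of the lhs into blockA in the order expected by gebp_kernel:
 * rows are grouped into panels of Pack1 rows (then a possible panel of Pack2 rows,
 * then single rows), and within a panel the Pack1 coefficients of each column are
 * stored contiguously, column after column. When PanelMode is set, `offset`
 * leading and `stride-offset-depth` trailing coefficients are skipped for each
 * panel so that the packed block can be addressed inside a larger panel.
 */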
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
{
  void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
                  Index stride=0, Index offset=0)
  {
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
    Index count = 0;
    Index peeled_mc = (rows/Pack1)*Pack1;
    for(Index i=0; i<peeled_mc; i+=Pack1)
    {
      if(PanelMode) count += Pack1 * offset;                // skip what comes before the panel
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack1; w++)
          blockA[count++] = cj(lhs(i+w, k));
      if(PanelMode) count += Pack1 * (stride-offset-depth); // skip what comes after the panel
    }
    if(rows-peeled_mc>=Pack2)
    {
      if(PanelMode) count += Pack2*offset;
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(peeled_mc+w, k));
      if(PanelMode) count += Pack2 * (stride-offset-depth);
      peeled_mc += Pack2;
    }
    for(Index i=peeled_mc; i<rows; i++)
    {
      if(PanelMode) count += offset;
      for(Index k=0; k<depth; k++)
        blockA[count++] = cj(lhs(i, k));
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};

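/* Packs a complete panel of the rhs into blockB, nr columns at a time, so that
 * gebp_kernel can read the nr coefficients of each depth step at consecutive
 * addresses. This version is optimized for column major rhs matrices; the
 * remaining cols - packet_cols columns are packed one at a time. PanelMode
 * offsets are handled as in gemm_pack_lhs.
 */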
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
                  Index stride=0, Index offset=0)
  {
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what comes before the panel
      if(PanelMode) count += nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
        blockB[count+0] = cj(b0[k]);
        blockB[count+1] = cj(b1[k]);
        if(nr==4) blockB[count+2] = cj(b2[k]);
        if(nr==4) blockB[count+3] = cj(b3[k]);
        count += nr;
      }
      // skip what comes after the panel
      if(PanelMode) count += nr * (stride-offset-depth);
    }

    // copy the remaining columns one at a time (nr==1)
    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(b0[k]);
        count += 1;
      }
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};

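// Same as above, but optimized for row major rhs matrices: the nr coefficients of
// each depth step are already contiguous in memory.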
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
  enum { PacketSize = packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
                  Index stride=0, Index offset=0)
  {
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what comes before the panel
      if(PanelMode) count += nr * offset;
      for(Index k=0; k<depth; k++)
      {
        const Scalar* b0 = &rhs[k*rhsStride + j2];
        blockB[count+0] = cj(b0[0]);
        blockB[count+1] = cj(b0[1]);
        if(nr==4) blockB[count+2] = cj(b0[2]);
        if(nr==4) blockB[count+3] = cj(b0[3]);
        count += nr;
      }
      // skip what comes after the panel
      if(PanelMode) count += nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[j2];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(b0[k*rhsStride]);
        count += 1;
      }
      if(PanelMode) count += stride-offset-depth;
    }
  }
};

} // end namespace internal

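/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the
  * ideal blocking size parameters.
  * \sa setCpuCacheSizes */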
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l1;
}

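/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the
  * ideal blocking size parameters.
  * \sa setCpuCacheSizes */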
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l2;
}

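/** Set the cpu L1 and L2 cache sizes (in bytes).
  * These values are used to adjust the blocking size parameters computed by
  * computeProductBlockingSizes() for the matrix product kernels above.
  *
  * Illustrative usage (the cache sizes below are hypothetical, not detected values):
  * \code
  * Eigen::setCpuCacheSizes(32*1024, 4*1024*1024); // 32 KB L1, 4 MB last-level cache
  * \endcode
  * \sa l1CacheSize(), l2CacheSize() */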
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2);
}

#endif // EIGEN_GENERAL_BLOCK_PANEL_H