10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H    11 #define EIGEN_GENERAL_BLOCK_PANEL_H    17 template<
typename _LhsScalar, 
typename _RhsScalar, 
bool _ConjLhs=false, 
bool _ConjRhs=false>
    30   static std::ptrdiff_t m_l1CacheSize = 0;
    31   static std::ptrdiff_t m_l2CacheSize = 0;
    72 template<
typename LhsScalar, 
typename RhsScalar, 
int KcFactor, 
typename SizeType>
    83   std::ptrdiff_t l1, l2;
    87     kdiv = KcFactor * 2 * Traits::nr
    88          * Traits::RhsProgress * 
sizeof(RhsScalar),
    90     mr_mask = (0xffffffff/mr)*mr
    94   k = std::min<SizeType>(k, l1/kdiv);
    95   SizeType _m = k>0 ? l2/(4 * 
sizeof(LhsScalar) * k) : 0;
    96   if(_m<m) m = _m & mr_mask;
    99 template<
typename LhsScalar, 
typename RhsScalar, 
typename SizeType>
   102   computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
   105 #ifdef EIGEN_HAS_FUSE_CJMADD   106   #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);   121       t = b; t = cj.pmul(a,t); c = 
padd(c,t);
   125   template<
typename CJ, 
typename A, 
typename B, 
typename C, 
typename T>
   131   #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);   145 template<
typename _LhsScalar, 
typename _RhsScalar, 
bool _ConjLhs, 
bool _ConjRhs>
   164     nr = NumberOfRegisters/4,
   167     mr = 2 * LhsPacketSize,
   169     WorkSpaceFactor = nr * RhsPacketSize,
   171     LhsProgress = LhsPacketSize,
   172     RhsProgress = RhsPacketSize
   187     p = pset1<ResPacket>(ResScalar(0));
   193       pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
   198     dest = pload<RhsPacket>(b);
   203     dest = pload<LhsPacket>(a);
   208     tmp = b; tmp = 
pmul(a,tmp); c = 
padd(c,tmp);
   213     r = 
pmadd(c,alpha,r);
   221 template<
typename RealScalar, 
bool _ConjLhs>
   238     nr = NumberOfRegisters/4,
   239     mr = 2 * LhsPacketSize,
   240     WorkSpaceFactor = nr*RhsPacketSize,
   242     LhsProgress = LhsPacketSize,
   243     RhsProgress = RhsPacketSize
   258     p = pset1<ResPacket>(ResScalar(0));
   264       pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
   269     dest = pload<RhsPacket>(b);
   274     dest = pload<LhsPacket>(a);
   284     tmp = b; tmp = 
pmul(a.v,tmp); c.v = 
padd(c.v,tmp);
   294     r = cj.pmadd(c,alpha,r);
   301 template<
typename RealScalar, 
bool _ConjLhs, 
bool _ConjRhs>
   302 class gebp_traits<
std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
   319     mr = 2 * ResPacketSize,
   320     WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
   322     LhsProgress = ResPacketSize,
   323     RhsProgress = Vectorizable ? 2*ResPacketSize : 1
   343     p.first   = pset1<RealPacket>(RealScalar(0));
   344     p.second  = pset1<RealPacket>(RealScalar(0));
   357         pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             
real(rhs[k]));
   358         pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], 
imag(rhs[k]));
   369     dest.first  = pload<RealPacket>((
const RealScalar*)b);
   370     dest.second = pload<RealPacket>((
const RealScalar*)(b+ResPacketSize));
   381     c.first   = 
padd(
pmul(a,b.first), c.first);
   382     c.second  = 
padd(
pmul(a,b.second),c.second);
   396     if((!ConjLhs)&&(!ConjRhs))
   399       tmp = 
padd(ResPacket(c.first),tmp);
   401     else if((!ConjLhs)&&(ConjRhs))
   404       tmp = 
padd(ResPacket(c.first),tmp);
   406     else if((ConjLhs)&&(!ConjRhs))
   409       tmp = 
padd(
pconj(ResPacket(c.first)),tmp);
   411     else if((ConjLhs)&&(ConjRhs))
   414       tmp = 
psub(
pconj(ResPacket(c.first)),tmp);
   417     r = 
pmadd(tmp,alpha,r);
   424 template<
typename RealScalar, 
bool _ConjRhs>
   444     mr = 2*ResPacketSize,
   445     WorkSpaceFactor = nr*RhsPacketSize,
   447     LhsProgress = ResPacketSize,
   448     RhsProgress = ResPacketSize
   463     p = pset1<ResPacket>(ResScalar(0));
   469       pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
   474     dest = pload<RhsPacket>(b);
   479     dest = ploaddup<LhsPacket>(a);
   489     tmp = b; tmp.v = 
pmul(a,tmp.v); c = 
padd(c,tmp);
   499     r = cj.pmadd(alpha,c,r);
   513 template<
typename LhsScalar, 
typename RhsScalar, 
typename Index, 
int mr, 
int nr, 
bool ConjugateLhs, 
bool ConjugateRhs>
   524     Vectorizable  = Traits::Vectorizable,
   525     LhsProgress   = Traits::LhsProgress,
   526     RhsProgress   = Traits::RhsProgress,
   527     ResPacketSize = Traits::ResPacketSize
   531   void operator()(ResScalar* res, Index resStride, 
const LhsScalar* blockA, 
const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
   532                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
   535 template<
typename LhsScalar, 
typename RhsScalar, 
typename Index, 
int mr, 
int nr, 
bool ConjugateLhs, 
bool ConjugateRhs>
   539                Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
   543     if(strideA==-1) strideA = depth;
   544     if(strideB==-1) strideB = depth;
   547     Index packet_cols = (cols/nr) * nr;
   548     const Index peeled_mc = (rows/mr)*mr;
   550     const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
   551     const Index peeled_kc = (depth/4)*4;
   554       unpackedB = 
const_cast<RhsScalar*
>(blockB - strideB * nr * RhsProgress);
   557     for(Index j2=0; j2<packet_cols; j2+=nr)
   559       traits.
unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); 
   564       for(Index i=0; i<peeled_mc; i+=mr)
   566         const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
   570         AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
   580         ResScalar* r0 = &res[(j2+0)*resStride + i];
   593         const RhsScalar* blB = unpackedB;
   594         for(Index k=0; k<peeled_kc; k+=4)
   603             traits.
loadLhs(&blA[0*LhsProgress], A0);
   604             traits.
loadLhs(&blA[1*LhsProgress], A1);
   605             traits.
loadRhs(&blB[0*RhsProgress], B_0);
   606             traits.
madd(A0,B_0,C0,T0);
   607             traits.
madd(A1,B_0,C4,B_0);
   608             traits.
loadRhs(&blB[1*RhsProgress], B_0);
   609             traits.
madd(A0,B_0,C1,T0);
   610             traits.
madd(A1,B_0,C5,B_0);
   612             traits.
loadLhs(&blA[2*LhsProgress], A0);
   613             traits.
loadLhs(&blA[3*LhsProgress], A1);
   614             traits.
loadRhs(&blB[2*RhsProgress], B_0);
   615             traits.
madd(A0,B_0,C0,T0);
   616             traits.
madd(A1,B_0,C4,B_0);
   617             traits.
loadRhs(&blB[3*RhsProgress], B_0);
   618             traits.
madd(A0,B_0,C1,T0);
   619             traits.
madd(A1,B_0,C5,B_0);
   621             traits.
loadLhs(&blA[4*LhsProgress], A0);
   622             traits.
loadLhs(&blA[5*LhsProgress], A1);
   623             traits.
loadRhs(&blB[4*RhsProgress], B_0);
   624             traits.
madd(A0,B_0,C0,T0);
   625             traits.
madd(A1,B_0,C4,B_0);
   626             traits.
loadRhs(&blB[5*RhsProgress], B_0);
   627             traits.
madd(A0,B_0,C1,T0);
   628             traits.
madd(A1,B_0,C5,B_0);
   630             traits.
loadLhs(&blA[6*LhsProgress], A0);
   631             traits.
loadLhs(&blA[7*LhsProgress], A1);
   632             traits.
loadRhs(&blB[6*RhsProgress], B_0);
   633             traits.
madd(A0,B_0,C0,T0);
   634             traits.
madd(A1,B_0,C4,B_0);
   635             traits.
loadRhs(&blB[7*RhsProgress], B_0);
   636             traits.
madd(A0,B_0,C1,T0);
   637             traits.
madd(A1,B_0,C5,B_0);
   647             traits.
loadLhs(&blA[0*LhsProgress], A0);
   648             traits.
loadLhs(&blA[1*LhsProgress], A1);
   649             traits.
loadRhs(&blB[0*RhsProgress], B_0);
   650             traits.
loadRhs(&blB[1*RhsProgress], B1);
   652             traits.
madd(A0,B_0,C0,T0);
   653             traits.
loadRhs(&blB[2*RhsProgress], B2);
   654             traits.
madd(A1,B_0,C4,B_0);
   655             traits.
loadRhs(&blB[3*RhsProgress], B3);
   656             traits.
loadRhs(&blB[4*RhsProgress], B_0);
   657             traits.
madd(A0,B1,C1,T0);
   658             traits.
madd(A1,B1,C5,B1);
   659             traits.
loadRhs(&blB[5*RhsProgress], B1);
   660             traits.
madd(A0,B2,C2,T0);
   661             traits.
madd(A1,B2,C6,B2);
   662             traits.
loadRhs(&blB[6*RhsProgress], B2);
   663             traits.
madd(A0,B3,C3,T0);
   664             traits.
loadLhs(&blA[2*LhsProgress], A0);
   665             traits.
madd(A1,B3,C7,B3);
   666             traits.
loadLhs(&blA[3*LhsProgress], A1);
   667             traits.
loadRhs(&blB[7*RhsProgress], B3);
   668             traits.
madd(A0,B_0,C0,T0);
   669             traits.
madd(A1,B_0,C4,B_0);
   670             traits.
loadRhs(&blB[8*RhsProgress], B_0);
   671             traits.
madd(A0,B1,C1,T0);
   672             traits.
madd(A1,B1,C5,B1);
   673             traits.
loadRhs(&blB[9*RhsProgress], B1);
   674             traits.
madd(A0,B2,C2,T0);
   675             traits.
madd(A1,B2,C6,B2);
   676             traits.
loadRhs(&blB[10*RhsProgress], B2);
   677             traits.
madd(A0,B3,C3,T0);
   678             traits.
loadLhs(&blA[4*LhsProgress], A0);
   679             traits.
madd(A1,B3,C7,B3);
   680             traits.
loadLhs(&blA[5*LhsProgress], A1);
   681             traits.
loadRhs(&blB[11*RhsProgress], B3);
   683             traits.
madd(A0,B_0,C0,T0);
   684             traits.
madd(A1,B_0,C4,B_0);
   685             traits.
loadRhs(&blB[12*RhsProgress], B_0);
   686             traits.
madd(A0,B1,C1,T0);
   687             traits.
madd(A1,B1,C5,B1);
   688             traits.
loadRhs(&blB[13*RhsProgress], B1);
   689             traits.
madd(A0,B2,C2,T0);
   690             traits.
madd(A1,B2,C6,B2);
   691             traits.
loadRhs(&blB[14*RhsProgress], B2);
   692             traits.
madd(A0,B3,C3,T0);
   693             traits.
loadLhs(&blA[6*LhsProgress], A0);
   694             traits.
madd(A1,B3,C7,B3);
   695             traits.
loadLhs(&blA[7*LhsProgress], A1);
   696             traits.
loadRhs(&blB[15*RhsProgress], B3);
   697             traits.
madd(A0,B_0,C0,T0);
   698             traits.
madd(A1,B_0,C4,B_0);
   699             traits.
madd(A0,B1,C1,T0);
   700             traits.
madd(A1,B1,C5,B1);
   701             traits.
madd(A0,B2,C2,T0);
   702             traits.
madd(A1,B2,C6,B2);
   703             traits.
madd(A0,B3,C3,T0);
   704             traits.
madd(A1,B3,C7,B3);
   707           blB += 4*nr*RhsProgress;
   711         for(Index k=peeled_kc; k<depth; k++)
   719             traits.
loadLhs(&blA[0*LhsProgress], A0);
   720             traits.
loadLhs(&blA[1*LhsProgress], A1);
   721             traits.
loadRhs(&blB[0*RhsProgress], B_0);
   722             traits.
madd(A0,B_0,C0,T0);
   723             traits.
madd(A1,B_0,C4,B_0);
   724             traits.
loadRhs(&blB[1*RhsProgress], B_0);
   725             traits.
madd(A0,B_0,C1,T0);
   726             traits.
madd(A1,B_0,C5,B_0);
   734             traits.
loadLhs(&blA[0*LhsProgress], A0);
   735             traits.
loadLhs(&blA[1*LhsProgress], A1);
   736             traits.
loadRhs(&blB[0*RhsProgress], B_0);
   737             traits.
loadRhs(&blB[1*RhsProgress], B1);
   739             traits.
madd(A0,B_0,C0,T0);
   740             traits.
loadRhs(&blB[2*RhsProgress], B2);
   741             traits.
madd(A1,B_0,C4,B_0);
   742             traits.
loadRhs(&blB[3*RhsProgress], B3);
   743             traits.
madd(A0,B1,C1,T0);
   744             traits.
madd(A1,B1,C5,B1);
   745             traits.
madd(A0,B2,C2,T0);
   746             traits.
madd(A1,B2,C6,B2);
   747             traits.
madd(A0,B3,C3,T0);
   748             traits.
madd(A1,B3,C7,B3);
   751           blB += nr*RhsProgress;
   758           ResPacket alphav = pset1<ResPacket>(alpha);
   760           R0 = ploadu<ResPacket>(r0);
   761           R1 = ploadu<ResPacket>(r1);
   762           R2 = ploadu<ResPacket>(r2);
   763           R3 = ploadu<ResPacket>(r3);
   764           R4 = ploadu<ResPacket>(r0 + ResPacketSize);
   765           R5 = ploadu<ResPacket>(r1 + ResPacketSize);
   766           R6 = ploadu<ResPacket>(r2 + ResPacketSize);
   767           traits.
acc(C0, alphav, R0);
   769           R0 = ploadu<ResPacket>(r3 + ResPacketSize);
   771           traits.
acc(C1, alphav, R1);
   772           traits.
acc(C2, alphav, R2);
   773           traits.
acc(C3, alphav, R3);
   774           traits.
acc(C4, alphav, R4);
   775           traits.
acc(C5, alphav, R5);
   776           traits.
acc(C6, alphav, R6);
   777           traits.
acc(C7, alphav, R0);
   782           pstoreu(r0 + ResPacketSize, R4);
   783           pstoreu(r1 + ResPacketSize, R5);
   784           pstoreu(r2 + ResPacketSize, R6);
   785           pstoreu(r3 + ResPacketSize, R0);
   790           ResPacket alphav = pset1<ResPacket>(alpha);
   792           R0 = ploadu<ResPacket>(r0);
   793           R1 = ploadu<ResPacket>(r1);
   794           R4 = ploadu<ResPacket>(r0 + ResPacketSize);
   795           traits.
acc(C0, alphav, R0);
   797           R0 = ploadu<ResPacket>(r1 + ResPacketSize);
   798           traits.
acc(C1, alphav, R1);
   799           traits.
acc(C4, alphav, R4);
   800           traits.
acc(C5, alphav, R0);
   802           pstoreu(r0 + ResPacketSize, R4);
   803           pstoreu(r1 + ResPacketSize, R0);
   808       if(rows-peeled_mc>=LhsProgress)
   811         const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
   822         const RhsScalar* blB = unpackedB;
   823         for(Index k=0; k<peeled_kc; k+=4)
   830             traits.
loadLhs(&blA[0*LhsProgress], A0);
   831             traits.
loadRhs(&blB[0*RhsProgress], B_0);
   832             traits.
loadRhs(&blB[1*RhsProgress], B1);
   833             traits.
madd(A0,B_0,C0,B_0);
   834             traits.
loadRhs(&blB[2*RhsProgress], B_0);
   835             traits.
madd(A0,B1,C1,B1);
   836             traits.
loadLhs(&blA[1*LhsProgress], A0);
   837             traits.
loadRhs(&blB[3*RhsProgress], B1);
   838             traits.
madd(A0,B_0,C0,B_0);
   839             traits.
loadRhs(&blB[4*RhsProgress], B_0);
   840             traits.
madd(A0,B1,C1,B1);
   841             traits.
loadLhs(&blA[2*LhsProgress], A0);
   842             traits.
loadRhs(&blB[5*RhsProgress], B1);
   843             traits.
madd(A0,B_0,C0,B_0);
   844             traits.
loadRhs(&blB[6*RhsProgress], B_0);
   845             traits.
madd(A0,B1,C1,B1);
   846             traits.
loadLhs(&blA[3*LhsProgress], A0);
   847             traits.
loadRhs(&blB[7*RhsProgress], B1);
   848             traits.
madd(A0,B_0,C0,B_0);
   849             traits.
madd(A0,B1,C1,B1);
   856             traits.
loadLhs(&blA[0*LhsProgress], A0);
   857             traits.
loadRhs(&blB[0*RhsProgress], B_0);
   858             traits.
loadRhs(&blB[1*RhsProgress], B1);
   860             traits.
madd(A0,B_0,C0,B_0);
   861             traits.
loadRhs(&blB[2*RhsProgress], B2);
   862             traits.
loadRhs(&blB[3*RhsProgress], B3);
   863             traits.
loadRhs(&blB[4*RhsProgress], B_0);
   864             traits.
madd(A0,B1,C1,B1);
   865             traits.
loadRhs(&blB[5*RhsProgress], B1);
   866             traits.
madd(A0,B2,C2,B2);
   867             traits.
loadRhs(&blB[6*RhsProgress], B2);
   868             traits.
madd(A0,B3,C3,B3);
   869             traits.
loadLhs(&blA[1*LhsProgress], A0);
   870             traits.
loadRhs(&blB[7*RhsProgress], B3);
   871             traits.
madd(A0,B_0,C0,B_0);
   872             traits.
loadRhs(&blB[8*RhsProgress], B_0);
   873             traits.
madd(A0,B1,C1,B1);
   874             traits.
loadRhs(&blB[9*RhsProgress], B1);
   875             traits.
madd(A0,B2,C2,B2);
   876             traits.
loadRhs(&blB[10*RhsProgress], B2);
   877             traits.
madd(A0,B3,C3,B3);
   878             traits.
loadLhs(&blA[2*LhsProgress], A0);
   879             traits.
loadRhs(&blB[11*RhsProgress], B3);
   881             traits.
madd(A0,B_0,C0,B_0);
   882             traits.
loadRhs(&blB[12*RhsProgress], B_0);
   883             traits.
madd(A0,B1,C1,B1);
   884             traits.
loadRhs(&blB[13*RhsProgress], B1);
   885             traits.
madd(A0,B2,C2,B2);
   886             traits.
loadRhs(&blB[14*RhsProgress], B2);
   887             traits.
madd(A0,B3,C3,B3);
   889             traits.
loadLhs(&blA[3*LhsProgress], A0);
   890             traits.
loadRhs(&blB[15*RhsProgress], B3);
   891             traits.
madd(A0,B_0,C0,B_0);
   892             traits.
madd(A0,B1,C1,B1);
   893             traits.
madd(A0,B2,C2,B2);
   894             traits.
madd(A0,B3,C3,B3);
   897           blB += nr*4*RhsProgress;
   898           blA += 4*LhsProgress;
   901         for(Index k=peeled_kc; k<depth; k++)
   908             traits.
loadLhs(&blA[0*LhsProgress], A0);
   909             traits.
loadRhs(&blB[0*RhsProgress], B_0);
   910             traits.
loadRhs(&blB[1*RhsProgress], B1);
   911             traits.
madd(A0,B_0,C0,B_0);
   912             traits.
madd(A0,B1,C1,B1);
   919             traits.
loadLhs(&blA[0*LhsProgress], A0);
   920             traits.
loadRhs(&blB[0*RhsProgress], B_0);
   921             traits.
loadRhs(&blB[1*RhsProgress], B1);
   922             traits.
loadRhs(&blB[2*RhsProgress], B2);
   923             traits.
loadRhs(&blB[3*RhsProgress], B3);
   925             traits.
madd(A0,B_0,C0,B_0);
   926             traits.
madd(A0,B1,C1,B1);
   927             traits.
madd(A0,B2,C2,B2);
   928             traits.
madd(A0,B3,C3,B3);
   931           blB += nr*RhsProgress;
   936         ResPacket alphav = pset1<ResPacket>(alpha);
   938         ResScalar* r0 = &res[(j2+0)*resStride + i];
   943                   R0 = ploadu<ResPacket>(r0);
   944                   R1 = ploadu<ResPacket>(r1);
   945         if(nr==4) R2 = ploadu<ResPacket>(r2);
   946         if(nr==4) R3 = ploadu<ResPacket>(r3);
   948                   traits.
acc(C0, alphav, R0);
   949                   traits.
acc(C1, alphav, R1);
   950         if(nr==4) traits.
acc(C2, alphav, R2);
   951         if(nr==4) traits.
acc(C3, alphav, R3);
   958       for(Index i=peeled_mc2; i<rows; i++)
   960         const LhsScalar* blA = &blockA[i*strideA+offsetA];
   966         const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
   967         for(Index k=0; k<depth; k++)
   977             MADD(cj,A0,B_0,C0,B_0);
   978             MADD(cj,A0,B1,C1,B1);
   983             RhsScalar B_0, B1, B2, B3;
   991             MADD(cj,A0,B_0,C0,B_0);
   992             MADD(cj,A0,B1,C1,B1);
   993             MADD(cj,A0,B2,C2,B2);
   994             MADD(cj,A0,B3,C3,B3);
   999                   res[(j2+0)*resStride + i] += alpha*C0;
  1000                   res[(j2+1)*resStride + i] += alpha*C1;
  1001         if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
  1002         if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
  1007     for(Index j2=packet_cols; j2<cols; j2++)
  1010       traits.
unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
  1012       for(Index i=0; i<peeled_mc; i+=mr)
  1014         const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
  1024         const RhsScalar* blB = unpackedB;
  1025         for(Index k=0; k<depth; k++)
  1031           traits.
loadLhs(&blA[0*LhsProgress], A0);
  1032           traits.
loadLhs(&blA[1*LhsProgress], A1);
  1033           traits.
loadRhs(&blB[0*RhsProgress], B_0);
  1034           traits.
madd(A0,B_0,C0,T0);
  1035           traits.
madd(A1,B_0,C4,B_0);
  1038           blA += 2*LhsProgress;
  1041         ResPacket alphav = pset1<ResPacket>(alpha);
  1043         ResScalar* r0 = &res[(j2+0)*resStride + i];
  1045         R0 = ploadu<ResPacket>(r0);
  1046         R4 = ploadu<ResPacket>(r0+ResPacketSize);
  1048         traits.
acc(C0, alphav, R0);
  1049         traits.
acc(C4, alphav, R4);
  1052         pstoreu(r0+ResPacketSize, R4);
  1054       if(rows-peeled_mc>=LhsProgress)
  1056         Index i = peeled_mc;
  1057         const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
  1063         const RhsScalar* blB = unpackedB;
  1064         for(Index k=0; k<depth; k++)
  1070           traits.
madd(A0, B_0, C0, B_0);
  1075         ResPacket alphav = pset1<ResPacket>(alpha);
  1076         ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
  1077         traits.
acc(C0, alphav, R0);
  1078         pstoreu(&res[(j2+0)*resStride + i], R0);
  1080       for(Index i=peeled_mc2; i<rows; i++)
  1082         const LhsScalar* blA = &blockA[i*strideA+offsetA];
  1088         const RhsScalar* blB = &blockB[j2*strideB+offsetB];
  1089         for(Index k=0; k<depth; k++)
  1091           LhsScalar A0 = blA[k];
  1092           RhsScalar B_0 = blB[k];
  1093           MADD(cj, A0, B_0, C0, B_0);
  1095         res[(j2+0)*resStride + i] += alpha*C0;
  1117 template<
typename Scalar, 
typename Index, 
int Pack1, 
int Pack2, 
int StorageOrder, 
bool Conjugate, 
bool PanelMode>
  1120   EIGEN_DONT_INLINE void operator()(Scalar* blockA, 
const Scalar* 
EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
  1123 template<
typename Scalar, 
typename Index, 
int Pack1, 
int Pack2, 
int StorageOrder, 
bool Conjugate, 
bool PanelMode>
  1131   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  1136   Index peeled_mc = (rows/Pack1)*Pack1;
  1137   for(Index i=0; i<peeled_mc; i+=Pack1)
  1139     if(PanelMode) count += Pack1 * offset;
  1143       for(Index k=0; k<depth; k++)
  1146         if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
  1147         if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
  1148         if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
  1149         if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
  1150         if(Pack1>=1*PacketSize) { 
pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
  1151         if(Pack1>=2*PacketSize) { 
pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
  1152         if(Pack1>=3*PacketSize) { 
pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
  1153         if(Pack1>=4*PacketSize) { 
pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
  1158       for(Index k=0; k<depth; k++)
  1162         for(; w<Pack1-3; w+=4)
  1164           Scalar a(cj(lhs(i+w+0, k))),
  1165                   b(cj(lhs(i+w+1, k))),
  1166                   c(cj(lhs(i+w+2, k))),
  1167                   d(cj(lhs(i+w+3, k)));
  1168           blockA[count++] = a;
  1169           blockA[count++] = b;
  1170           blockA[count++] = c;
  1171           blockA[count++] = d;
  1175             blockA[count++] = cj(lhs(i+w, k));
  1178     if(PanelMode) count += Pack1 * (stride-offset-depth);
  1180   if(rows-peeled_mc>=Pack2)
  1182     if(PanelMode) count += Pack2*offset;
  1183     for(Index k=0; k<depth; k++)
  1184       for(Index 
w=0; 
w<Pack2; 
w++)
  1185         blockA[count++] = cj(lhs(peeled_mc+
w, k));
  1186     if(PanelMode) count += Pack2 * (stride-offset-depth);
  1189   for(Index i=peeled_mc; i<rows; i++)
  1191     if(PanelMode) count += offset;
  1192     for(Index k=0; k<depth; k++)
  1193       blockA[count++] = cj(lhs(i, k));
  1194     if(PanelMode) count += (stride-offset-depth);
  1205 template<
typename Scalar, 
typename Index, 
int nr, 
bool Conjugate, 
bool PanelMode>
  1210   EIGEN_DONT_INLINE void operator()(Scalar* blockB, 
const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
  1213 template<
typename Scalar, 
typename Index, 
int nr, 
bool Conjugate, 
bool PanelMode>
  1215   ::operator()(Scalar* blockB, 
const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
  1218   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  1220   Index packet_cols = (cols/nr) * nr;
  1222   for(Index j2=0; j2<packet_cols; j2+=nr)
  1225     if(PanelMode) count += nr * offset;
  1226     const Scalar* b0 = &rhs[(j2+0)*rhsStride];
  1227     const Scalar* b1 = &rhs[(j2+1)*rhsStride];
  1228     const Scalar* b2 = &rhs[(j2+2)*rhsStride];
  1229     const Scalar* b3 = &rhs[(j2+3)*rhsStride];
  1230     for(Index k=0; k<depth; k++)
  1232                 blockB[count+0] = cj(b0[k]);
  1233                 blockB[count+1] = cj(b1[k]);
  1234       if(nr==4) blockB[count+2] = cj(b2[k]);
  1235       if(nr==4) blockB[count+3] = cj(b3[k]);
  1239     if(PanelMode) count += nr * (stride-offset-depth);
  1243   for(Index j2=packet_cols; j2<cols; ++j2)
  1245     if(PanelMode) count += offset;
  1246     const Scalar* b0 = &rhs[(j2+0)*rhsStride];
  1247     for(Index k=0; k<depth; k++)
  1249       blockB[
count] = cj(b0[k]);
  1252     if(PanelMode) count += (stride-offset-depth);
  1257 template<
typename Scalar, 
typename Index, 
int nr, 
bool Conjugate, 
bool PanelMode>
  1261   EIGEN_DONT_INLINE void operator()(Scalar* blockB, 
const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
  1264 template<
typename Scalar, 
typename Index, 
int nr, 
bool Conjugate, 
bool PanelMode>
  1266   ::operator()(Scalar* blockB, 
const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
  1269   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  1271   Index packet_cols = (cols/nr) * nr;
  1273   for(Index j2=0; j2<packet_cols; j2+=nr)
  1276     if(PanelMode) count += nr * offset;
  1277     for(Index k=0; k<depth; k++)
  1279       const Scalar* b0 = &rhs[k*rhsStride + j2];
  1280                 blockB[count+0] = cj(b0[0]);
  1281                 blockB[count+1] = cj(b0[1]);
  1282       if(nr==4) blockB[count+2] = cj(b0[2]);
  1283       if(nr==4) blockB[count+3] = cj(b0[3]);
  1287     if(PanelMode) count += nr * (stride-offset-depth);
  1290   for(Index j2=packet_cols; j2<cols; ++j2)
  1292     if(PanelMode) count += offset;
  1293     const Scalar* b0 = &rhs[j2];
  1294     for(Index k=0; k<depth; k++)
  1296       blockB[
count] = cj(b0[k*rhsStride]);
  1299     if(PanelMode) count += stride-offset-depth;
  1309   std::ptrdiff_t l1, l2;
  1318   std::ptrdiff_t l1, l2;
  1335 #endif // EIGEN_GENERAL_BLOCK_PANEL_H 
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
EIGEN_STRONG_INLINE void acc(const AccPacket &c, const ResPacket &alpha, ResPacket &r) const 
EIGEN_STRONG_INLINE void initAcc(DoublePacket &p)
EIGEN_DONT_INLINE void operator()(Scalar *blockA, const Scalar *EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0)
packet_traits< RhsScalar >::type _RhsPacket
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar *a, LhsPacket &dest) const 
EIGEN_STRONG_INLINE void acc(const AccPacket &c, const ResPacket &alpha, ResPacket &r) const 
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs > Traits
#define EIGEN_STRONG_INLINE
EIGEN_STRONG_INLINE void madd(const LhsPacket &a, const RhsPacket &b, DoublePacket &c, RhsPacket &) const 
static EIGEN_ALWAYS_INLINE void run(const CJ &cj, T &a, T &b, T &c, T &t)
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
std::complex< RealScalar > Scalar
void computeProductBlockingSizes(SizeType &k, SizeType &m, SizeType &n)
Computes the blocking parameters for a m x k times k x n matrix product. 
#define EIGEN_ASM_COMMENT(X)
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar *rhs, RhsScalar *b)
EIGEN_STRONG_INLINE void gebp_madd(const CJ &cj, A &a, B &b, C &c, T &t)
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
packet_traits< LhsScalar >::type _LhsPacket
static EIGEN_ALWAYS_INLINE void run(const CJ &cj, A &a, B &b, C &c, T &)
conditional< Vectorizable, DoublePacket, Scalar >::type AccPacket
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, DoublePacket &dest) const 
packet_traits< Scalar >::type Packet
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar *rhs, RhsScalar *b)
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacket &dest) const 
EIGEN_STRONG_INLINE void initAcc(Scalar &p)
std::complex< RealScalar > RhsScalar
#define EIGEN_UNUSED_VARIABLE(var)
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar *rhs, Scalar *b)
EIGEN_STRONG_INLINE void acc(const DoublePacket &c, const ResPacket &alpha, ResPacket &r) const 
std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket &a, const RhsPacket &b, AccPacket &c, RhsPacket &tmp, const true_type &) const 
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
void manage_caching_sizes(Action action, std::ptrdiff_t *l1=0, std::ptrdiff_t *l2=0)
#define MADD(CJ, A, B, C, T)
#define eigen_internal_assert(x)
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacket &dest) const 
EIGEN_STRONG_INLINE void acc(const AccPacket &c, const ResPacket &alpha, ResPacket &r) const 
conj_helper< LhsScalar, RhsScalar, ConjLhs, ConjRhs > cj
const ImagReturnType imag() const 
Traits::ResScalar ResScalar
EIGEN_STRONG_INLINE void madd(const LhsPacket &a, const RhsPacket &b, AccPacket &c, RhsPacket &tmp) const 
EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf &x)
void pstore(Scalar *to, const Packet &from)
EIGEN_DONT_INLINE void operator()(ResScalar *res, Index resStride, const LhsScalar *blockA, const RhsScalar *blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar *unpackedB=0)
Traits::LhsPacket LhsPacket
packet_traits< RealScalar >::type RealPacket
EIGEN_STRONG_INLINE void madd(const LhsPacket &a, const RhsPacket &b, AccPacket &c, RhsPacket &tmp) const 
conj_helper< ResPacket, ResPacket, false, ConjRhs > cj
RealReturnType real() const 
void pstoreu(Scalar *to, const Packet &from)
EIGEN_STRONG_INLINE void initAcc(AccPacket &p)
void prefetch(const Scalar *addr)
packet_traits< RhsScalar >::type _RhsPacket
std::ptrdiff_t l2CacheSize()
conditional< Vectorizable, ScalarPacket, Scalar >::type ResPacket
packet_traits< RhsScalar >::type _RhsPacket
EIGEN_STRONG_INLINE void madd(const LhsPacket &a, const RhsPacket &b, ResPacket &c, RhsPacket &) const 
Packet psub(const Packet &a, const Packet &b)
packet_traits< LhsScalar >::type _LhsPacket
Traits::RhsPacket RhsPacket
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, ResPacket &dest) const 
conditional< Vectorizable, RealPacket, Scalar >::type LhsPacket
void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
scalar_product_traits< LhsScalar, RhsScalar >::ReturnType ResScalar
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
packet_traits< ResScalar >::type _ResPacket
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar *a, LhsPacket &dest) const 
packet_traits< Scalar >::type ScalarPacket
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacket &dest) const 
EIGEN_STRONG_INLINE void madd(const LhsPacket &a, const RhsPacket &b, AccPacket &c, AccPacket &tmp) const 
conj_helper< ResPacket, ResPacket, ConjLhs, false > cj
scalar_product_traits< LhsScalar, RhsScalar >::ReturnType ResScalar
EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex
conditional< Vectorizable, DoublePacket, Scalar >::type RhsPacket
TFSIMD_FORCE_INLINE const tfScalar & w() const 
std::complex< RealScalar > LhsScalar
EIGEN_STRONG_INLINE void initAcc(AccPacket &p)
packet_traits< ResScalar >::type _ResPacket
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
std::complex< RealScalar > LhsScalar
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar *a, LhsPacket &dest) const 
Traits::ResPacket ResPacket
#define EIGEN_ALWAYS_INLINE
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Packet pmul(const Packet &a, const Packet &b)
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar &a, const RhsScalar &b, ResScalar &c, RhsScalar &, const false_type &) const 
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar &a, const RhsScalar &b, ResScalar &c, RhsScalar &, const false_type &) const 
std::complex< RealScalar > ResScalar
EIGEN_STRONG_INLINE void initAcc(AccPacket &p)
#define EIGEN_DONT_INLINE
EIGEN_STRONG_INLINE void acc(const Scalar &c, const Scalar &alpha, Scalar &r) const 
packet_traits< ResScalar >::type _ResPacket
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket &a, const RhsPacket &b, AccPacket &c, RhsPacket &tmp, const true_type &) const 
Traits::AccPacket AccPacket
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar *a, LhsPacket &dest) const 
std::ptrdiff_t l1CacheSize()
int queryTopLevelCacheSize()
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
Packet padd(const Packet &a, const Packet &b)
std::complex< RealScalar > Scalar
packet_traits< LhsScalar >::type _LhsPacket
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar *rhs, RhsScalar *b)