#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

// Forward declaration: gebp_traits selects the packet types and the mr x nr
// register-blocking geometry for each scalar-type combination.
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;

#if EIGEN_ARCH_i386_OR_x86_64
// Fallback cache sizes used when the runtime CPU query is unavailable.
const std::ptrdiff_t defaultL1CacheSize = 16*1024;
const std::ptrdiff_t defaultL2CacheSize = 512*1024;
const std::ptrdiff_t defaultL3CacheSize = 512*1024;
#else
// ...
#endif
// Stores or retrieves the per-CPU cache sizes that drive the blocking
// heuristics below (CacheSizes and the Action enum are defined in elided code).
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
{
  static CacheSizes m_cacheSizes;  // function-local cache of the sizes
  if(action==SetAction)
  {
    m_cacheSizes.m_l1 = *l1;
    m_cacheSizes.m_l2 = *l2;
    m_cacheSizes.m_l3 = *l3;
  }
  else if(action==GetAction)
  {
    *l1 = m_cacheSizes.m_l1;
    *l2 = m_cacheSizes.m_l2;
    *l3 = m_cacheSizes.m_l3;
  }
}
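// Usage sketch (illustrative, not part of the original listing): assuming the
// public accessors that wrap this helper, the sizes can be queried or
// overridden, e.g. to benchmark the blocking heuristic with pinned values:
//
//   std::ptrdiff_t l1 = Eigen::l1CacheSize();                 // bytes
//   Eigen::setCpuCacheSizes(32*1024, 256*1024, 8*1024*1024);  // L1, L2, L3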
// Heuristic selection of the blocking sizes k (depth), m (rows), n (cols)
// such that the packed operand panels stay resident in the CPU caches.
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;

  // Query the cache sizes on which all decisions below are based.
  std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
      kr = 8,
      mr = Traits::mr,
      nr = Traits::nr
    };
    // k: once prefetch latency is hidden, increasing k no longer helps, so cap it.
    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
    if (k_cache < k) {
      k = k_cache - (k_cache % kr);
      eigen_internal_assert(k > 0);
    }

    // n: do not exceed the capacity of the L2 cache.
    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread) {
      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
      n = n_cache - (n_cache % nr);
      eigen_internal_assert(n > 0);
    } else {
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
    }

    if (l3 > l2) {
      // m: L3 is shared between all cores, so give each thread its own chunk.
      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
      const Index m_per_thread = numext::div_ceil(m, num_threads);
      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
        m = m_cache - (m_cache % mr);
        eigen_internal_assert(m > 0);
      } else {
        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
      }
    }
  }
  else {
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    // Unit tests use small cache sizes to stress the blocking strategy.
    // ...
#endif

    // Early return for small problems: the computation below is not worth its
    // cost when every dimension is tiny.
    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
      return;

    typedef typename Traits::ResScalar ResScalar;
    enum {
      k_peeling = 8,
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
    };

    // k: largest depth such that one lhs micro-panel plus one rhs panel fit in
    // L1, rounded down to a multiple of the peeling factor.
    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    const Index old_k = k;
    if(k>max_kc)
    {
      // We are really blocking on the third dimension: shrink k so the last
      // block stays as large as possible while keeping the number of sweeps
      // over the result unchanged.
      k = (k%max_kc)==0 ? max_kc
                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    }

#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864; // == 1.5 MB, a conservative L2 estimate
#endif
    // n: if the lhs block plus a k x nr rhs panel still fit in L1, block the
    // rhs on L1, otherwise fall back to a fraction of L2.
    Index max_nc;
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
    {
      // L1 blocking
      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    }
    else
    {
      // L2 blocking
      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
    }
    // WARNING: the rounding below assumes Traits::nr is a power of two.
    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    if(n>nc)
    {
      // Blocking over the columns: shrink nc so the last block stays as large
      // as possible while keeping the same number of sweeps over the packed lhs.
      n = (n%nc)==0 ? nc
                    : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
    }
    else if(n>0)
    {
      // No blocking on k and n so far: block over the rows so the packed lhs
      // stays in L1/L2.
      Index problem_size = k*n*sizeof(LhsScalar);
      Index actual_lm = actual_l2;
      Index max_mc = m;
      if(problem_size<=1024)
      {
        // Small enough for L1: pick m such that the lhs block fits in 1/3 of L1.
        actual_lm = l1;
      }
      else if(l3!=0 && problem_size<=32768)
      {
        // Fits in L2: pick m accordingly, but keep it bounded.
        actual_lm = l2;
        max_mc = (numext::mini<Index>)(576,max_mc);
      }
      Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
      if (mc > Traits::mr) mc -= mc % Traits::mr;
      else if (mc==0) return;
      m = (m%mc)==0 ? mc
                    : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
    }
  }
}
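// Worked example (illustrative numbers, not normative): with AVX floats,
// LhsPacketSize is 8, so Traits::mr = 3*8 = 24 and Traits::nr = 4. Taking
// KcFactor == 1 gives
//   k_sub = 24*4*sizeof(float)   = 384 bytes (one accumulator tile),
//   k_div = (24+4)*sizeof(float) = 112 bytes of packed data per depth step,
// so with l1 = 32*1024 and k_peeling = 8:
//   max_kc = ((32768 - 384)/112) & ~7 = 288,
// i.e. panels of up to 288 depth values keep the L1 working set resident.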
// Hook allowing unit tests to pin the blocking sizes via macros.
template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#else
  EIGEN_UNUSED_VARIABLE(k);
  EIGEN_UNUSED_VARIABLE(m);
  EIGEN_UNUSED_VARIABLE(n);
#endif
  return false;
}

/** \brief Computes the blocking parameters for a m x k times k x n matrix
  * product, i.e. how the product is cut into panels fitting the caches. */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
}

template<typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
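// Usage sketch (illustrative): a GEMM driver would call this before
// allocating the packing buffers; the requested sizes start at the problem
// dimensions and are clamped down by the heuristic:
//
//   Index kc = depth, mc = rows, nc = cols;
//   internal::computeProductBlockingSizes<float,float,1>(kc, mc, nc, num_threads);
//   // then allocate blockA (mc*kc scalars) and blockB (kc*nc scalars)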
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else
  // gebp_madd_selector::run splits the multiply-add so the compiler keeps the
  // accumulator c in a register:
  //   t = b; t = cj.pmul(a,t); c = padd(c,t);
  // ...
  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }
  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
#endif

/* The generic gebp_traits: defines the packet types and the register-level
 * blocking (an mr x nr accumulator tile) used by the GEBP micro-kernel. */
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType ResScalar;
  // ...
  enum {
    // ...
    // Register-block height: three lhs packets when a fused madd leaves
    // enough registers, otherwise the architecture default.
    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
    // ...
    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1<RhsPacketType>(*b); }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const { dest = pload<LhsPacketType>(a); }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { dest = ploadu<LhsPacketType>(a); }

  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c = cj.pmadd(a,b,c);
#else
    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); }

  template<typename ResPacketHalf>
  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const { r = pmadd(c,alpha,r); }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
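// A quick way to inspect the chosen register-blocking geometry (a sketch; the
// exact values depend on the enabled SIMD instruction set):
//
//   typedef internal::gebp_traits<float,float> TraitsF;
//   static_assert(TraitsF::mr % TraitsF::LhsProgress == 0,
//                 "mr is a whole number of lhs packets");
//   // e.g. with AVX one gets TraitsF::mr == 24 and TraitsF::nr == 4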
// Specialization for complex lhs times real rhs: the accumulator is a complex
// packet, but the rhs can be broadcast as a plain real packet.
template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  // ...
  enum {
    // ...
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    mr = 3*LhsPacketSize,
#else
    // ...
#endif
    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1<RhsPacket>(*b); }

  // The rhs is real, so a plain broadcast also serves as the quad load.
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = pset1<RhsPacket>(*b); }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload<LhsPacket>(a); }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const { dest = ploadu<LhsPacket>(a); }

  // ...

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a.v,b,c.v);
#else
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
#endif
  }

  // ...

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};
// A "double packet" stores the real and imaginary accumulators separately.
template<typename Packet>
struct DoublePacket
{
  Packet first;
  Packet second;
};

template<typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Packet>& b)
{
  DoublePacket<Packet> res;
  res.first  = padd(a.first, b.first);
  res.second = padd(a.second, b.second);
  return res;
}

template<typename Packet>
const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet>& a)
{
  return a;
}
// Specialization for complex times complex: each accumulator is a DoublePacket
// holding the real and imaginary parts in two separate real packets.
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef DoublePacket<RealPacket> DoublePacketType;
  // ...
  enum {
    // ...
    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };

  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
  {
    p.first   = pset1<RealPacket>(RealScalar(0));
    p.second  = pset1<RealPacket>(RealScalar(0));
  }

  // Scalar path.
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = pset1<ResPacket>(*b); }

  // Vectorized path: broadcast the real and imaginary parts separately.
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
  {
    dest.first  = pset1<RealPacket>(real(*b));
    dest.second = pset1<RealPacket>(imag(*b));
  }

  // ...

  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
  {
    // Assemble the two half-accumulators into one complex packet; the four
    // conjugation cases differ only in where pconj/psub are applied.
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))     { tmp = pcplxflip(pconj(ResPacket(c.second))); tmp = padd(ResPacket(c.first),tmp); }
    else if((!ConjLhs)&&(ConjRhs)) { tmp = pconj(pcplxflip(ResPacket(c.second))); tmp = padd(ResPacket(c.first),tmp); }
    else if((ConjLhs)&&(!ConjRhs)) { tmp = pcplxflip(ResPacket(c.second)); tmp = padd(pconj(ResPacket(c.first)),tmp); }
    else if((ConjLhs)&&(ConjRhs))  { tmp = pcplxflip(ResPacket(c.second)); tmp = psub(pconj(ResPacket(c.first)),tmp); }
    r = pmadd(tmp,alpha,r);
  }

  // ...
};
// Specialization for real lhs times complex rhs: each lhs value is duplicated
// across the packet so it multiplies both real and imaginary parts.
template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  // ...
  enum {
    // ...
    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1<RhsPacket>(*b); }

  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    // ...
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); }

  // ...

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a,b.v,c.v);
#else
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
#endif
  }

  // ...

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};
/* The GEBP kernel itself: computes "res += alpha * lhs * rhs" one mr x nr
 * register tile at a time, from operand blocks packed by gemm_pack_lhs and
 * gemm_pack_rhs. */
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  // The "swapped" traits are used for the last rows, where the product is
  // viewed with lhs and rhs exchanged so the depth dimension vectorizes.
  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
  typedef typename SwappedTraits::ResScalar SResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  typedef typename DataMapper::LinearMapper LinearMapper;

  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
               Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  Traits traits;
  SwappedTraits straits;
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;

  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
  enum { pk = 8 }; // peeling factor along the depth dimension
  const Index peeled_kc  = depth & ~(pk-1);
  const Index prefetch_res_offset = 32/sizeof(ResScalar);
  //---------- Process 3 * LhsProgress rows at once ----------
  if(mr>=3*Traits::LhsProgress)
  {
    // If depth is small, several 3*LhsProgress-row panels can share the L1
    // cache: compute how many rows fit next to one rhs panel and one res tile.
    const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function
    const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));

    for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
    {
      const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          // A 3*LhsProgress x nr tile of res is accumulated in registers C0..C11.
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
          prefetch(&blA[0]);

          AccPacket C0, C1, C2,  C3,
                    C4, C5, C6,  C7,
                    C8, C9, C10, C11;
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2);  traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6);  traits.initAcc(C7);
          traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
          // ...

          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            RhsPacket B_0, T0;
            LhsPacket A2;

            // Fully unrolled micro-kernel step; the asm comments keep the
            // compiler from reordering across steps (works around bug 935).
#define EIGEN_GEBP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              internal::prefetch(blA+(3*K+16)*LhsProgress); \
              if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C0, T0); \
              traits.madd(A1, B_0, C4, T0); \
              traits.madd(A2, B_0, C8, B_0); \
              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C1, T0); \
              traits.madd(A1, B_0, C5, T0); \
              traits.madd(A2, B_0, C9, B_0); \
              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C2,  T0); \
              traits.madd(A1, B_0, C6,  T0); \
              traits.madd(A2, B_0, C10, B_0); \
              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C3 , T0); \
              traits.madd(A1, B_0, C7,  T0); \
              traits.madd(A2, B_0, C11, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
            } while(false)

            EIGEN_GEBP_ONESTEP(0); EIGEN_GEBP_ONESTEP(1); EIGEN_GEBP_ONESTEP(2); EIGEN_GEBP_ONESTEP(3);
            EIGEN_GEBP_ONESTEP(4); EIGEN_GEBP_ONESTEP(5); EIGEN_GEBP_ONESTEP(6); EIGEN_GEBP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;
          }
          // Leftover depth values (depth % pk).
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, T0;
            LhsPacket A2;
            EIGEN_GEBP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 3*Traits::LhsProgress;
          }
#undef EIGEN_GEBP_ONESTEP

          // Scale by alpha and write the 12 accumulators back to res.
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C2,  alphav, R0);
          traits.acc(C6,  alphav, R1);
          traits.acc(C10, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C3,  alphav, R0);
          traits.acc(C7,  alphav, R1);
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
      // Deal with the remaining columns, one at a time (3pX1 kernel).
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          // A 3*LhsProgress x 1 tile of res is accumulated in C0, C4, C8.
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];

          AccPacket C0, C4, C8;
          traits.initAcc(C0); traits.initAcc(C4); traits.initAcc(C8);
          LinearMapper r0 = res.getLinearMapper(i, j2);

          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsPacket A0, A1, A2;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B_0); \
              traits.madd(A1, B_0, C4, B_0); \
              traits.madd(A2, B_0, C8, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
            } while(false)

            EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4); EIGEN_GEBGP_ONESTEP(5); EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7);

            blB += pk*RhsProgress;
            blA += pk*3*Traits::LhsProgress;
          }
          // Leftover depth values.
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += RhsProgress;
            blA += 3*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
    }
  }
  //---------- Process 2 * LhsProgress rows at once ----------
  if(mr>=2*Traits::LhsProgress)
  {
    const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function
    Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));

    for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
    {
      Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          // A 2*LhsProgress x nr tile of res is accumulated in C0..C7.
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          prefetch(&blA[0]);

          AccPacket C0, C1, C2, C3,
                    C4, C5, C6, C7;
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);

          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            RhsPacket B_0, B1, B2, B3, T0;
#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");          \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                      \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                      \
              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);    \
              traits.madd(A0, B_0, C0, T0);                                       \
              traits.madd(A1, B_0, C4, B_0);                                      \
              traits.madd(A0, B1,  C1, T0);                                       \
              traits.madd(A1, B1,  C5, B1);                                       \
              traits.madd(A0, B2,  C2, T0);                                       \
              traits.madd(A1, B2,  C6, B2);                                       \
              traits.madd(A0, B3,  C3, T0);                                       \
              traits.madd(A1, B3,  C7, B3);                                       \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");            \
            } while(false)

            EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4); EIGEN_GEBGP_ONESTEP(5); EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*(2*Traits::LhsProgress);
          }
          // Leftover depth values.
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, B1, B2, B3, T0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1, R2, R3;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C1, alphav, R2);
          traits.acc(C5, alphav, R3);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(0 * Traits::ResPacketSize, R2);
          r1.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C2,  alphav, R0);
          traits.acc(C6,  alphav, R1);
          traits.acc(C3,  alphav, R2);
          traits.acc(C7,  alphav, R3);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
        }
      }
      // Deal with the remaining columns, one at a time (2pX1 kernel).
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          // A 2*LhsProgress x 1 tile of res is accumulated in C0, C4.
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];

          AccPacket C0, C4;
          traits.initAcc(C0); traits.initAcc(C4);
          LinearMapper r0 = res.getLinearMapper(i, j2);
          r0.prefetch(prefetch_res_offset);

          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            RhsPacket B_0, B1;
#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");          \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                      \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                      \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                       \
              traits.madd(A0, B_0, C0, B1);                                       \
              traits.madd(A1, B_0, C4, B_0);                                      \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
            } while(false)

            EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4); EIGEN_GEBGP_ONESTEP(5); EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7);

            blB += pk*RhsProgress;
            blA += pk*2*Traits::LhsProgress;
          }
          // Leftover depth values.
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, B1;
            EIGEN_GEBGP_ONESTEP(0);
            blB += RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
        }
      }
    }
  }
  //---------- Process 1 * LhsProgress rows at once ----------
  if(mr>=1*Traits::LhsProgress)
  {
    for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
    {
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        // A 1*LhsProgress x nr tile of res is accumulated in C0..C3.
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];

        AccPacket C0, C1, C2, C3;
        traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);

        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        LhsPacket A0;

        for(Index k=0; k<peeled_kc; k+=pk)
        {
          RhsPacket B_0, B1, B2, B3;
#define EIGEN_GEBGP_ONESTEP(K) \
          do { \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4");          \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                      \
            traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);    \
            traits.madd(A0, B_0, C0, B_0);                                      \
            traits.madd(A0, B1,  C1, B1);                                       \
            traits.madd(A0, B2,  C2, B2);                                       \
            traits.madd(A0, B3,  C3, B3);                                       \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4");            \
          } while(false)

          EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); EIGEN_GEBGP_ONESTEP(3);
          EIGEN_GEBGP_ONESTEP(4); EIGEN_GEBGP_ONESTEP(5); EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7);

          blB += pk*4*RhsProgress;
          blA += pk*1*LhsProgress;
        }
        // Leftover depth values.
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacket B_0, B1, B2, B3;
          EIGEN_GEBGP_ONESTEP(0);
          blB += 4*RhsProgress;
          blA += 1*LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP

        ResPacket R0, R1;
        ResPacket alphav = pset1<ResPacket>(alpha);
        R0 = r0.loadPacket(0 * Traits::ResPacketSize);
        R1 = r1.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        r0.storePacket(0 * Traits::ResPacketSize, R0);
        r1.storePacket(0 * Traits::ResPacketSize, R1);

        R0 = r2.loadPacket(0 * Traits::ResPacketSize);
        R1 = r3.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C2, alphav, R0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0 * Traits::ResPacketSize, R0);
        r3.storePacket(0 * Traits::ResPacketSize, R1);
      }
      // Deal with the remaining columns, one at a time (1pX1 kernel).
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        // A 1*LhsProgress x 1 tile of res is accumulated in C0.
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];

        AccPacket C0;
        traits.initAcc(C0);
        LinearMapper r0 = res.getLinearMapper(i, j2);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        LhsPacket A0;

        for(Index k=0; k<peeled_kc; k+=pk)
        {
          RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
          do { \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1");          \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                      \
            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                       \
            traits.madd(A0, B_0, C0, B_0);                                      \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1");            \
          } while(false)

          EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); EIGEN_GEBGP_ONESTEP(3);
          EIGEN_GEBGP_ONESTEP(4); EIGEN_GEBGP_ONESTEP(5); EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7);

          blB += pk*RhsProgress;
          blA += pk*1*Traits::LhsProgress;
        }
        // Leftover depth values.
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacket B_0;
          EIGEN_GEBGP_ONESTEP(0);
          blB += RhsProgress;
          blA += 1*Traits::LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP

        ResPacket alphav = pset1<ResPacket>(alpha);
        ResPacket R0 = r0.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0 * Traits::ResPacketSize, R0);
      }
    }
  }
  //---------- Process the remaining rows of the lhs ----------
  if(peeled_mc1<rows)
  {
    for(Index j2=0; j2<packet_cols4; j2+=nr)
    {
      // One row at a time against the 4 packed rhs columns.
      for(Index i=peeled_mc1; i<rows; i+=1)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        // ...
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

        // Compute one row of res at once by viewing the product with lhs and
        // rhs swapped, which turns the depth dimension into the vectorized one.
        if ((SwappedTraits::LhsProgress % 4) == 0 &&
            (SwappedTraits::LhsProgress <= 8) &&
            (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==4))
        {
          SAccPacket C0, C1, C2, C3;
          straits.initAcc(C0);
          straits.initAcc(C1);
          straits.initAcc(C2);
          straits.initAcc(C3);

          const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
          const Index endk  = (depth/spk)*spk;
          const Index endk4 = (depth/(spk*4))*(spk*4);

          Index k=0;
          for(; k<endk4; k+=4*spk)
          {
            SLhsPacket A0,A1;
            SRhsPacket B_0,B_1;

            straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
            straits.loadRhsQuad(blA+0*spk, B_0);
            straits.loadRhsQuad(blA+1*spk, B_1);
            straits.madd(A0,B_0,C0,B_0);
            straits.madd(A1,B_1,C1,B_1);

            straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
            straits.loadRhsQuad(blA+2*spk, B_0);
            straits.loadRhsQuad(blA+3*spk, B_1);
            straits.madd(A0,B_0,C2,B_0);
            straits.madd(A1,B_1,C3,B_1);

            blB += 4*SwappedTraits::LhsProgress;
            blA += 4*spk;
          }
          C0 = padd(padd(C0,C1),padd(C2,C3));
          for(; k<endk; k+=spk)
          {
            SLhsPacket A0;
            SRhsPacket B_0;

            straits.loadLhsUnaligned(blB, A0);
            straits.loadRhsQuad(blA, B_0);
            straits.madd(A0,B_0,C0,B_0);

            blB += SwappedTraits::LhsProgress;
            blA += spk;
          }
          // If the swapped lhs packet is twice the result packet (e.g. an AVX
          // lhs with an SSE-width complex res), reduce to a half packet first.
          if(SwappedTraits::LhsProgress==8)
          {
            SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
            SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);

            if(depth-endk>0)
            {
              // The remaining depth values correspond to a half-packet.
              SLhsPacketHalf a0;
              SRhsPacketHalf b0;
              straits.loadLhsUnaligned(blB, a0);
              straits.loadRhs(blA, b0);
              SAccPacketHalf c0 = predux_downto4(C0);
              straits.madd(a0,b0,c0,b0);
              straits.acc(c0, alphav, R);
            }
            else
            {
              straits.acc(predux_downto4(C0), alphav, R);
            }
            res.scatterPacket(i, j2, R);
          }
          else
          {
            SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
            SResPacket alphav = pset1<SResPacket>(alpha);
            straits.acc(C0, alphav, R);
            res.scatterPacket(i, j2, R);
          }
        }
        else // scalar fallback path
        {
          // Accumulate a 1 x 4 block of res in scalar registers.
          ResScalar C0(0), C1(0), C2(0), C3(0);
          for(Index k=0; k<depth; k++)
          {
            LhsScalar A0 = blA[k];
            RhsScalar B_0, B_1;
            B_0 = blB[0];
            B_1 = blB[1];
            CJMADD(cj,A0,B_0,C0,  B_0);
            CJMADD(cj,A0,B_1,C1,  B_1);
            B_0 = blB[2];
            B_1 = blB[3];
            CJMADD(cj,A0,B_0,C2,  B_0);
            CJMADD(cj,A0,B_1,C3,  B_1);
            blB += 4;
          }
          res(i, j2 + 0) += alpha * C0;
          res(i, j2 + 1) += alpha * C1;
          res(i, j2 + 2) += alpha * C2;
          res(i, j2 + 3) += alpha * C3;
        }
      }
    }
    // Remaining columns of the rhs: plain scalar dot products.
    for(Index j2=packet_cols4; j2<cols; j2++)
    {
      for(Index i=peeled_mc1; i<rows; i+=1)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        ResScalar C0(0);
        for(Index k=0; k<depth; k++)
        {
          LhsScalar A0 = blA[k];
          RhsScalar B_0 = blB[k];
          CJMADD(cj, A0, B_0, C0, B_0);
        }
        res(i, j2) += alpha * C0;
      }
    }
  }
}
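// For orientation, the following standalone sketch (not part of Eigen; all
// names are illustrative) shows the loop structure the packed operands and
// the micro-kernels above implement, in plain scalar form: blockA holds
// mr-wide row panels of the lhs, blockB holds nr-wide column panels of the
// rhs, and an mr x nr accumulator array plays the role of the C0..C11 registers.

template<int mr, int nr>
void gebp_reference(float* res, long resStride,
                    const float* blockA, const float* blockB,
                    long rows, long depth, long cols, float alpha)
{
  // rows and cols are assumed to be multiples of mr and nr here; the real
  // kernel handles remainders with the dedicated 2p/1p/scalar paths above.
  for (long j = 0; j < cols; j += nr)
    for (long i = 0; i < rows; i += mr)
    {
      float acc[mr][nr] = {};             // the register tile
      const float* a = blockA + i*depth;  // packed mr-row panel of the lhs
      const float* b = blockB + j*depth;  // packed nr-column panel of the rhs
      for (long k = 0; k < depth; ++k)    // both panels are walked linearly
        for (int jj = 0; jj < nr; ++jj)
          for (int ii = 0; ii < mr; ++ii)
            acc[ii][jj] += a[k*mr + ii] * b[k*nr + jj];
      for (int jj = 0; jj < nr; ++jj)     // scale by alpha and accumulate
        for (int ii = 0; ii < mr; ++ii)
          res[(j+jj)*resStride + (i+ii)] += alpha * acc[ii][jj];
    }
}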
// gemm_pack_lhs: copy a (rows x depth) block of the lhs into a contiguous
// buffer of Pack1-wide row panels, optionally conjugated, so the kernel can
// traverse it linearly.
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;

  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
                         : Pack2>1             ? (rows/Pack2)*Pack2 : 0;

  Index i=0;
  Index count=0;
  // Pack rows in bundles of 3*PacketSize, then 2, then 1, then Pack2, then one by one.
  if(Pack1>=3*PacketSize)
  {
    for(; i<peeled_mc3; i+=3*PacketSize)
    {
      if(PanelMode) count += (3*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A, B, C;
        A = lhs.loadPacket(i+0*PacketSize, k);
        B = lhs.loadPacket(i+1*PacketSize, k);
        C = lhs.loadPacket(i+2*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
      }
      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
    }
  }
  if(Pack1>=2*PacketSize)
  {
    for(; i<peeled_mc2; i+=2*PacketSize)
    {
      if(PanelMode) count += (2*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A, B;
        A = lhs.loadPacket(i+0*PacketSize, k);
        B = lhs.loadPacket(i+1*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
      }
      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
    }
  }
  if(Pack1>=1*PacketSize)
  {
    for(; i<peeled_mc1; i+=1*PacketSize)
    {
      if(PanelMode) count += (1*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A;
        A = lhs.loadPacket(i+0*PacketSize, k);
        pstore(blockA+count, cj.pconj(A));
        count+=PacketSize;
      }
      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
    }
  }
  if(Pack2<PacketSize && Pack2>1)
  {
    for(; i<peeled_mc0; i+=Pack2)
    {
      if(PanelMode) count += Pack2 * offset;

      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(i+w, k));

      if(PanelMode) count += Pack2 * (stride-offset-depth);
    }
  }
  // Remaining rows, one at a time.
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
// RowMajor lhs: same panel layout, produced via in-register transposition.
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index count = 0;

  // Start with the widest pack and shrink it as fewer rows remain.
  int pack = Pack1;
  Index i = 0;
  while(pack>0)
  {
    Index remaining_rows = rows-i;
    Index peeled_mc = i+(remaining_rows/pack)*pack;
    for(; i<peeled_mc; i+=pack)
    {
      if(PanelMode) count += pack * offset;

      const Index peeled_k = (depth/PacketSize)*PacketSize;
      Index k=0;
      if(pack>=PacketSize)
      {
        for(; k<peeled_k; k+=PacketSize)
        {
          for (Index m = 0; m < pack; m += PacketSize)
          {
            // Transpose a PacketSize x PacketSize tile in registers.
            PacketBlock<Packet> kernel;
            for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
            ptranspose(kernel);
            for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
          }
          count += PacketSize*pack;
        }
      }
      // Remaining depth values, four rows at a time then one by one.
      for(; k<depth; k++)
      {
        Index w=0;
        for(; w<pack-3; w+=4)
        {
          Scalar a(cj(lhs(i+w+0, k))),
                 b(cj(lhs(i+w+1, k))),
                 c(cj(lhs(i+w+2, k))),
                 d(cj(lhs(i+w+3, k)));
          blockA[count++] = a;
          blockA[count++] = b;
          blockA[count++] = c;
          blockA[count++] = d;
        }
        if(pack%4)
          for(; w<pack; ++w)
            blockA[count++] = cj(lhs(i+w, k));
      }
      if(PanelMode) count += pack * (stride-offset-depth);
    }

    pack -= PacketSize;
    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
      pack = Pack2;
  }

  // Remaining rows, one at a time.
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
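// Scalar sketch of the layout the lhs packing produces (illustrative, not
// Eigen code): a column-major lhs with leading dimension lhsStride, a single
// panel width mr, no conjugation and no panel mode. Remainder rows are
// zero-padded here for brevity; the real routine switches to narrower packs.

inline void pack_lhs_reference(float* blockA, const float* lhs, long lhsStride,
                               long depth, long rows, int mr)
{
  long count = 0;
  for (long i = 0; i < rows; i += mr)       // one mr-wide row panel...
    for (long k = 0; k < depth; ++k)        // ...traversed depth-first
      for (int w = 0; w < mr; ++w)          // mr consecutive rows
        blockA[count++] = (i + w < rows) ? lhs[(i + w) + k*lhsStride] : 0.0f;
}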
// gemm_pack_rhs: copy a (depth x cols) block of the rhs into a contiguous
// buffer of nr-wide column panels.
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;
  const Index peeled_k = (depth/PacketSize)*PacketSize;
  // ... (the nr==8 case is analogous to the nr==4 case below and elided)
  if(nr>=4)
  {
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

      Index k=0;
      if((PacketSize%4)==0) // TODO: enable vectorized transposition for PacketSize==2
      {
        for(; k<peeled_k; k+=PacketSize) {
          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
          kernel.packet[0           ] = dm0.loadPacket(k);
          kernel.packet[1%PacketSize] = dm1.loadPacket(k);
          kernel.packet[2%PacketSize] = dm2.loadPacket(k);
          kernel.packet[3%PacketSize] = dm3.loadPacket(k);
          ptranspose(kernel);
          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
          count+=4*PacketSize;
        }
      }
      for(; k<depth; k++)
      {
        blockB[count+0] = cj(dm0(k));
        blockB[count+1] = cj(dm1(k));
        blockB[count+2] = cj(dm2(k));
        blockB[count+3] = cj(dm3(k));
        count += 4;
      }
      // skip what we have after
      if(PanelMode) count += 4 * (stride-offset-depth);
    }
  }

  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(dm0(k));
      count += 1;
    }
    if(PanelMode) count += (stride-offset-depth);
  }
}
// RowMajor rhs: each packed group is contiguous in memory, so pack with
// direct packet loads instead of a transposition.
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;
  // ... (the nr==8 case is analogous to the nr==4 case below and elided)

  if(nr>=4)
  {
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
      for(Index k=0; k<depth; k++)
      {
        if (PacketSize==4) {
          Packet A = rhs.loadPacket(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += PacketSize;
        } else {
          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
          blockB[count+0] = cj(dm0(0));
          blockB[count+1] = cj(dm0(1));
          blockB[count+2] = cj(dm0(2));
          blockB[count+3] = cj(dm0(3));
          count += 4;
        }
      }
      // skip what we have after
      if(PanelMode) count += 4 * (stride-offset-depth);
    }
  }
  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(rhs(k, j2));
      count += 1;
    }
    if(PanelMode) count += stride-offset-depth;
  }
}
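// Matching scalar sketch of the rhs packing (illustrative, not Eigen code):
// column-major rhs with leading dimension rhsStride, panel width nr, no
// conjugation and no panel mode, with the same zero-padding caveat as above.

inline void pack_rhs_reference(float* blockB, const float* rhs, long rhsStride,
                               long depth, long cols, int nr)
{
  long count = 0;
  for (long j = 0; j < cols; j += nr)       // one nr-wide column panel...
    for (long k = 0; k < depth; ++k)        // ...row by row
      for (int w = 0; w < nr; ++w)          // nr consecutive columns
        blockB[count++] = (j + w < cols) ? rhs[k + (j + w)*rhsStride] : 0.0f;
}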
// Return the currently set L1/L2/L3 CPU cache sizes (in bytes), as used by
// the blocking heuristics above.
inline std::ptrdiff_t l1CacheSize() { std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); return l1; }
inline std::ptrdiff_t l2CacheSize() { std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); return l2; }
inline std::ptrdiff_t l3CacheSize() { std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); return l3; }

#endif // EIGEN_GENERAL_BLOCK_PANEL_H