#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;
#if EIGEN_ARCH_i386_OR_x86_64
const std::ptrdiff_t defaultL1CacheSize = 16*1024;
const std::ptrdiff_t defaultL2CacheSize = 512*1024;
const std::ptrdiff_t defaultL3CacheSize = 512*1024;
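// These constants are conservative fallbacks used when the actual CPU cache
// sizes cannot be queried; they can be overridden from user code through the
// public API. Illustrative sketch (not part of this file), assuming a machine
// with 32KB L1, 1MB L2 and 8MB L3:
//
//   #include <Eigen/Core>
//   int main() {
//     Eigen::setCpuCacheSizes(32*1024, 1024*1024, 8*1024*1024);
//     Eigen::MatrixXd A = Eigen::MatrixXd::Random(512,512), B = A;
//     Eigen::MatrixXd C = A * B;   // blocking now derived from the sizes above
//     return C.size() > 0 ? 0 : 1;
//   }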
  if(action==SetAction)
  {
    m_cacheSizes.m_l1 = *l1;
    m_cacheSizes.m_l2 = *l2;
    m_cacheSizes.m_l3 = *l3;
  }
  else if(action==GetAction)
  {
    *l1 = m_cacheSizes.m_l1;
    *l2 = m_cacheSizes.m_l2;
    *l3 = m_cacheSizes.m_l3;
  }
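// The cached values above back the public accessors. Illustrative usage sketch
// (not part of this file):
//
//   std::ptrdiff_t l1 = Eigen::l1CacheSize();
//   std::ptrdiff_t l2 = Eigen::l2CacheSize();
//   std::ptrdiff_t l3 = Eigen::l3CacheSize();
//   Eigen::setCpuCacheSizes(l1, l2, 2*l3);   // e.g. pretend L3 is twice as large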
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;

  std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);

  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
      kr = 8,
      mr = Traits::mr,
      nr = Traits::nr
    };
    // First bound k so that one mr x k lhs panel and one k x nr rhs panel fit in L1.
    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
    if (k_cache < k)
      k = k_cache - (k_cache % kr);

    // Then bound n by what fits in L2 next to the L1 working set.
    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread)
      n = n_cache - (n_cache % nr);
    else
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));

    // Finally bound m: L3 is shared between cores, so each thread gets its own chunk.
    const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
    const Index m_per_thread = numext::div_ceil(m, num_threads);
    if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr))
      m = m_cache - (m_cache % mr);
    else
      m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
  }
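  // Worked example (illustrative, assuming double precision with mr = 12, nr = 4,
  // KcFactor = 1 and a 32KB L1): kdiv = 12*8 + 4*8 = 128 and ksub = 12*4*8 = 384,
  // so k_cache = min((32768-384)/128, 320) = 253; if the requested depth is larger,
  // k is rounded down to a multiple of kr = 8, i.e. k = 248.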
  else {
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    // (unit-test override of l1/l2/l3 elided)
#endif
    // Early return for small problems: the computations below are not worth it,
    // and tiny products do not go through this path anyway.
    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
      return;

    typedef typename Traits::ResScalar ResScalar;
    enum {
      k_peeling = 8,
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
    };

    // ---- 1st level of blocking on L1, yields kc ----
    // max_kc is chosen so that one mr x kc lhs panel and one kc x nr rhs panel
    // fit in L1, rounded down to a multiple of the peeling factor.
    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    const Index old_k = k;
    if(k>max_kc)
    {
      // We are really blocking on the depth: reduce kc so that the last slice is
      // as large as possible while keeping the same number of sweeps over the result.
      k = (k%max_kc)==0 ? max_kc
                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    }
    // ---- 2nd level of blocking, on max(L2,L3), yields nc ----
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864; // ~1.5MB, a conservative per-core share of a typical cache
#endif

    // nc is chosen so that a kc x nc block of the rhs fits within half of L2;
    // but if the whole lhs block fits in L1, keep the packed rhs in the remaining L1 space instead.
    Index max_nc;
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1- k_sub - lhs_bytes;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
      max_nc = remaining_l1 / (k*sizeof(RhsScalar));          // L1 blocking
    else
      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));  // L2 blocking
    // Below we assume that Traits::nr is a power of two.
    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    if(n>nc)
      n = (n%nc)==0 ? nc
                    : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
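    // Worked example of the "same number of sweeps" reduction used by the kc
    // formula above (illustrative): with a cap of 240, a peeling step of 8 and a
    // requested size of 500, it yields 240 - 8*((239-20)/(8*3)) = 168, so the
    // dimension is still swept in 3 slices (168, 168, 164) instead of (240, 240, 20),
    // which keeps the last slice from degenerating.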
    // ---- 3rd level of blocking, on L3 (or L2 when L3 is absent), yields mc ----
    Index problem_size = k*n*sizeof(LhsScalar);
    Index actual_lm = actual_l2;
    Index max_mc = m;
    if(problem_size<=1024)
      actual_lm = l1;   // small enough to be kept in L1
    else if(l3!=0 && problem_size<=32768)
    {
      actual_lm = l2;   // small enough to be kept in L2
      max_mc = (numext::mini<Index>)(576,max_mc);
    }
    Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
    if (mc > Traits::mr) mc -= mc % Traits::mr;
    else if (mc==0) return;
    m = (m%mc)==0 ? mc
                  : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
  }
}
template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#endif
  return false;
}
/** \brief Computes the blocking parameters for a m x k times k x n matrix product.
  * Unless overridden by EIGEN_TEST_SPECIFIC_BLOCKING_SIZES, the sizes are picked
  * by the cache-aware heuristic above. */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
}

template<typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
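// Illustrative caller sketch (not part of this file); the scalar types and the
// thread count are assumptions:
//
//   Index kc = depth, mc = rows, nc = cols;
//   internal::computeProductBlockingSizes<double,double>(kc, mc, nc, Index(4));
//   // kc/mc/nc are now the panel sizes used when packing blockA and blockB.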
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else
  // Fallback: go through an explicit temporary so that the rhs operand is not
  // clobbered by the multiply, i.e.  t = b; t = cj.pmul(a,t); c = padd(c,t);
  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t);

  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
#endif

/* The gebp traits describe the packet types, the register blocking (mr x nr)
 * and the elementary operations used by the gebp micro kernel for a given
 * pair of scalar types. Default (real x real) case: */
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  enum {
    // (register-count based default_mr/default_nr definitions elided)
    mr = Vectorizable ? 3*LhsPacketSize : default_mr,

    LhsProgress = LhsPacketSize,
  };

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1<RhsPacketType>(*b); }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const { dest = pload<LhsPacketType>(a); }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { dest = ploadu<LhsPacketType>(a); }

  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    c = cj.pmadd(a,b,c);
#else
    // Without a fused madd, go through tmp so that b is not clobbered.
    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

  template<typename ResPacketHalf>
  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
  {
    r = pmadd(c,alpha,r);
  }
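  // Illustrative sketch (not part of this file): one rank-1 update written in
  // terms of the traits interface above (variable names are assumptions):
  //
  //   traits.initAcc(C0);             // C0 = 0
  //   traits.loadLhs(blA, A0);        // A0 <- one packet of packed lhs
  //   traits.loadRhs(blB, B_0);       // B_0 <- one rhs coefficient, broadcast
  //   traits.madd(A0, B_0, C0, T0);   // C0 += A0 * B_0
  //   traits.acc(C0, alphav, R0);     // R0 += alpha * C0, before R0 is stored back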
// Specialized traits for lhs = complex, rhs = real.
template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  enum {
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    mr = 3*LhsPacketSize,
#else
    // (fallback register blocking elided)
#endif
    LhsProgress = LhsPacketSize,
  };

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1<RhsPacket>(*b); }
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = pset1<RhsPacket>(*b); }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload<LhsPacket>(a); }
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const { dest = ploadu<LhsPacket>(a); }

  // The complex lhs packet is viewed as a packet of reals (a.v), so the real
  // rhs packet multiplies its interleaved real/imag lanes directly.
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    c.v = pmadd(a.v,b,c.v);
#else
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }
// A pair of real packets used to accumulate the real and the imaginary parts
// of a complex product separately.
template<typename Packet> struct DoublePacket { Packet first; Packet second; };

template<typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Packet>& b);

template<typename Packet>
const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet>& a);

// Specialized traits for lhs and rhs both complex.
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  enum {
    LhsProgress = ResPacketSize,
  };

  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  // Scalar-packet and DoublePacket flavours of loadRhs; the latter broadcasts
  // the real and imaginary parts of *b into the two halves.
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = pset1<ResPacket>(*b); }
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
  {
    dest.first  = pset1<RealPacket>(real(*b));
    dest.second = pset1<RealPacket>(imag(*b));
  }

  // acc: recombine the separately accumulated real/imaginary parts into a
  // complex packet, handling the four conjugation combinations.
  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
  {
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    { /* combine c.first and c.second (elided) */ }
    else if((!ConjLhs)&&(ConjRhs))
    { /* (elided) */ }
    else if((ConjLhs)&&(!ConjRhs))
    { /* (elided) */ }
    else if((ConjLhs)&&(ConjRhs))
    { /* (elided) */ }

    r = pmadd(tmp,alpha,r);
  }
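  // Why DoublePacket (illustrative): for one complex pair a = ar + i*ai and
  // b = br + i*bi, the madd steps accumulate the two plain real products
  // a*real(b) and a*imag(b) into c.first and c.second; only acc() pays the
  // shuffle cost of recombining them into (ar*br - ai*bi) + i*(ar*bi + ai*br),
  // which keeps cross-lane operations out of the hot loop.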
// Specialized traits for lhs = real, rhs = complex.
template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  enum {
    LhsProgress = ResPacketSize,
  };

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1<RhsPacket>(*b); }

  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    // (broadcast of four consecutive rhs coefficients elided)
  }

  // The real lhs coefficients are duplicated so that they line up with the
  // interleaved real/imag lanes of the complex rhs packets.
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); }
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    c.v = pmadd(a,b.v,c.v);
#else
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }
/* optimized general packed block * packed panel product kernel */
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;

  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };
  EIGEN_DONT_INLINE
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
               Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  // Boundaries of the row ranges handled by the 3-packet, 2-packet and 1-packet kernels.
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
  const Index peeled_kc  = depth & ~(pk-1);
  const Index prefetch_res_offset = 32/sizeof(ResScalar);
  //---------- Process 3 * LhsProgress rows at once ----------
  if(mr>=3*Traits::LhsProgress)
  {
    // Block the lhs panel so that one lhs panel, one nr-column rhs panel and the
    // corresponding mr x nr result block stay in L1.
    const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
    for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
    {
      const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
          // (accumulators C0..C11, packet temporaries, result mappers r0..r3 and prefetches elided)
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          for(Index k=0; k<peeled_kc; k+=pk)
          {
#define EIGEN_GEBP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              internal::prefetch(blA+(3*K+16)*LhsProgress); \
              if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C0, T0); \
              traits.madd(A1, B_0, C4, T0); \
              traits.madd(A2, B_0, C8, B_0); \
              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C1, T0); \
              traits.madd(A1, B_0, C5, T0); \
              traits.madd(A2, B_0, C9, B_0); \
              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C2, T0); \
              traits.madd(A1, B_0, C6, T0); \
              traits.madd(A2, B_0, C10, B_0); \
              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C3, T0); \
              traits.madd(A1, B_0, C7, T0); \
              traits.madd(A2, B_0, C11, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
            } while(false)

            // (EIGEN_GEBP_ONESTEP(0) .. EIGEN_GEBP_ONESTEP(7) calls elided)
            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;
          }
          // Process the remaining depth (not a multiple of pk)
          for(Index k=peeled_kc; k<depth; k++)
          {
            // (one EIGEN_GEBP_ONESTEP(0) step with fresh loads elided)
            blB += 4*RhsProgress;
            blA += 3*Traits::LhsProgress;
          }
#undef EIGEN_GEBP_ONESTEP

          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C10, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C3, alphav, R0);
          traits.acc(C7, alphav, R1);
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
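      // Register-blocking sketch (illustrative): the 3pX4 kernel keeps a
      // 3-packet x 4-column tile of the result in 12 accumulators,
      //
      //   C0 C1 C2  C3     <- rows [i, i+LhsProgress)
      //   C4 C5 C6  C7     <- rows [i+LhsProgress, i+2*LhsProgress)
      //   C8 C9 C10 C11    <- rows [i+2*LhsProgress, i+3*LhsProgress)
      //
      // so each loaded lhs packet (A0, A1, A2) is reused four times and each
      // broadcast rhs coefficient (B_0) three times before the tile is written back.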
      // Deal with the remaining columns (cols % 4) for the 3-packet kernel.
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
          // (accumulators C0,C4,C8 and the result mapper r0 elided)
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          for(Index k=0; k<peeled_kc; k+=pk)
          {
#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B_0); \
              traits.madd(A1, B_0, C4, B_0); \
              traits.madd(A2, B_0, C8, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
            } while(false)

            // (EIGEN_GEBGP_ONESTEP(0..7) calls elided)
            blB += pk*RhsProgress;
            blA += pk*3*Traits::LhsProgress;
          }

          // remaining depth
          for(Index k=peeled_kc; k<depth; k++)
          {
            // (one EIGEN_GEBGP_ONESTEP(0) step elided)
            blA += 3*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
    }
  }
  //---------- Process 2 * LhsProgress rows at once ----------
  if(mr>=2*Traits::LhsProgress)
  {
    Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));

    for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
    {
      Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          // (accumulators C0..C7 and result mappers r0..r3 elided)
          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);

          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          for(Index k=0; k<peeled_kc; k+=pk)
          {
#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
              traits.madd(A0, B_0, C0, T0); \
              traits.madd(A1, B_0, C4, B_0); \
              traits.madd(A0, B1, C1, T0); \
              traits.madd(A1, B1, C5, B1); \
              traits.madd(A0, B2, C2, T0); \
              traits.madd(A1, B2, C6, B2); \
              traits.madd(A0, B3, C3, T0); \
              traits.madd(A1, B3, C7, B3); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
            } while(false)

            // (EIGEN_GEBGP_ONESTEP(0..7) calls elided)
            blB += pk*4*RhsProgress;
            blA += pk*(2*Traits::LhsProgress);
          }
          // remaining depth
          for(Index k=peeled_kc; k<depth; k++)
          {
            // (one EIGEN_GEBGP_ONESTEP(0) step elided)
            blB += 4*RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1, R2, R3;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C1, alphav, R2);
          traits.acc(C5, alphav, R3);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(0 * Traits::ResPacketSize, R2);
          r1.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C7, alphav, R3);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
        }
      }
      // Deal with the remaining columns (cols % 4) for the 2-packet kernel.
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          // (accumulators C0,C4 and the result mapper r0 elided)
          r0.prefetch(prefetch_res_offset);
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          for(Index k=0; k<peeled_kc; k+=pk)
          {
#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B1); \
              traits.madd(A1, B_0, C4, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
            } while(false)

            // (EIGEN_GEBGP_ONESTEP(0..7) calls elided)
            blB += pk*RhsProgress;
            blA += pk*2*Traits::LhsProgress;
          }
          // remaining depth
          for(Index k=peeled_kc; k<depth; k++)
          {
            // (one EIGEN_GEBGP_ONESTEP(0) step elided)
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
        }
      }
    }
  }
  //---------- Process 1 * LhsProgress rows at once ----------
  if(mr>=1*Traits::LhsProgress)
  {
    // loop on each vertical lhs micro panel (1*LhsProgress x depth)
    for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
    {
      // loop on each horizontal rhs micro panel (depth x nr)
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
        // (accumulators C0..C3 and result mappers r0..r3 elided)
        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        for(Index k=0; k<peeled_kc; k+=pk)
        {
#define EIGEN_GEBGP_ONESTEP(K) \
          do { \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
            traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
            traits.madd(A0, B_0, C0, B_0); \
            traits.madd(A0, B1, C1, B1); \
            traits.madd(A0, B2, C2, B2); \
            traits.madd(A0, B3, C3, B3); \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
          } while(false)

          // (EIGEN_GEBGP_ONESTEP(0..7) calls elided)
          blB += pk*4*RhsProgress;
          blA += pk*1*LhsProgress;
        }
        // remaining depth
        for(Index k=peeled_kc; k<depth; k++)
        {
          // (one EIGEN_GEBGP_ONESTEP(0) step elided)
          blB += 4*RhsProgress;
          blA += 1*LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP

        ResPacket R0, R1;
        ResPacket alphav = pset1<ResPacket>(alpha);
        R0 = r0.loadPacket(0 * Traits::ResPacketSize);
        R1 = r1.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        r0.storePacket(0 * Traits::ResPacketSize, R0);
        r1.storePacket(0 * Traits::ResPacketSize, R1);

        R0 = r2.loadPacket(0 * Traits::ResPacketSize);
        R1 = r3.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C2, alphav, R0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0 * Traits::ResPacketSize, R0);
        r3.storePacket(0 * Traits::ResPacketSize, R1);
      }
      // Deal with the remaining columns (cols % 4) for the 1-packet kernel.
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
        // (accumulator C0 and the result mapper r0 elided)
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<peeled_kc; k+=pk)
        {
#define EIGEN_GEBGP_ONESTEP(K) \
          do { \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
            traits.madd(A0, B_0, C0, B_0); \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
          } while(false)

          // (EIGEN_GEBGP_ONESTEP(0..7) calls elided)
          blB += pk*RhsProgress;
          blA += pk*1*Traits::LhsProgress;
        }

        // remaining depth
        for(Index k=peeled_kc; k<depth; k++)
        {
          // (one EIGEN_GEBGP_ONESTEP(0) step elided)
          blA += 1*Traits::LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP

        ResPacket alphav = pset1<ResPacket>(alpha);
        ResPacket R0 = r0.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0 * Traits::ResPacketSize, R0);
      }
    }
  }
  //---------- Process the remaining rows, one at a time ----------
  for(Index j2=0; j2<packet_cols4; j2+=nr)
  {
    for(Index i=peeled_mc1; i<rows; i+=1)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA];
      const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

      // For a single lhs row it is more efficient to swap the roles of lhs and
      // rhs and use the vectorized path of the swapped traits when possible.
      if ((SwappedTraits::LhsProgress % 4) == 0 &&
          (SwappedTraits::LhsProgress <= 8) /* && (a third condition on the half-packet size, elided) */)
      {
        // (swapped-traits accumulators C0..C3 and their initialization elided)
        const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
        Index k = 0;
        const Index endk  = (depth/spk)*spk;
        const Index endk4 = (depth/(spk*4))*(spk*4);

        // 4-way unrolled sweep over the depth
        for(; k<endk4; k+=4*spk)
        {
          // (loads of the swapped lhs/rhs packets A0, A1, B_0, B_1 elided)
          straits.madd(A0,B_0,C0,B_0);
          straits.madd(A1,B_1,C1,B_1);
          // (second pair of loads elided)
          straits.madd(A0,B_0,C2,B_0);
          straits.madd(A1,B_1,C3,B_1);
          blB += 4*SwappedTraits::LhsProgress;
        }
        // non-unrolled remainder
        for(; k<endk; k+=spk)
        {
          // (loads elided)
          straits.madd(A0,B_0,C0,B_0);
          blB += SwappedTraits::LhsProgress;
        }
        if(SwappedTraits::LhsProgress==8)
        {
          // Reduce the accumulators to half packets before the final update.
          // (construction of the half-sized packets a0, b0, c0 elided)
          SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
          SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
          straits.madd(a0,b0,c0,b0);
          straits.acc(c0, alphav, R);
          res.scatterPacket(i, j2, R);
        }
        else
        {
          SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
          SResPacket alphav = pset1<SResPacket>(alpha);
          straits.acc(C0, alphav, R);
          res.scatterPacket(i, j2, R);
        }
      }
      else // scalar path
      {
        ResScalar C0(0), C1(0), C2(0), C3(0);
        for(Index k=0; k<depth; k++)
        {
          // (loads of A0 and of the rhs coefficients B_0, B_1 elided)
          CJMADD(cj,A0,B_0,C0, B_0);
          CJMADD(cj,A0,B_1,C1, B_1);
          // (reload of B_0, B_1 with the next two columns elided)
          CJMADD(cj,A0,B_0,C2, B_0);
          CJMADD(cj,A0,B_1,C3, B_1);
        }
        res(i, j2 + 0) += alpha * C0;
        res(i, j2 + 1) += alpha * C1;
        res(i, j2 + 2) += alpha * C2;
        res(i, j2 + 3) += alpha * C3;
      }
    }
  }
  // remaining columns and remaining rows: plain scalar dot product
  for(Index j2=packet_cols4; j2<cols; j2++)
  {
    for(Index i=peeled_mc1; i<rows; i+=1)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA];
      ResScalar C0(0);
      const RhsScalar* blB = &blockB[j2*strideB+offsetB];
      for(Index k=0; k<depth; k++)
      {
        LhsScalar A0 = blA[k];
        RhsScalar B_0 = blB[k];
        CJMADD(cj, A0, B_0, C0, B_0);
      }
      res(i, j2) += alpha * C0;
    }
  }
}
// Packing of the lhs: copy an mc x kc block of the lhs into a sequential,
// kernel-friendly buffer (blockA). Column-major source version.
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
                         : Pack2>1              ? (rows/Pack2)*Pack2 : 0;

  Index i = 0;
  Index count = 0;
  // Pack 3 packets of rows at once
  if(Pack1>=3*PacketSize)
  {
    for(; i<peeled_mc3; i+=3*PacketSize)
    {
      if(PanelMode) count += (3*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A, B, C;
        A = lhs.loadPacket(i+0*PacketSize, k);
        B = lhs.loadPacket(i+1*PacketSize, k);
        C = lhs.loadPacket(i+2*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
      }
      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack 2 packets of rows at once
  if(Pack1>=2*PacketSize)
  {
    for(; i<peeled_mc2; i+=2*PacketSize)
    {
      if(PanelMode) count += (2*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A, B;
        A = lhs.loadPacket(i+0*PacketSize, k);
        B = lhs.loadPacket(i+1*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
      }
      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack 1 packet of rows at once
  if(Pack1>=1*PacketSize)
  {
    for(; i<peeled_mc1; i+=1*PacketSize)
    {
      if(PanelMode) count += (1*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A;
        A = lhs.loadPacket(i+0*PacketSize, k);
        pstore(blockA+count, cj.pconj(A));
        count+=PacketSize;
      }
      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack scalars, Pack2 rows at once
  if(Pack2<PacketSize && Pack2>1)
  {
    for(; i<peeled_mc0; i+=Pack2)
    {
      if(PanelMode) count += Pack2 * offset;

      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(i+w, k));

      if(PanelMode) count += Pack2 * (stride-offset-depth);
    }
  }
  // Remaining rows, one at a time
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
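// Packed lhs layout (illustrative): for Pack1 = 2*PacketSize with PacketSize = 4,
// a group of 8 rows is stored as depth consecutive runs of 8 coefficients,
//
//   blockA: [ a(0,0)..a(7,0) | a(0,1)..a(7,1) | ... | a(0,depth-1)..a(7,depth-1) ]
//
// so the kernel can stream whole lhs packets with aligned loads while walking k.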
// Packing of the lhs, row-major source version: tiles are transposed on the fly
// so that blockA ends up in the same kernel-friendly layout as above.
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  // (cj, count, pack and i initializations elided)
  while(pack>0)
  {
    Index remaining_rows = rows-i;
    Index peeled_mc = i+(remaining_rows/pack)*pack;
    for(; i<peeled_mc; i+=pack)
    {
      if(PanelMode) count += pack * offset;

      const Index peeled_k = (depth/PacketSize)*PacketSize;
      Index k=0;
      if(pack>=PacketSize)
      {
        for(; k<peeled_k; k+=PacketSize)
        {
          for (Index m = 0; m < pack; m += PacketSize)
          {
            // Transpose a PacketSize x PacketSize tile so that it lands column-wise in blockA.
            PacketBlock<Packet> kernel;
            for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
            ptranspose(kernel);
            for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
          }
          count += PacketSize*pack;
        }
      }

      // remaining depth: plain scalar copies, 4 rows at a time
      for(; k<depth; k++)
      {
        Index w=0;
        for(; w<pack-3; w+=4)
        {
          Scalar a(cj(lhs(i+w+0, k))),
                 b(cj(lhs(i+w+1, k))),
                 c(cj(lhs(i+w+2, k))),
                 d(cj(lhs(i+w+3, k)));
          blockA[count++] = a;
          blockA[count++] = b;
          blockA[count++] = c;
          blockA[count++] = d;
        }
        if(pack%4)
          for(; w<pack; ++w)
            blockA[count++] = cj(lhs(i+w, k));
      }

      if(PanelMode) count += pack * (stride-offset-depth);
    }

    pack -= PacketSize;
    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
      pack = Pack2;
  }

  // Remaining rows, one at a time
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
// Packing of the rhs: copy a kc x nc block of the rhs into blockB, interleaving
// groups of nr columns. Column-major source version.
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  // (cj and count initializations elided)
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  const Index peeled_k = (depth/PacketSize)*PacketSize;

  // groups of 4 columns
  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
  {
    // skip what we have before
    if(PanelMode) count += 4 * offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
    const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
    const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
    const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

    Index k=0;
    if((PacketSize%4)==0) // vectorized transposition of 4 columns at a time
    {
      for(; k<peeled_k; k+=PacketSize) {
        // (declaration of the 4-packet transposition kernel elided)
        kernel.packet[0]            = dm0.loadPacket(k);
        kernel.packet[1%PacketSize] = dm1.loadPacket(k);
        kernel.packet[2%PacketSize] = dm2.loadPacket(k);
        kernel.packet[3%PacketSize] = dm3.loadPacket(k);
        ptranspose(kernel);
        pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
        pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
        pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
        pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
        count+=4*PacketSize;
      }
    }
    // remaining depth: scalar copies
    for(; k<depth; k++)
    {
      blockB[count+0] = cj(dm0(k));
      blockB[count+1] = cj(dm1(k));
      blockB[count+2] = cj(dm2(k));
      blockB[count+3] = cj(dm3(k));
      count += 4;
    }
    // skip what we have after
    if(PanelMode) count += 4 * (stride-offset-depth);
  }

  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(dm0(k));
      count += 1;
    }
    if(PanelMode) count += (stride-offset-depth);
  }
}
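// Packed rhs layout (illustrative): for nr = 4, each group of 4 columns is
// stored depth-major and interleaved,
//
//   blockB: [ b(0,j) b(0,j+1) b(0,j+2) b(0,j+3) | b(1,j) b(1,j+1) ... ]
//
// so one kernel step can read 4 consecutive coefficients that belong to the
// 4 columns currently being updated.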
// Packing of the rhs, row-major source version.
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  // (cj and count initializations elided)
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;

  // groups of 4 columns: in a row-major rhs they are contiguous, so a single
  // unaligned packet store is enough when PacketSize==4.
  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
  {
    // skip what we have before
    if(PanelMode) count += 4 * offset;
    for(Index k=0; k<depth; k++)
    {
      if (PacketSize==4) {
        Packet A = rhs.loadPacket(k, j2);
        pstoreu(blockB+count, cj.pconj(A));
        count += PacketSize;
      } else {
        const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
        blockB[count+0] = cj(dm0(0));
        blockB[count+1] = cj(dm0(1));
        blockB[count+2] = cj(dm0(2));
        blockB[count+3] = cj(dm0(3));
        count += 4;
      }
    }
    // skip what we have after
    if(PanelMode) count += 4 * (stride-offset-depth);
  }
  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(rhs(k, j2));
      count += 1;
    }
    if(PanelMode) count += stride-offset-depth;
  }
}
// The public accessors l1CacheSize(), l2CacheSize() and l3CacheSize() each read
// the cached values back through manage_caching_sizes(GetAction, &l1, &l2, &l3):
inline std::ptrdiff_t l1CacheSize() { std::ptrdiff_t l1, l2, l3; internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); return l1; }
inline std::ptrdiff_t l2CacheSize() { std::ptrdiff_t l1, l2, l3; internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); return l2; }
inline std::ptrdiff_t l3CacheSize() { std::ptrdiff_t l1, l2, l3; internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); return l3; }

#endif // EIGEN_GENERAL_BLOCK_PANEL_H