#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

namespace Eigen {

namespace internal {

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;
#if EIGEN_ARCH_i386_OR_x86_64
// ... (x86 cache-size defaults elided)
#else
const std::ptrdiff_t defaultL1CacheSize = 16*1024;
const std::ptrdiff_t defaultL2CacheSize = 512*1024;
const std::ptrdiff_t defaultL3CacheSize = 512*1024;
#endif
*l1 = m_cacheSizes.m_l1;
*l2 = m_cacheSizes.m_l2;
*l3 = m_cacheSizes.m_l3;
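// The three assignments above are the GetAction branch of
// manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3),
// which backs the public query/override API. A minimal usage sketch (these entry
// points exist in Eigen; the values returned are platform-dependent):
//
//   std::ptrdiff_t l1 = Eigen::l1CacheSize();                 // forwards to manage_caching_sizes(GetAction, ...)
//   Eigen::setCpuCacheSizes(32*1024, 256*1024, 2*1024*1024);  // override the detected/default sizes
//
// Overriding the cache sizes changes the blocking parameters computed below.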
/** \internal
  * Computes cache-aware blocking sizes for a m x k times k x n product
  * (see the step-by-step comments inside). */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;

  // Query the CPU cache sizes; they drive every blocking decision below.
  std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
      kr = 8,
      mr = Traits::mr,
      nr = Traits::nr
    };
    // Increasing k gives us more time to prefetch the content of the "C" registers.
    // Once the latency is hidden there is no point in increasing k further, so cap it at 320.
    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
    if (k_cache < k) {
      k = k_cache - (k_cache % kr);
      eigen_internal_assert(k > 0);
    }

    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread) {
      // Don't exceed the capacity of the L2 cache.
      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
      n = n_cache - (n_cache % nr);
    } else {
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
    }

    if (l3 > l2) {
      // L3 is shared between all cores, so give each thread its own chunk of it.
      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
      const Index m_per_thread = numext::div_ceil(m, num_threads);
      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
        m = m_cache - (m_cache % mr);
      } else {
        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
      }
    }
  }
  else {
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    // In unit tests we do not want extra large matrices, so fake small caches
    // to exercise the blocking strategy.
    // ... (reduced cache sizes elided)
#endif

    // ---- 1st level of blocking on L1, yields kc ----
    typedef typename Traits::ResScalar ResScalar;
    enum {
      k_peeling = 8,
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
    };

    // max_kc is chosen so that an mr x kc micro panel of the lhs, a kc x nr micro
    // panel of the rhs and one register block of the result all fit in L1; it is
    // also rounded down to a multiple of the inner-loop peeling factor.
    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    const Index old_k = k;
    if(k>max_kc)
    {
      // We are really blocking on the third dimension:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the result.
      k = (k%max_kc)==0 ? max_kc
                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    }
    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864; // == 1.5 MB
#endif

    // nc is chosen so that a kc x nc block of the rhs fits within half of L2;
    // the other half is implicitly reserved for the result and lhs coefficients.
    // When the rhs panel can also fit in L1, prefer the tighter L1 bound:
    Index max_nc;
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1- k_sub - lhs_bytes;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
    {
      // L1 blocking
      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    }
    else
    {
      // L2 blocking
      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
    }
    // WARNING Below, we assume that Traits::nr is a power of two.
    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    if(n>nc)
    {
      // We are really blocking over the columns:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the packed lhs.
      n = (n%nc)==0 ? nc
                    : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
    }
    // ---- 3rd level of blocking, yields mc ----
    Index problem_size = k*n*sizeof(LhsScalar);
    Index actual_lm = actual_l2;
    Index max_mc = m;
    if(problem_size<=1024)
    {
      // problem is small enough to be kept in L1;
      // choose m such that the lhs block fits in 1/3 of L1
      actual_lm = l1;
    }
    else if(l3!=0 && problem_size<=32768)
    {
      // we have both L2 and L3, and the problem is small enough to be kept in L2;
      // choose m such that the lhs block fits in 1/3 of L2
      actual_lm = l2;
      max_mc = (numext::mini<Index>)(576,max_mc);
    }
    Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
    if (mc > Traits::mr) mc -= mc % Traits::mr;
    else if (mc==0) return;
    m = (m%mc)==0 ? mc
                  : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
  }
}
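// Worked example for the kc computation above (illustrative numbers, not measured):
// assume double precision (sizeof == 8), Traits::mr == 12, Traits::nr == 4,
// KcFactor == 1 and l1 == 32*1024. Then
//   k_sub  = 12 * 4 * 8        = 384 bytes   (one register block of the result)
//   k_div  = 1 * (12*8 + 4*8)  = 128 bytes   (lhs+rhs bytes consumed per unit of k)
//   max_kc = ((32768 - 384) / 128) & ~7 = 253 & ~7 = 248
// so a depth of, say, 1000 is swept in slices of at most 248 lhs columns.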
template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#else
  EIGEN_UNUSED_VARIABLE(k)
  EIGEN_UNUSED_VARIABLE(m)
  EIGEN_UNUSED_VARIABLE(n)
#endif
  return false;
}

/** \brief Computes the blocking parameters for a m x k times k x n matrix product. */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
}
template<typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
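// Minimal usage sketch (the variable names are ours; the entry point is the
// overload right above): start from the full problem sizes and let the
// heuristic shrink them to cache-friendly panel sizes.
//
//   Index kc = depth, mc = rows, nc = cols;
//   internal::computeProductBlockingSizes<float,float>(kc, mc, nc, /*num_threads=*/1);
//   // kc, mc, nc now bound the panels packed by gemm_pack_lhs/gemm_pack_rhs below.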
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  // Fallback: emulate a conjugation-aware fused multiply-add.
  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
    {
      c = cj.pmadd(a,b,c);
    }
  };

  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };

  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }
  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
#endif

/* The traits class below encapsulates the per-scalar-type vectorization logic
 * of the general matrix-matrix kernel. */
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction must be 1 or 4
    nr = 4,

    // register block size along the M direction
    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    // with FMA we can afford the taller 3-packet micro kernel
    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
#else
    mr = default_mr,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    pbroadcast4(b, b0, b1, b2, b3);
  }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = ploadquad<RhsPacket>(b);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = pload<LhsPacketType>(a);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploadu<LhsPacketType>(a);
  }

  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
  {
    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
    // It would be cleaner to call pmadd all the time, but without FMA, letting
    // gcc pick the register for the pmul result leads to register spilling.
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c = cj.pmadd(a,b,c);
#else
    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

  template<typename ResPacketHalf>
  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
  {
    r = pmadd(c,alpha,r);
  }
};
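// How the kernel drives a traits object (a sketch with hypothetical locals):
// every micro step of gebp_kernel below is some variation of
//
//   LhsPacket A0;  RhsPacket B_0;  AccPacket C0, T0;
//   traits.initAcc(C0);
//   traits.loadLhs(blA, A0);        // one packet of the packed lhs
//   traits.loadRhs(blB, B_0);       // one broadcast coefficient of the packed rhs
//   traits.madd(A0, B_0, C0, T0);   // C0 += A0 * B_0, conjugating as requested
//   traits.acc(C0, alphav, R0);     // R0 += alpha * C0 before storing back to res
//
// so the scalar-type specializations that follow only need to redefine these hooks.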
// Specialization: complex lhs times real rhs.
template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    mr = 3*LhsPacketSize,
#else
    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };
  // ... (packet typedefs as in the generic case, with AccPacket == ResPacket)

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    // the rhs is real, so a plain broadcast also serves as the quad load
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploadu<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a.v,b,c.v);   // complex packet times real packet: operate on the raw register
#else
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar&, const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};
template<typename Packet>
struct DoublePacket
{
  Packet first;
  Packet second;
};

template<typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Packet>& b)
{
  DoublePacket<Packet> res;
  res.first  = padd(a.first,  b.first);
  res.second = padd(a.second, b.second);
  return res;
}

template<typename Packet>
const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet>& a)
{
  return a;
}
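// Why DoublePacket: for complex*complex products, a rhs coefficient b = br + bi*i
// is broadcast into two real packets (first = br..br, second = bi..bi). The kernel
// accumulates a*first and a*second separately and only recombines them, with the
// requested conjugations and one pcplxflip, in acc(), keeping shuffles out of the
// hot loop. Sketch of the recombination for the no-conjugation case (this is the
// first branch of acc() in the specialization below):
//
//   ResPacket tmp = pcplxflip(pconj(ResPacket(c.second)));
//   tmp = padd(ResPacket(c.first), tmp);   // real/imag lanes now interleaved correctly
//   r   = pmadd(tmp, alpha, r);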
// Specialization: complex lhs times complex rhs.
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs>
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef std::complex<RealScalar> LhsScalar;
  typedef std::complex<RealScalar> RhsScalar;
  typedef std::complex<RealScalar> ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable && packet_traits<Scalar>::Vectorizable,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
    // ... (other packet sizes elided)
    nr = 4,
    mr = ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type     ScalarPacket;
  typedef DoublePacket<RealPacket>                 DoublePacketType;

  typedef typename conditional<Vectorizable,RealPacket,      Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,    Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;

  // ... (initAcc, loadLhs, madd, etc. elided)

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
  {
    dest = pset1<ResPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
  {
    dest.first  = pset1<RealPacket>(real(*b));
    dest.second = pset1<RealPacket>(imag(*b));
  }

  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
  {
    // assemble the complex result from the two half-products
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }
};
// Specialization: real lhs times complex rhs.
template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs>
{
public:
  typedef std::complex<RealScalar> Scalar;
  // ... (scalar/packet typedefs elided)

  enum {
    // ... (packet sizes elided)
    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    // duplicate the real lhs coefficients so that one packet covers both the
    // real and imaginary lanes of the complex result
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a,b.v,c.v);
#else
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};
/* Optimized general block-panel product kernel:
 * computes res += alpha * blockA * blockB, where blockA and blockB are packed panels. */
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
  typedef typename SwappedTraits::ResScalar SResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  typedef typename DataMapper::LinearMapper LinearMapper;

  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize,
    // size of a half packet of the swapped result type (used by the tail path below)
    SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size
  };

  EIGEN_DONT_INLINE
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
               Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  Traits traits;
  SwappedTraits straits;

  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
  enum { pk = 8 }; // NOTE: such a large peeling factor is important for large matrices (~ +5% when rows/cols > 1000 on Haswell)
  const Index peeled_kc  = depth & ~(pk-1);
  const Index prefetch_res_offset = 32/sizeof(ResScalar);
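  // Structure of what follows (a summary): the kernel sweeps the packed blocks
  // at three granularities, 3*LhsProgress, 2*LhsProgress and 1*LhsProgress rows
  // at a time, each crossed with panels of nr (== 4) columns and then with single
  // columns; leftover scalar rows are handled last through the swapped traits so
  // that the k loop can still be vectorized. peeled_mc3/peeled_mc2/peeled_mc1
  // computed above are the row indices where each granularity stops.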
  //---------- Process 3 * LhsProgress rows at once ----------
  if(mr>=3*Traits::LhsProgress)
  {
    // Loop over panels of actual_panel_rows rows, sized so that one rhs panel,
    // one register block of the result, and the lhs micro panels stay in L1:
    const Index l1 = defaultL1CacheSize; // an estimate; only the order of magnitude matters
    const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));

    for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
    {
      const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          // A 3*Traits::LhsProgress x nr micro block of res, held in 3 x nr registers
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
          prefetch(&blA[0]);

          AccPacket C0, C1, C2,  C3,
                    C4, C5, C6,  C7,
                    C8, C9, C10, C11;
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2);  traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6);  traits.initAcc(C7);
          traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

          r0.prefetch(0);
          r1.prefetch(0);
          r2.prefetch(0);
          r3.prefetch(0);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          prefetch(&blB[0]);
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
            RhsPacket B_0, T0;
            LhsPacket A2;

#define EIGEN_GEBP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              internal::prefetch(blA+(3*K+16)*LhsProgress); \
              if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C0, T0); \
              traits.madd(A1, B_0, C4, T0); \
              traits.madd(A2, B_0, C8, B_0); \
              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C1, T0); \
              traits.madd(A1, B_0, C5, T0); \
              traits.madd(A2, B_0, C9, B_0); \
              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C2,  T0); \
              traits.madd(A1, B_0, C6,  T0); \
              traits.madd(A2, B_0, C10, B_0); \
              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C3,  T0); \
              traits.madd(A1, B_0, C7,  T0); \
              traits.madd(A2, B_0, C11, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
            } while(false)

            internal::prefetch(blB);
            EIGEN_GEBP_ONESTEP(0);
            EIGEN_GEBP_ONESTEP(1);
            EIGEN_GEBP_ONESTEP(2);
            EIGEN_GEBP_ONESTEP(3);
            EIGEN_GEBP_ONESTEP(4);
            EIGEN_GEBP_ONESTEP(5);
            EIGEN_GEBP_ONESTEP(6);
            EIGEN_GEBP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
          }
          // process the remaining, non-peeled depth
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, T0;
            LhsPacket A2;
            EIGEN_GEBP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 3*Traits::LhsProgress;
          }
#undef EIGEN_GEBP_ONESTEP

          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C2,  alphav, R0);
          traits.acc(C6,  alphav, R1);
          traits.acc(C10, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C3,  alphav, R0);
          traits.acc(C7,  alphav, R1);
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
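      // Note the read-modify-write pattern above: each result packet is loaded,
      // updated as R += alpha*C via traits.acc (a single pmadd), and stored back,
      // so the kernel computes res += alpha * A*B rather than overwriting res.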
      // Deal with the remaining columns of the rhs, one at a time
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          // A 3*Traits::LhsProgress x 1 micro block of res
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
          prefetch(&blA[0]);

          AccPacket C0, C4, C8;
          traits.initAcc(C0);
          traits.initAcc(C4);
          traits.initAcc(C8);

          LinearMapper r0 = res.getLinearMapper(i, j2);
          r0.prefetch(0);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsPacket A0, A1, A2;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
            RhsPacket B_0;

#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B_0); \
              traits.madd(A1, B_0, C4, B_0); \
              traits.madd(A2, B_0, C8, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
            } while(false)

            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk*RhsProgress;
            blA += pk*3*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
          }

          // process the remaining, non-peeled depth
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += RhsProgress;
            blA += 3*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
    }
  }
  //---------- Process 2 * LhsProgress rows at once ----------
  if(mr>=2*Traits::LhsProgress)
  {
    const Index l1 = defaultL1CacheSize; // an estimate; only the order of magnitude matters
    Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));

    for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
    {
      Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          // A 2*Traits::LhsProgress x nr micro block of res, held in 2 x nr registers
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          prefetch(&blA[0]);

          AccPacket C0, C1, C2, C3,
                    C4, C5, C6, C7;
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          prefetch(&blB[0]);
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
            RhsPacket B_0, B1, B2, B3, T0;

            // Keep A0/A1 from being spilled by gcc >= 6 with SSE:
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__  ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
#else
  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
#endif
#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
              traits.madd(A0, B_0, C0, T0); \
              traits.madd(A1, B_0, C4, B_0); \
              traits.madd(A0, B1,  C1, T0); \
              traits.madd(A1, B1,  C5, B1); \
              traits.madd(A0, B2,  C2, T0); \
              traits.madd(A1, B2,  C6, B2); \
              traits.madd(A0, B3,  C3, T0); \
              traits.madd(A1, B3,  C7, B3); \
              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
            } while(false)

            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*(2*Traits::LhsProgress);

            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
          }
          // process the remaining, non-peeled depth
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, B1, B2, B3, T0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1, R2, R3;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C1, alphav, R2);
          traits.acc(C5, alphav, R3);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(0 * Traits::ResPacketSize, R2);
          r1.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C7, alphav, R3);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
        }
      }
      // Deal with the remaining columns of the rhs, one at a time
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          // A 2*Traits::LhsProgress x 1 micro block of res
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          prefetch(&blA[0]);

          AccPacket C0, C4;
          traits.initAcc(C0);
          traits.initAcc(C4);

          LinearMapper r0 = res.getLinearMapper(i, j2);
          r0.prefetch(prefetch_res_offset);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
            RhsPacket B_0, B1;

#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B1); \
              traits.madd(A1, B_0, C4, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
            } while(false)

            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk*RhsProgress;
            blA += pk*2*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
          }

          // process the remaining, non-peeled depth
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, B1;
            EIGEN_GEBGP_ONESTEP(0);
            blB += RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
        }
      }
    }
  }
  //---------- Process 1 * LhsProgress rows at once ----------
  if(mr>=1*Traits::LhsProgress)
  {
    for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
    {
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        // A 1*Traits::LhsProgress x nr micro block of res, held in 1 x nr registers
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
        prefetch(&blA[0]);

        AccPacket C0, C1, C2, C3;
        traits.initAcc(C0);
        traits.initAcc(C1);
        traits.initAcc(C2);
        traits.initAcc(C3);

        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);

        // performs "inner" products
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        prefetch(&blB[0]);
        LhsPacket A0;

        for(Index k=0; k<peeled_kc; k+=pk)
        {
          EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
          RhsPacket B_0, B1, B2, B3;

#define EIGEN_GEBGP_ONESTEP(K) \
          do { \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
            traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
            traits.madd(A0, B_0, C0, B_0); \
            traits.madd(A0, B1,  C1, B1); \
            traits.madd(A0, B2,  C2, B2); \
            traits.madd(A0, B3,  C3, B3); \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
          } while(false)

          internal::prefetch(blB);
          EIGEN_GEBGP_ONESTEP(0);
          EIGEN_GEBGP_ONESTEP(1);
          EIGEN_GEBGP_ONESTEP(2);
          EIGEN_GEBGP_ONESTEP(3);
          EIGEN_GEBGP_ONESTEP(4);
          EIGEN_GEBGP_ONESTEP(5);
          EIGEN_GEBGP_ONESTEP(6);
          EIGEN_GEBGP_ONESTEP(7);

          blB += pk*4*RhsProgress;
          blA += pk*1*LhsProgress;

          EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
        }

        // process the remaining, non-peeled depth
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacket B_0, B1, B2, B3;
          EIGEN_GEBGP_ONESTEP(0);
          blB += 4*RhsProgress;
          blA += 1*LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP

        ResPacket R0, R1;
        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = r0.loadPacket(0 * Traits::ResPacketSize);
        R1 = r1.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        r0.storePacket(0 * Traits::ResPacketSize, R0);
        r1.storePacket(0 * Traits::ResPacketSize, R1);

        R0 = r2.loadPacket(0 * Traits::ResPacketSize);
        R1 = r3.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C2, alphav, R0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0 * Traits::ResPacketSize, R0);
        r3.storePacket(0 * Traits::ResPacketSize, R1);
      }
      // Deal with the remaining columns of the rhs, one at a time
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        // A 1*Traits::LhsProgress x 1 micro block of res
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
        prefetch(&blA[0]);

        AccPacket C0;
        traits.initAcc(C0);

        LinearMapper r0 = res.getLinearMapper(i, j2);

        // performs "inner" products
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        LhsPacket A0;

        for(Index k=0; k<peeled_kc; k+=pk)
        {
          EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
          RhsPacket B_0;

#define EIGEN_GEBGP_ONESTEP(K) \
          do { \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
            traits.madd(A0, B_0, C0, B_0); \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
          } while(false)

          EIGEN_GEBGP_ONESTEP(0);
          EIGEN_GEBGP_ONESTEP(1);
          EIGEN_GEBGP_ONESTEP(2);
          EIGEN_GEBGP_ONESTEP(3);
          EIGEN_GEBGP_ONESTEP(4);
          EIGEN_GEBGP_ONESTEP(5);
          EIGEN_GEBGP_ONESTEP(6);
          EIGEN_GEBGP_ONESTEP(7);

          blB += pk*RhsProgress;
          blA += pk*1*Traits::LhsProgress;

          EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
        }

        // process the remaining, non-peeled depth
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacket B_0;
          EIGEN_GEBGP_ONESTEP(0);
          blB += RhsProgress;
          blA += 1*Traits::LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP

        ResPacket R0;
        ResPacket alphav = pset1<ResPacket>(alpha);
        R0 = r0.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0 * Traits::ResPacketSize, R0);
      }
    }
  }
  //---------- Process remaining rows, 1 at once ----------
  if(peeled_mc1<rows)
  {
    // loop on each panel of the rhs
    for(Index j2=0; j2<packet_cols4; j2+=nr)
    {
      // loop on each remaining row of the lhs
      for(Index i=peeled_mc1; i<rows; i+=1)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

        // Vectorized path: compute the 1 x nr block by vectorizing along the
        // depth dimension through the swapped traits.
        if ((SwappedTraits::LhsProgress % 4) == 0 &&
            (SwappedTraits::LhsProgress <= 8) &&
            (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
        {
          SAccPacket C0, C1, C2, C3;
          straits.initAcc(C0);
          straits.initAcc(C1);
          straits.initAcc(C2);
          straits.initAcc(C3);

          const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
          const Index endk  = (depth/spk)*spk;
          const Index endk4 = (depth/(spk*4))*(spk*4);

          Index k=0;
          for(; k<endk4; k+=4*spk)
          {
            SLhsPacket A0, A1;
            SRhsPacket B_0, B_1;

            straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);

            straits.loadRhsQuad(blA+0*spk, B_0);
            straits.loadRhsQuad(blA+1*spk, B_1);
            straits.madd(A0,B_0,C0,B_0);
            straits.madd(A1,B_1,C1,B_1);

            straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
            straits.loadRhsQuad(blA+2*spk, B_0);
            straits.loadRhsQuad(blA+3*spk, B_1);
            straits.madd(A0,B_0,C2,B_0);
            straits.madd(A1,B_1,C3,B_1);

            blB += 4*SwappedTraits::LhsProgress;
            blA += 4*spk;
          }
          C0 = padd(padd(C0,C1),padd(C2,C3));
          for(; k<endk; k+=spk)
          {
            SLhsPacket A0;
            SRhsPacket B_0;

            straits.loadLhsUnaligned(blB, A0);
            straits.loadRhsQuad(blA, B_0);
            straits.madd(A0,B_0,C0,B_0);

            blB += SwappedTraits::LhsProgress;
            blA += spk;
          }
          if(SwappedTraits::LhsProgress==8)
          {
            // Special case: reduce the 8-wide accumulator to a half packet first
            typedef typename conditional<(SwappedTraits::LhsProgress>=8),typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
            typedef typename conditional<(SwappedTraits::LhsProgress>=8),typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
            typedef typename conditional<(SwappedTraits::LhsProgress>=8),typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
            typedef typename conditional<(SwappedTraits::LhsProgress>=8),typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;

            SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
            SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);

            if(depth-endk>0)
            {
              // handle the leftover depth, which corresponds to a half packet
              SLhsPacketHalf a0;
              SRhsPacketHalf b0;
              straits.loadLhsUnaligned(blB, a0);
              straits.loadRhs(blA, b0);
              SAccPacketHalf c0 = predux_downto4(C0);
              straits.madd(a0,b0,c0,b0);
              straits.acc(c0, alphav, R);
            }
            else
            {
              straits.acc(predux_downto4(C0), alphav, R);
            }
            res.scatterPacket(i, j2, R);
          }
          else
          {
            SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
            SResPacket alphav = pset1<SResPacket>(alpha);
            straits.acc(C0, alphav, R);
            res.scatterPacket(i, j2, R);
          }
        }
        else // scalar fallback for the 1 x nr block
        {
          ResScalar C0(0), C1(0), C2(0), C3(0);

          for(Index k=0; k<depth; k++)
          {
            LhsScalar A0;
            RhsScalar B_0, B_1;

            A0 = blA[k];

            B_0 = blB[0];
            B_1 = blB[1];
            CJMADD(cj,A0,B_0,C0, B_0);
            CJMADD(cj,A0,B_1,C1, B_1);

            B_0 = blB[2];
            B_1 = blB[3];
            CJMADD(cj,A0,B_0,C2, B_0);
            CJMADD(cj,A0,B_1,C3, B_1);

            blB += 4;
          }
          res(i, j2 + 0) += alpha * C0;
          res(i, j2 + 1) += alpha * C1;
          res(i, j2 + 2) += alpha * C2;
          res(i, j2 + 3) += alpha * C3;
        }
      }
    }
    // remaining single columns
    for(Index j2=packet_cols4; j2<cols; j2++)
    {
      for(Index i=peeled_mc1; i<rows; i+=1)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);
        // gets a 1 x 1 res block as registers
        ResScalar C0(0);
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<depth; k++)
        {
          LhsScalar A0 = blA[k];
          RhsScalar B_0 = blB[k];
          CJMADD(cj, A0, B_0, C0, B_0);
        }
        res(i, j2) += alpha * C0;
      }
    }
  }
}
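// Packing, sketched. gemm_pack_lhs below copies a rows x depth block of the lhs
// into blockA so that the kernel above can stream it with aligned packet loads:
// rows are grouped into micro panels of Pack1 (e.g. 3*PacketSize) rows, and within
// a panel the data is laid out k-major; e.g. for a 4-row panel:
//
//   blockA: a(i+0,k) a(i+1,k) a(i+2,k) a(i+3,k)  a(i+0,k+1) a(i+1,k+1) ...
//
// gemm_pack_rhs does the symmetric job for nr-column panels of the rhs.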
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index count = 0;

  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
                         : Pack2>1             ? (rows/Pack2)*Pack2 : 0;

  Index i=0;

  // Pack 3 packets
  if(Pack1>=3*PacketSize)
  {
    for(; i<peeled_mc3; i+=3*PacketSize)
    {
      if(PanelMode) count += (3*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A, B, C;
        A = lhs.loadPacket(i+0*PacketSize, k);
        B = lhs.loadPacket(i+1*PacketSize, k);
        C = lhs.loadPacket(i+2*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
      }
      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack 2 packets
  if(Pack1>=2*PacketSize)
  {
    for(; i<peeled_mc2; i+=2*PacketSize)
    {
      if(PanelMode) count += (2*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A, B;
        A = lhs.loadPacket(i+0*PacketSize, k);
        B = lhs.loadPacket(i+1*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
      }
      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack 1 packet
  if(Pack1>=1*PacketSize)
  {
    for(; i<peeled_mc1; i+=1*PacketSize)
    {
      if(PanelMode) count += (1*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A;
        A = lhs.loadPacket(i+0*PacketSize, k);
        pstore(blockA+count, cj.pconj(A));
        count+=PacketSize;
      }
      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack scalars
  if(Pack2<PacketSize && Pack2>1)
  {
    for(; i<peeled_mc0; i+=Pack2)
    {
      if(PanelMode) count += Pack2 * offset;

      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(i+w, k));

      if(PanelMode) count += Pack2 * (stride-offset-depth);
    }
  }
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
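// How the pieces fit together, as a simplified sketch with hypothetical index
// variables (the real driver loops live in GeneralMatrixMatrix.h, not here):
//
//   gemm_pack_lhs<Scalar,Index,LhsMapper,Traits::mr,Traits::LhsProgress,ColMajor,false,false>()
//       (blockA, lhs.getSubMapper(i2,k2), kc, mc);
//   gemm_pack_rhs<Scalar,Index,RhsMapper,Traits::nr,false,false>()
//       (blockB, rhs.getSubMapper(k2,j2), kc, nc);
//   gebp_kernel<Scalar,Scalar,Index,ResMapper,Traits::mr,Traits::nr,false,false>()
//       (res.getSubMapper(i2,j2), blockA, blockB, mc, kc, nc, alpha);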
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index count = 0;

  // pack rows in groups of Pack1, then Pack2, then 1
  int pack = Pack1;
  Index i = 0;
  while(pack>0)
  {
    Index remaining_rows = rows-i;
    Index peeled_mc = i+(remaining_rows/pack)*pack;
    for(; i<peeled_mc; i+=pack)
    {
      if(PanelMode) count += pack * offset;

      const Index peeled_k = (depth/PacketSize)*PacketSize;
      Index k=0;
      if(pack>=PacketSize)
      {
        // the lhs is row-major, so transpose PacketSize x PacketSize tiles on the fly
        for(; k<peeled_k; k+=PacketSize)
        {
          for (Index m = 0; m < pack; m += PacketSize)
          {
            PacketBlock<Packet> kernel;
            for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
            ptranspose(kernel);
            for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
          }
          count += PacketSize*pack;
        }
      }
      for(; k<depth; k++)
      {
        Index w=0;
        for(; w<pack-3; w+=4)
        {
          Scalar a(cj(lhs(i+w+0, k))),
                 b(cj(lhs(i+w+1, k))),
                 c(cj(lhs(i+w+2, k))),
                 d(cj(lhs(i+w+3, k)));
          blockA[count++] = a;
          blockA[count++] = b;
          blockA[count++] = c;
          blockA[count++] = d;
        }
        if(pack%4)
          for(; w<pack; ++w)
            blockA[count++] = cj(lhs(i+w, k));
      }
      if(PanelMode) count += pack * (stride-offset-depth);
    }

    pack -= PacketSize;
    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
      pack = Pack2;
  }

  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;
  const Index peeled_k = (depth/PacketSize)*PacketSize;
  // ... (nr == 8 path elided; nr is currently 4)
  if(nr>=4)
  {
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

      Index k=0;
      if((PacketSize%4)==0) // TODO: enable vectorized transposition for PacketSize==2 ??
      {
        for(; k<peeled_k; k+=PacketSize)
        {
          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
          kernel.packet[0           ] = dm0.loadPacket(k);
          kernel.packet[1%PacketSize] = dm1.loadPacket(k);
          kernel.packet[2%PacketSize] = dm2.loadPacket(k);
          kernel.packet[3%PacketSize] = dm3.loadPacket(k);
          ptranspose(kernel);
          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
          count+=4*PacketSize;
        }
      }
      for(; k<depth; k++)
      {
        blockB[count+0] = cj(dm0(k));
        blockB[count+1] = cj(dm1(k));
        blockB[count+2] = cj(dm2(k));
        blockB[count+3] = cj(dm3(k));
        count += 4;
      }
      // skip what we have after
      if(PanelMode) count += 4 * (stride-offset-depth);
    }
  }

  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(dm0(k));
      count += 1;
    }
    if(PanelMode) count += (stride-offset-depth);
  }
}
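// After this packing, blockB holds the rhs panel column-interleaved in groups of
// nr == 4: b(k,j) b(k,j+1) b(k,j+2) b(k,j+3) for k = 0..depth-1, which is exactly
// the order in which the micro kernels above read it (blB + (0..3 + 4*K)*RhsProgress).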
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;

  // ... (nr == 8 path elided; nr is currently 4)
  if(nr>=4)
  {
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
      for(Index k=0; k<depth; k++)
      {
        if (PacketSize==4) {
          // the rhs is row-major, so 4 consecutive coefficients of a row are one packet
          Packet A = rhs.loadPacket(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += PacketSize;
        } else {
          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
          blockB[count+0] = cj(dm0(0));
          blockB[count+1] = cj(dm0(1));
          blockB[count+2] = cj(dm0(2));
          blockB[count+3] = cj(dm0(3));
          count += 4;
        }
      }
      // skip what we have after
      if(PanelMode) count += 4 * (stride-offset-depth);
    }
  }
  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(rhs(k, j2));
      count += 1;
    }
    if(PanelMode) count += stride-offset-depth;
  }
}
} // end namespace internal

/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l2;
}

/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l3CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l3;
}

/** Set the cpu L1, L2 and L3 cache sizes (in bytes).
  * These values are used to adjust the size of the blocks for the algorithms working per blocks. */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
}

} // end namespace Eigen

#endif // EIGEN_GENERAL_BLOCK_PANEL_H