10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11 #define EIGEN_GENERAL_BLOCK_PANEL_H
// Primary declaration of the gebp (general block-panel) traits template.
// Parameterized on the two operand scalar types plus optional conjugation
// flags for each side (defaulting to no conjugation).
18 template<
typename _LhsScalar,
typename _RhsScalar,
bool _ConjLhs=false,
bool _ConjRhs=false>
28 #if EIGEN_ARCH_i386_OR_x86_64
// Setter path: store the caller-provided L1/L2/L3 cache sizes into the
// cached singleton (m_cacheSizes).
63 m_cacheSizes.
m_l1 = *l1;
64 m_cacheSizes.
m_l2 = *l2;
65 m_cacheSizes.
m_l3 = *l3;
// Getter path: report the cached L1/L2/L3 sizes back through the pointers.
70 *l1 = m_cacheSizes.
m_l1;
71 *l2 = m_cacheSizes.
m_l2;
72 *l3 = m_cacheSizes.
m_l3;
// Blocking-size heuristic, multi-threaded branch: shrink the k/m/n blocking
// factors so each thread's working set maps onto the L1/L2/L3 cache levels.
// k, m, n are in/out; KcFactor scales the k-blocking budget.
92 template<
typename LhsScalar,
typename RhsScalar,
int KcFactor,
typename Index>
102 std::ptrdiff_t l1, l2, l3;
105 if (num_threads > 1) {
106 typedef typename Traits::ResScalar ResScalar;
// kdiv: bytes consumed per unit of k by one (mr x nr) register tile;
// ksub: bytes of the accumulator tile itself (subtracted from the L1 budget).
108 kdiv = KcFactor * (Traits::mr *
sizeof(LhsScalar) + Traits::nr *
sizeof(RhsScalar)),
109 ksub = Traits::mr * Traits::nr *
sizeof(ResScalar),
// Cap k so the streamed data fits in L1 (320 is an empirical upper bound),
// then round down to a multiple of the register-blocking granularity kr.
118 const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
120 k = k_cache - (k_cache % kr);
// n is bounded by what fits in L2 beyond the L1-resident data,
// rounded to a multiple of nr and clamped to the per-thread share.
124 const Index n_cache = (l2-l1) / (nr *
sizeof(RhsScalar) * k);
126 if (n_cache <= n_per_thread) {
129 n = n_cache - (n_cache % nr);
132 n = (numext::mini<Index>)(
n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
// m is bounded by the per-thread share of L3 beyond L2, rounded to mr.
137 const Index m_cache = (l3-l2) / (
sizeof(LhsScalar) * k * num_threads);
139 if(m_cache < m_per_thread && m_cache >=
static_cast<Index>(mr)) {
140 m = m_cache - (m_cache % mr);
143 m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
150 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
// Single-thread branch: same per-cache-level budgeting, plus logic to split
// each dimension into near-equal sweeps.
163 typedef typename Traits::ResScalar ResScalar;
// k_div / k_sub: per-unit-of-k byte cost and accumulator-tile size, as above.
166 k_div = KcFactor * (Traits::mr *
sizeof(LhsScalar) + Traits::nr *
sizeof(RhsScalar)),
167 k_sub = Traits::mr * Traits::nr *
sizeof(ResScalar)
// Largest k fitting in L1, rounded down to a multiple of the peeling factor.
177 const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
178 const Index old_k = k;
// Split k into near-equal chunks no larger than max_kc; the assert below
// checks the number of depth sweeps is unchanged by the equalization.
184 k = (k%max_kc)==0 ? max_kc
185 : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
187 eigen_internal_assert(((old_k/k) == (old_k/max_kc)) &&
"the number of sweeps has to remain the same");
// Effective L2 budget: the real L3 size when debugging small blocks,
// otherwise a fixed 1572864-byte (1.5 MB) assumption.
196 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
197 const Index actual_l2 = l3;
199 const Index actual_l2 = 1572864;
// Budget the rhs panel width (nc) out of whatever L1 remains after the
// lhs block; fall back to an L2-derived bound otherwise.
209 const Index lhs_bytes = m * k *
sizeof(LhsScalar);
210 const Index remaining_l1 = l1- k_sub - lhs_bytes;
211 if(remaining_l1 >=
Index(Traits::nr*
sizeof(RhsScalar))*k)
214 max_nc = remaining_l1 / (k*
sizeof(RhsScalar));
219 max_nc = (3*actual_l2)/(2*2*max_kc*
sizeof(RhsScalar));
// nc: L2-bounded column block, rounded down to a multiple of nr.
222 Index nc = numext::mini<Index>(actual_l2/(2*k*
sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
230 : (nc - Traits::nr * ((nc-(
n%nc))/(Traits::nr*(
n/nc+1))));
// Finally pick the row block mc from the L2/L3 budget, with special-cased
// caps for small problems (<=1KB, <=32KB).
237 Index problem_size = k*
n*
sizeof(LhsScalar);
238 Index actual_lm = actual_l2;
240 if(problem_size<=1024)
246 else if(l3!=0 && problem_size<=32768)
251 max_mc = (numext::mini<Index>)(576,max_mc);
253 Index mc = (numext::mini<Index>)(actual_lm/(3*k*
sizeof(LhsScalar)), max_mc);
// Round mc to a multiple of mr; bail out entirely if nothing fits.
254 if (mc > Traits::mr) mc -= mc % Traits::mr;
255 else if (mc==0)
return;
257 : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
262 template <
typename Index>
// Testing hook: when EIGEN_TEST_SPECIFIC_BLOCKING_SIZES is defined, clamp
// the computed blocking sizes to the user-forced K/M/N values.
265 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
266 if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
267 k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
268 m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
269 n = numext::mini<Index>(
n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
// Public entry point: forwards to the cache-aware heuristic above.
296 template<
typename LhsScalar,
typename RhsScalar,
int KcFactor,
typename Index>
300 evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m,
n, num_threads);
// Convenience overload with KcFactor fixed to 1.
304 template<
typename LhsScalar,
typename RhsScalar,
typename Index>
307 computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m,
n, num_threads);
// CJMADD(CJ,A,B,C,T): conjugate-aware multiply-accumulate, C += CJ(A)*B.
// With a single-instruction fused madd available it expands to one pmadd;
// otherwise it routes through gebp_madd, which uses T as scratch storage.
310 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
311 #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
// Fallback: t = b; t = cj.pmul(a,t); c = padd(c,t);  (split across lines here)
326 t =
b; t = cj.pmul(
a,t);
c =
padd(
c,t);
330 template<
typename CJ,
typename A,
typename B,
typename C,
typename T>
336 #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
// gebp_traits definition and specializations: map scalar-type pairs to the
// packet types, register-blocking factors (mr/nr), and the load/madd/acc
// micro-operations consumed by the gebp kernel below.
350 template<
typename _LhsScalar,
typename _RhsScalar,
bool _ConjLhs,
bool _ConjRhs>
// Broadcast one rhs coefficient into a packet (pset1) or load four at
// quarter-packet granularity (ploadquad).
411 template<
typename RhsPacketType>
414 dest = pset1<RhsPacketType>(*
b);
419 dest = ploadquad<RhsPacket>(
b);
// Aligned (pload) and unaligned (ploadu) lhs packet loads.
422 template<
typename LhsPacketType>
425 dest = pload<LhsPacketType>(
a);
428 template<
typename LhsPacketType>
431 dest = ploadu<LhsPacketType>(
a);
434 template<
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType>
442 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
455 template<
typename ResPacketHalf>
// Specialization fragment taking <RealScalar, _ConjLhs>; presumably the
// mixed complex-lhs x real-rhs traits -- confirm against the full file.
463 template<
typename RealScalar,
bool _ConjLhs>
481 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
509 dest = pset1<RhsPacket>(*
b);
514 dest = pset1<RhsPacket>(*
b);
519 dest = pload<LhsPacket>(
a);
524 dest = ploadu<LhsPacket>(
a);
544 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
566 template<
typename Packet>
573 template<
typename Packet>
582 template<
typename Packet>
// Specialization fragment with both conjugation flags; presumably the
// complex x complex traits -- confirm against the full file.
598 template<
typename RealScalar,
bool _ConjLhs,
bool _ConjRhs>
645 dest = pset1<ResPacket>(*
b);
// Specialization fragment taking <RealScalar, _ConjRhs>; lhs loads use
// ploaddup (duplicating loads), consistent with a real-lhs x complex-rhs
// layout -- confirm against the full file.
746 template<
typename RealScalar,
bool _ConjRhs>
790 dest = pset1<RhsPacket>(*
b);
807 dest = ploaddup<LhsPacket>(
a);
818 dest = ploaddup<LhsPacket>(
a);
828 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
// gebp_kernel: the general block-panel product kernel. Consumes packed lhs
// (blockA) and rhs (blockB) blocks and accumulates alpha-scaled products
// into the result via the DataMapper (see the `res(i,j) += alpha*C` tail
// path further below). mr/nr are the register-blocking factors.
858 template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
885 void operator()(
const DataMapper& res,
const LhsScalar* blockA,
const RhsScalar* blockB,
// Out-of-line definition of operator() starts here.
890 template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
893 ::operator()(
const DataMapper& res,
const LhsScalar* blockA,
const RhsScalar* blockB,
// A stride of -1 means the packed blocks are dense: stride == depth.
900 if(strideA==-1) strideA = depth;
901 if(strideB==-1) strideB = depth;
// Peeling bounds: columns are processed 4 at a time when nr allows it;
// rows are processed in strips of 3, 2, then 1 packet widths.
903 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
904 const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
905 const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
906 const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
// Depth rounded down to the loop-peeling factor pk.
908 const Index peeled_kc = depth & ~(pk-1);
// ---- Path 1: rows in panels of 3*LhsProgress, 4 columns at a time. ----
// Uses 12 accumulators C0..C11 (a 3-packet x 4-column register tile).
915 if(mr>=3*Traits::LhsProgress)
// Panel height sized so the lhs panel plus the rhs/result slices stay
// resident in L1 (l1 is the L1 cache size queried earlier).
926 const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 -
sizeof(
ResScalar)*mr*nr - depth*nr*
sizeof(RhsScalar)) / (depth *
sizeof(LhsScalar) * 3*LhsProgress) ));
927 for(
Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
929 const Index actual_panel_end = (
std::min)(i1+actual_panel_rows, peeled_mc3);
930 for(
Index j2=0; j2<packet_cols4; j2+=nr)
932 for(
Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
938 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
960 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
// Main depth loop, peeled pk iterations per pass.
964 for(
Index k=0; k<peeled_kc; k+=pk)
// One rank-1 update step: load 3 lhs packets, then for each of the 4 rhs
// coefficients broadcast it and madd into the corresponding accumulators.
// B_0 is reused as the scratch operand of the last madd per column.
970 #define EIGEN_GEBP_ONESTEP(K) \
972 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
973 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
974 internal::prefetch(blA+(3*K+16)*LhsProgress); \
975 if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } \
976 traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
977 traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
978 traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
979 traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
980 traits.madd(A0, B_0, C0, T0); \
981 traits.madd(A1, B_0, C4, T0); \
982 traits.madd(A2, B_0, C8, B_0); \
983 traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
984 traits.madd(A0, B_0, C1, T0); \
985 traits.madd(A1, B_0, C5, T0); \
986 traits.madd(A2, B_0, C9, B_0); \
987 traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
988 traits.madd(A0, B_0, C2, T0); \
989 traits.madd(A1, B_0, C6, T0); \
990 traits.madd(A2, B_0, C10, B_0); \
991 traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
992 traits.madd(A0, B_0, C3 , T0); \
993 traits.madd(A1, B_0, C7, T0); \
994 traits.madd(A2, B_0, C11, B_0); \
995 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1008 blB += pk*4*RhsProgress;
1009 blA += pk*3*Traits::LhsProgress;
// Leftover depth iterations (depth not a multiple of pk).
1014 for(
Index k=peeled_kc; k<depth; k++)
1019 blB += 4*RhsProgress;
1020 blA += 3*Traits::LhsProgress;
1023 #undef EIGEN_GEBP_ONESTEP
// Scale accumulators by alpha and add into the result, one column (r0..r3)
// of three result packets at a time.
1028 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1029 R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1030 R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1031 traits.acc(C0, alphav, R0);
1032 traits.acc(C4, alphav, R1);
1033 traits.acc(C8, alphav, R2);
1034 r0.storePacket(0 * Traits::ResPacketSize, R0);
1035 r0.storePacket(1 * Traits::ResPacketSize, R1);
1036 r0.storePacket(2 * Traits::ResPacketSize, R2);
1038 R0 = r1.loadPacket(0 * Traits::ResPacketSize);
1039 R1 = r1.loadPacket(1 * Traits::ResPacketSize);
1040 R2 = r1.loadPacket(2 * Traits::ResPacketSize);
1041 traits.acc(C1, alphav, R0);
1042 traits.acc(C5, alphav, R1);
1043 traits.acc(C9, alphav, R2);
1044 r1.storePacket(0 * Traits::ResPacketSize, R0);
1045 r1.storePacket(1 * Traits::ResPacketSize, R1);
1046 r1.storePacket(2 * Traits::ResPacketSize, R2);
1048 R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1049 R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1050 R2 = r2.loadPacket(2 * Traits::ResPacketSize);
1051 traits.acc(C2, alphav, R0);
1052 traits.acc(C6, alphav, R1);
1053 traits.acc(C10, alphav, R2);
1054 r2.storePacket(0 * Traits::ResPacketSize, R0);
1055 r2.storePacket(1 * Traits::ResPacketSize, R1);
1056 r2.storePacket(2 * Traits::ResPacketSize, R2);
1058 R0 = r3.loadPacket(0 * Traits::ResPacketSize);
1059 R1 = r3.loadPacket(1 * Traits::ResPacketSize);
1060 R2 = r3.loadPacket(2 * Traits::ResPacketSize);
1061 traits.acc(C3, alphav, R0);
1062 traits.acc(C7, alphav, R1);
1063 traits.acc(C11, alphav, R2);
1064 r3.storePacket(0 * Traits::ResPacketSize, R0);
1065 r3.storePacket(1 * Traits::ResPacketSize, R1);
1066 r3.storePacket(2 * Traits::ResPacketSize, R2);
// Leftover columns (cols % 4 != 0): single-column 3pX1 variant.
1071 for(
Index j2=packet_cols4; j2<cols; j2++)
1073 for(
Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1076 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1089 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1092 for(
Index k=0; k<peeled_kc; k+=pk)
// Same step with 3 lhs packets but a single rhs coefficient.
1096 #define EIGEN_GEBGP_ONESTEP(K) \
1098 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1099 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1100 traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1101 traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1102 traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1103 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1104 traits.madd(A0, B_0, C0, B_0); \
1105 traits.madd(A1, B_0, C4, B_0); \
1106 traits.madd(A2, B_0, C8, B_0); \
1107 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1119 blB += pk*RhsProgress;
1120 blA += pk*3*Traits::LhsProgress;
1126 for(
Index k=peeled_kc; k<depth; k++)
1131 blA += 3*Traits::LhsProgress;
1133 #undef EIGEN_GEBGP_ONESTEP
// Accumulate the single column into the result.
1137 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1138 R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1139 R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1140 traits.acc(C0, alphav, R0);
1141 traits.acc(C4, alphav, R1);
1142 traits.acc(C8, alphav, R2);
1143 r0.storePacket(0 * Traits::ResPacketSize, R0);
1144 r0.storePacket(1 * Traits::ResPacketSize, R1);
1145 r0.storePacket(2 * Traits::ResPacketSize, R2);
// ---- Path 2: rows in panels of 2*LhsProgress (8 accumulators C0..C7). ----
1152 if(mr>=2*Traits::LhsProgress)
// Same L1-resident panel-height computation as the 3p path, for 2 packets.
1158 Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 -
sizeof(
ResScalar)*mr*nr - depth*nr*
sizeof(RhsScalar)) / (depth *
sizeof(LhsScalar) * 2*LhsProgress) ));
1160 for(
Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1162 Index actual_panel_end = (
std::min)(i1+actual_panel_rows, peeled_mc2);
1163 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1165 for(
Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1171 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
// Prefetch the four result columns before the depth loop.
1185 r0.prefetch(prefetch_res_offset);
1186 r1.prefetch(prefetch_res_offset);
1187 r2.prefetch(prefetch_res_offset);
1188 r3.prefetch(prefetch_res_offset);
1191 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1195 for(
Index k=0; k<peeled_kc; k+=pk)
// Empty asm barrier working around a GCC>=6 + SSE register-spilling issue.
1202 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1203 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1205 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
// 2-packet x 4-column step: broadcast all four rhs coefficients at once.
1207 #define EIGEN_GEBGP_ONESTEP(K) \
1209 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1210 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1211 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1212 traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1213 traits.madd(A0, B_0, C0, T0); \
1214 traits.madd(A1, B_0, C4, B_0); \
1215 traits.madd(A0, B1, C1, T0); \
1216 traits.madd(A1, B1, C5, B1); \
1217 traits.madd(A0, B2, C2, T0); \
1218 traits.madd(A1, B2, C6, B2); \
1219 traits.madd(A0, B3, C3, T0); \
1220 traits.madd(A1, B3, C7, B3); \
1221 EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1222 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1236 blB += pk*4*RhsProgress;
1237 blA += pk*(2*Traits::LhsProgress);
// Leftover depth iterations.
1242 for(
Index k=peeled_kc; k<depth; k++)
1246 blB += 4*RhsProgress;
1247 blA += 2*Traits::LhsProgress;
1249 #undef EIGEN_GEBGP_ONESTEP
// Accumulate into the four result columns, two packets each.
1254 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1255 R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1256 R2 = r1.loadPacket(0 * Traits::ResPacketSize);
1257 R3 = r1.loadPacket(1 * Traits::ResPacketSize);
1258 traits.acc(C0, alphav, R0);
1259 traits.acc(C4, alphav, R1);
1260 traits.acc(C1, alphav, R2);
1261 traits.acc(C5, alphav, R3);
1262 r0.storePacket(0 * Traits::ResPacketSize, R0);
1263 r0.storePacket(1 * Traits::ResPacketSize, R1);
1264 r1.storePacket(0 * Traits::ResPacketSize, R2);
1265 r1.storePacket(1 * Traits::ResPacketSize, R3);
1267 R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1268 R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1269 R2 = r3.loadPacket(0 * Traits::ResPacketSize);
1270 R3 = r3.loadPacket(1 * Traits::ResPacketSize);
1271 traits.acc(C2, alphav, R0);
1272 traits.acc(C6, alphav, R1);
1273 traits.acc(C3, alphav, R2);
1274 traits.acc(C7, alphav, R3);
1275 r2.storePacket(0 * Traits::ResPacketSize, R0);
1276 r2.storePacket(1 * Traits::ResPacketSize, R1);
1277 r3.storePacket(0 * Traits::ResPacketSize, R2);
1278 r3.storePacket(1 * Traits::ResPacketSize, R3);
// Leftover columns: 2pX1 variant.
1283 for(
Index j2=packet_cols4; j2<cols; j2++)
1285 for(
Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1288 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1297 r0.prefetch(prefetch_res_offset);
1300 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1303 for(
Index k=0; k<peeled_kc; k+=pk)
1308 #define EIGEN_GEBGP_ONESTEP(K) \
1310 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1311 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1312 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1313 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1314 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1315 traits.madd(A0, B_0, C0, B1); \
1316 traits.madd(A1, B_0, C4, B_0); \
1317 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1329 blB += pk*RhsProgress;
1330 blA += pk*2*Traits::LhsProgress;
1336 for(
Index k=peeled_kc; k<depth; k++)
1341 blA += 2*Traits::LhsProgress;
1343 #undef EIGEN_GEBGP_ONESTEP
// Accumulate the single column.
1347 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1348 R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1349 traits.acc(C0, alphav, R0);
1350 traits.acc(C4, alphav, R1);
1351 r0.storePacket(0 * Traits::ResPacketSize, R0);
1352 r0.storePacket(1 * Traits::ResPacketSize, R1);
// ---- Path 3: rows one packet (1*LhsProgress) at a time, C0..C3. ----
1358 if(mr>=1*Traits::LhsProgress)
1361 for(
Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
1364 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1369 const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
// Prefetch the four result columns.
1384 r0.prefetch(prefetch_res_offset);
1385 r1.prefetch(prefetch_res_offset);
1386 r2.prefetch(prefetch_res_offset);
1387 r3.prefetch(prefetch_res_offset);
1390 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1394 for(
Index k=0; k<peeled_kc; k+=pk)
// 1-packet x 4-column step.
1399 #define EIGEN_GEBGP_ONESTEP(K) \
1401 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1402 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1403 traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1404 traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1405 traits.madd(A0, B_0, C0, B_0); \
1406 traits.madd(A0, B1, C1, B1); \
1407 traits.madd(A0, B2, C2, B2); \
1408 traits.madd(A0, B3, C3, B3); \
1409 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1423 blB += pk*4*RhsProgress;
1424 blA += pk*1*LhsProgress;
// Leftover depth iterations.
1429 for(
Index k=peeled_kc; k<depth; k++)
1433 blB += 4*RhsProgress;
1434 blA += 1*LhsProgress;
1436 #undef EIGEN_GEBGP_ONESTEP
// Accumulate into the four result columns, one packet each.
1441 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1442 R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1443 traits.acc(C0, alphav, R0);
1444 traits.acc(C1, alphav, R1);
1445 r0.storePacket(0 * Traits::ResPacketSize, R0);
1446 r1.storePacket(0 * Traits::ResPacketSize, R1);
1448 R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1449 R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1450 traits.acc(C2, alphav, R0);
1451 traits.acc(C3, alphav, R1);
1452 r2.storePacket(0 * Traits::ResPacketSize, R0);
1453 r3.storePacket(0 * Traits::ResPacketSize, R1);
// Leftover columns: 1pX1 variant.
1457 for(
Index j2=packet_cols4; j2<cols; j2++)
1460 const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1470 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1473 for(
Index k=0; k<peeled_kc; k+=pk)
1478 #define EIGEN_GEBGP_ONESTEP(K) \
1480 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1481 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1482 traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1483 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1484 traits.madd(A0, B_0, C0, B_0); \
1485 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1497 blB += pk*RhsProgress;
1498 blA += pk*1*Traits::LhsProgress;
1504 for(
Index k=peeled_kc; k<depth; k++)
1509 blA += 1*Traits::LhsProgress;
1511 #undef EIGEN_GEBGP_ONESTEP
// Accumulate the single packet.
1514 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1515 traits.acc(C0, alphav, R0);
1516 r0.storePacket(0 * Traits::ResPacketSize, R0);
// ---- Tail path: remaining rows (rows % LhsProgress) handled one row at a
// time. When the swapped traits allow it, the row is still vectorized by
// treating the rhs as the packetized side (straits = swapped traits).
1524 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1527 for(
Index i=peeled_mc1; i<rows; i+=1)
1529 const LhsScalar* blA = &blockA[i*strideA+offsetA];
1531 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
// Vectorized single-row path: only for 4- or 8-wide swapped progress.
1537 if ((SwappedTraits::LhsProgress % 4) == 0 &&
1538 (SwappedTraits::LhsProgress <= 8) &&
1539 (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
1547 const Index spk = (
std::max)(1,SwappedTraits::LhsProgress/4);
1548 const Index endk = (depth/spk)*spk;
1549 const Index endk4 = (depth/(spk*4))*(spk*4);
// Depth loop unrolled by 4*spk, two accumulator pairs (C0/C1 and C2/C3).
1552 for(; k<endk4; k+=4*spk)
1562 straits.
madd(A0,B_0,C0,B_0);
1563 straits.
madd(A1,B_1,C1,B_1);
1569 straits.
madd(A0,B_0,C2,B_0);
1570 straits.
madd(A1,B_1,C3,B_1);
1572 blB += 4*SwappedTraits::LhsProgress;
// Remaining spk-sized depth steps.
1576 for(; k<endk; k+=spk)
1583 straits.
madd(A0,B_0,C0,B_0);
1585 blB += SwappedTraits::LhsProgress;
// 8-wide case: finish in half-packets, gathering/scattering the result.
1588 if(SwappedTraits::LhsProgress==8)
1596 SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1597 SResPacketHalf alphav = pset1<SResPacketHalf>(
alpha);
1607 straits.
madd(a0,b0,c0,b0);
1608 straits.
acc(c0, alphav, R);
1614 res.scatterPacket(i, j2, R);
// 4-wide case: gather, accumulate, scatter a full swapped result packet.
1618 SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
1620 straits.
acc(C0, alphav, R);
1621 res.scatterPacket(i, j2, R);
// Pure scalar fallback for this row: four scalar accumulators over depth.
1629 for(
Index k=0; k<depth; k++)
1638 CJMADD(cj,A0,B_0,C0, B_0);
1639 CJMADD(cj,A0,B_1,C1, B_1);
1643 CJMADD(cj,A0,B_0,C2, B_0);
1644 CJMADD(cj,A0,B_1,C3, B_1);
// Scale by alpha and accumulate into the four result columns.
1648 res(i, j2 + 0) +=
alpha * C0;
1649 res(i, j2 + 1) +=
alpha * C1;
1650 res(i, j2 + 2) +=
alpha * C2;
1651 res(i, j2 + 3) +=
alpha * C3;
// Remaining scalar columns x scalar rows: plain dot-product loop.
1656 for(
Index j2=packet_cols4; j2<cols; j2++)
1659 for(
Index i=peeled_mc1; i<rows; i+=1)
1661 const LhsScalar* blA = &blockA[i*strideA+offsetA];
1665 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1666 for(
Index k=0; k<depth; k++)
1668 LhsScalar A0 = blA[k];
1669 RhsScalar B_0 = blB[k];
1670 CJMADD(cj, A0, B_0, C0, B_0);
1672 res(i, j2) +=
alpha * C0;
// gemm_pack_lhs (first specialization visible here): copies an (rows x depth)
// lhs block into the contiguous buffer blockA, in strips of 3/2/1 packet
// widths, optionally conjugating (cj) each value. In PanelMode, `offset` and
// `stride` leave per-panel gaps in the packed buffer.
1695 template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
bool Conjugate,
bool PanelMode>
1702 template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
bool Conjugate,
bool PanelMode>
// Panel-mode bookkeeping must be self-consistent; Pack1 must be a packet
// multiple (up to 4 packets) or a small scalar count.
1712 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1713 eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
// Row bounds for the 3-, 2-, 1-packet and sub-packet (Pack2) strips.
1717 const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1718 const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1719 const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1720 const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
1721 : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
// Strip of 3 packets per iteration.
1726 if(Pack1>=3*PacketSize)
1728 for(; i<peeled_mc3; i+=3*PacketSize)
1730 if(PanelMode) count += (3*PacketSize) * offset;
1732 for(
Index k=0; k<depth; k++)
1735 A = lhs.loadPacket(i+0*PacketSize, k);
1736 B = lhs.loadPacket(i+1*PacketSize, k);
1737 C = lhs.loadPacket(i+2*PacketSize, k);
1738 pstore(blockA+count, cj.pconj(
A)); count+=PacketSize;
1739 pstore(blockA+count, cj.pconj(
B)); count+=PacketSize;
1740 pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
1742 if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
// Strip of 2 packets.
1746 if(Pack1>=2*PacketSize)
1748 for(; i<peeled_mc2; i+=2*PacketSize)
1750 if(PanelMode) count += (2*PacketSize) * offset;
1752 for(
Index k=0; k<depth; k++)
1755 A = lhs.loadPacket(i+0*PacketSize, k);
1756 B = lhs.loadPacket(i+1*PacketSize, k);
1757 pstore(blockA+count, cj.pconj(
A)); count+=PacketSize;
1758 pstore(blockA+count, cj.pconj(
B)); count+=PacketSize;
1760 if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
// Strip of 1 packet.
1764 if(Pack1>=1*PacketSize)
1766 for(; i<peeled_mc1; i+=1*PacketSize)
1768 if(PanelMode) count += (1*PacketSize) * offset;
1770 for(
Index k=0; k<depth; k++)
1773 A = lhs.loadPacket(i+0*PacketSize, k);
1774 pstore(blockA+count, cj.pconj(
A));
1777 if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
// Sub-packet strips of Pack2 scalars.
1781 if(Pack2<PacketSize && Pack2>1)
1783 for(; i<peeled_mc0; i+=Pack2)
1785 if(PanelMode) count += Pack2 * offset;
1787 for(
Index k=0; k<depth; k++)
1788 for(
Index w=0; w<Pack2; w++)
1789 blockA[count++] = cj(lhs(i+w, k));
1791 if(PanelMode) count += Pack2 * (stride-offset-depth);
// Remaining single rows, packed scalar by scalar.
1796 if(PanelMode) count += offset;
1797 for(
Index k=0; k<depth; k++)
1798 blockA[count++] = cj(lhs(i, k));
1799 if(PanelMode) count += (stride-offset-depth);
// gemm_pack_lhs (second specialization visible here): packs in strips of
// `pack` rows, using an in-register transpose (ptranspose on a PacketBlock)
// for the packet-aligned part of the depth, then scalar copies for the rest.
1803 template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
bool Conjugate,
bool PanelMode>
1810 template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
bool Conjugate,
bool PanelMode>
1820 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride))
;
1832 Index remaining_rows = rows-i;
1833 Index peeled_mc = i+(remaining_rows/pack)*pack;
1834 for(; i<peeled_mc; i+=pack)
1836 if(PanelMode) count += pack * offset;
1838 const Index peeled_k = (depth/PacketSize)*PacketSize;
// Packet-aligned depth: load PacketSize packets, transpose, store so the
// packed layout interleaves `pack` rows per depth step.
1840 if(pack>=PacketSize)
1842 for(; k<peeled_k; k+=PacketSize)
1844 for (
Index m = 0; m < pack; m += PacketSize)
1847 for (
int p = 0; p < PacketSize; ++p) kernel.
packet[p] = lhs.loadPacket(i+p+m, k);
1849 for (
int p = 0; p < PacketSize; ++p)
pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.
packet[p]));
1851 count += PacketSize*pack;
// Scalar tail over the row strip, unrolled 4 at a time.
1857 for(; w<pack-3; w+=4)
1860 b(cj(lhs(i+w+1, k))),
1861 c(cj(lhs(i+w+2, k))),
1862 d(cj(lhs(i+w+3, k)));
1863 blockA[count++] =
a;
1864 blockA[count++] =
b;
1865 blockA[count++] =
c;
1866 blockA[count++] = d;
1870 blockA[count++] = cj(lhs(i+w, k));
1873 if(PanelMode) count += pack * (stride-offset-depth);
// Step down to the next smaller pack size when possible.
1877 if(pack<Pack2 && (pack+PacketSize)!=Pack2)
// Remaining single rows, packed scalar by scalar.
1883 if(PanelMode) count += offset;
1884 for(
Index k=0; k<depth; k++)
1885 blockA[count++] = cj(lhs(i, k));
1886 if(PanelMode) count += (stride-offset-depth);
// gemm_pack_rhs (first specialization visible here): packs the rhs in column
// groups of 8 (elided) and 4, transposing PacketSize x 4 tiles in registers
// when the packet width allows it.
1897 template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
1906 template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
1913 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
// Column bounds for the 8- and 4-column groups.
1915 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
1916 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1918 const Index peeled_k = (depth/PacketSize)*PacketSize;
// 4-column groups.
1967 for(
Index j2=packet_cols8; j2<packet_cols4; j2+=4)
1970 if(PanelMode) count += 4 * offset;
1971 const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
1972 const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
1973 const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
1974 const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
// Packet path: load a packet from each of the 4 columns, transpose
// (elided), and store the interleaved result.
1977 if((PacketSize%4)==0)
1979 for(; k<peeled_k; k+=PacketSize) {
1981 kernel.
packet[0] = dm0.loadPacket(k);
1982 kernel.packet[1%PacketSize] = dm1.loadPacket(k);
1983 kernel.packet[2%PacketSize] = dm2.loadPacket(k);
1984 kernel.packet[3%PacketSize] = dm3.loadPacket(k);
1986 pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
1987 pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
1988 pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
1989 pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
1990 count+=4*PacketSize;
// Scalar tail: one value from each of the 4 columns per depth step.
1995 blockB[count+0] = cj(dm0(k));
1996 blockB[count+1] = cj(dm1(k));
1997 blockB[count+2] = cj(dm2(k));
1998 blockB[count+3] = cj(dm3(k));
2002 if(PanelMode) count += 4 * (stride-offset-depth);
// Remaining single columns.
2007 for(
Index j2=packet_cols4; j2<cols; ++j2)
2009 if(PanelMode) count += offset;
2011 for(
Index k=0; k<depth; k++)
2013 blockB[count] = cj(dm0(k));
2016 if(PanelMode) count += (stride-offset-depth);
// gemm_pack_rhs (second specialization visible here): rhs rows are already
// contiguous per k, so 4-column groups can be copied with a single packet
// load/store per depth step when PacketSize==4.
2021 template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2030 template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2037 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2039 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2040 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
// 4-column groups.
2078 for(
Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2081 if(PanelMode) count += 4 * offset;
2082 for(
Index k=0; k<depth; k++)
// Packet path: one 4-wide load spans the 4 columns at row k.
2084 if (PacketSize==4) {
2085 Packet A = rhs.loadPacket(k, j2);
2086 pstoreu(blockB+count, cj.pconj(
A));
2087 count += PacketSize;
// Scalar path: copy the 4 column values individually.
2090 blockB[count+0] = cj(dm0(0));
2091 blockB[count+1] = cj(dm0(1));
2092 blockB[count+2] = cj(dm0(2));
2093 blockB[count+3] = cj(dm0(3));
2098 if(PanelMode) count += 4 * (stride-offset-depth);
// Remaining single columns.
2102 for(
Index j2=packet_cols4; j2<cols; ++j2)
2104 if(PanelMode) count += offset;
2105 for(
Index k=0; k<depth; k++)
2107 blockB[count] = cj(rhs(k, j2));
2110 if(PanelMode) count += stride-offset-depth;
// Fragments of three cache-size query helpers (bodies elided): each declares
// l1/l2/l3 locals, presumably filled by manage_caching_sizes -- confirm
// against the full file.
2120 std::ptrdiff_t l1, l2, l3;
2129 std::ptrdiff_t l1, l2, l3;
2139 std::ptrdiff_t l1, l2, l3;
2156 #endif // EIGEN_GENERAL_BLOCK_PANEL_H