#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;

#if EIGEN_ARCH_i386_OR_x86_64
const std::ptrdiff_t defaultL1CacheSize = 16*1024;
const std::ptrdiff_t defaultL2CacheSize = 512*1024;
const std::ptrdiff_t defaultL3CacheSize = 512*1024;
      // store the user-provided sizes
      m_cacheSizes.m_l1 = *l1;
      m_cacheSizes.m_l2 = *l2;
      m_cacheSizes.m_l3 = *l3;

      // report the cached sizes
      *l1 = m_cacheSizes.m_l1;
      *l2 = m_cacheSizes.m_l2;
      *l3 = m_cacheSizes.m_l3;
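// Illustrative usage sketch (not part of the original source): the cached sizes above are
// exposed through the public helpers declared near the end of this header, so user code can
// query or override them before triggering large products. The byte counts below are
// hypothetical.
//
//   std::ptrdiff_t l1 = Eigen::l1CacheSize();
//   std::ptrdiff_t l2 = Eigen::l2CacheSize();
//   std::ptrdiff_t l3 = Eigen::l3CacheSize();
//   Eigen::setCpuCacheSizes(32*1024, 256*1024, 8*1024*1024);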
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  std::ptrdiff_t l1, l2, l3;

  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
    ksub = Traits::mr * Traits::nr * sizeof(ResScalar),

    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
      k = k_cache - (k_cache % kr);

    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    if (n_cache <= n_per_thread) {
      n = n_cache - (n_cache % nr);
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));

    const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
    if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
      m = m_cache - (m_cache % mr);
      m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS

    typedef typename Traits::ResScalar ResScalar;
    k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
    k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)

    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)), 1);
    const Index old_k = k;

      k = (k%max_kc)==0 ? max_kc
        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
    #else
    const Index actual_l2 = 1572864; // == 1.5 MB
    #endif

    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    else
      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));

    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
        : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
    Index problem_size = k*n*sizeof(LhsScalar);
    Index actual_lm = actual_l2;
    if(problem_size<=1024)
    else if(l3!=0 && problem_size<=32768)
      max_mc = (numext::mini<Index>)(576,max_mc);

    Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
    if (mc > Traits::mr) mc -= mc % Traits::mr;
    else if (mc==0) return;
       : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
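// Illustrative worked example (assumes AVX doubles, i.e. hypothetical Traits::mr == 12,
// Traits::nr == 4, sizeof(double) == 8, and a 32 KB L1 cache; the real values come from
// gebp_traits and the detected cache sizes):
//   k_div  = 1 * (12*8 + 4*8) = 128 bytes consumed per unit of depth
//   k_sub  = 12 * 4 * 8       = 384 bytes for the register-blocked result tile
//   max_kc ~ (32768 - 384) / 128 = 253, rounded down to a multiple of the peeling factor
// so the depth dimension is sliced into panels of roughly that many coefficients.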
template <typename Index>
bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
}

template<typename LhsScalar, typename RhsScalar, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
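// Illustrative usage sketch (not in the original source): a blocked GEMM driver queries the
// panel sizes before packing, e.g.
//
//   Index kc = depth, mc = rows, nc = cols;                       // hypothetical dimensions
//   computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, num_threads);
//   // kc, mc, nc now hold the block sizes used to slice the packed operands.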
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  // generic fallback used by gebp_madd:
  t = b; t = cj.pmul(a,t); c = padd(c,t);

template<typename CJ, typename A, typename B, typename C, typename T>

#define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
    mr = Vectorizable ? 3*LhsPacketSize : default_mr,

    LhsProgress = LhsPacketSize,

  p = pset1<ResPacket>(ResScalar(0));

  template<typename RhsPacketType>
    dest = pset1<RhsPacketType>(*b);

    dest = ploadquad<RhsPacket>(b);

  template<typename LhsPacketType>
    dest = pload<LhsPacketType>(a);

  template<typename LhsPacketType>
    dest = ploadu<LhsPacketType>(a);

  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD

    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);

    r = pmadd(c,alpha,r);

  template<typename ResPacketHalf>
    r = pmadd(c,alpha,r);
template<typename RealScalar, bool _ConjLhs>

#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    mr = 3*LhsPacketSize,

    LhsProgress = LhsPacketSize,

  p = pset1<ResPacket>(ResScalar(0));

  dest = pset1<RhsPacket>(*b);

  dest = pset1<RhsPacket>(*b);

  dest = pload<LhsPacket>(a);

  dest = ploadu<LhsPacket>(a);

#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    c.v = pmadd(a.v,b,c.v);

    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);

  r = cj.pmadd(c,alpha,r);
template<typename Packet>

template<typename Packet>

template<typename Packet>

template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>

    LhsProgress = ResPacketSize,

  dest = pset1<ResPacket>(*b);

  dest.first = pset1<RealPacket>(real(*b));

    if((!ConjLhs)&&(!ConjRhs))

    else if((!ConjLhs)&&(ConjRhs))

    else if((ConjLhs)&&(!ConjRhs))

    else if((ConjLhs)&&(ConjRhs))

  r = pmadd(tmp,alpha,r);
template<typename RealScalar, bool _ConjRhs>

    LhsProgress = ResPacketSize,

  p = pset1<ResPacket>(ResScalar(0));

  dest = pset1<RhsPacket>(*b);

  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)

  dest = ploaddup<LhsPacket>(a);

  dest = ploaddup<LhsPacket>(a);

#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    c.v = pmadd(a,b.v,c.v);

    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);

  r = cj.pmadd(alpha,c,r);
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
               Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;

  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
  const Index peeled_kc  = depth & ~(pk-1);
  const Index prefetch_res_offset = 32/sizeof(ResScalar);
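  // Illustrative note (not in the original source): the kernel below processes the packed
  // operands in register-blocked panels. In the widest path, 3 LHS packets (A0,A1,A2) are
  // combined with 4 broadcast RHS values per step, updating a 3x4 grid of accumulators
  // C0..C11; a hypothetical scalar analogue of one such step is
  //   for (r = 0; r < 3; ++r)
  //     for (c = 0; c < 4; ++c)
  //       C[r][c] += A[r] * B[c];
  // The peeled_mc3/peeled_mc2/peeled_mc1 bounds computed above mark where the 3-, 2- and
  // 1-packet paths stop and the next narrower path takes over.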
  //---------- Process 3*LhsProgress rows at once ----------
  if(mr>=3*Traits::LhsProgress)

    const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));

    for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)

      const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);

      for(Index j2=0; j2<packet_cols4; j2+=nr)

        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)

          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];

          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

          for(Index k=0; k<peeled_kc; k+=pk)

#define EIGEN_GEBP_ONESTEP(K) \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
            internal::prefetch(blA+(3*K+16)*LhsProgress); \
            if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } \
            traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
            traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
            traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
            traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
            traits.madd(A0, B_0, C0, T0); \
            traits.madd(A1, B_0, C4, T0); \
            traits.madd(A2, B_0, C8, B_0); \
            traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
            traits.madd(A0, B_0, C1, T0); \
            traits.madd(A1, B_0, C5, T0); \
            traits.madd(A2, B_0, C9, B_0); \
            traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
            traits.madd(A0, B_0, C2, T0); \
            traits.madd(A1, B_0, C6, T0); \
            traits.madd(A2, B_0, C10, B_0); \
            traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
            traits.madd(A0, B_0, C3, T0); \
            traits.madd(A1, B_0, C7, T0); \
            traits.madd(A2, B_0, C11, B_0); \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");

            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;
          for(Index k=peeled_kc; k<depth; k++)

            blB += 4*RhsProgress;
            blA += 3*Traits::LhsProgress;

#undef EIGEN_GEBP_ONESTEP

          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C10, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C3, alphav, R0);
          traits.acc(C7, alphav, R1);
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);
        // deal with the remaining columns (cols % 4), still 3*LhsProgress rows at once
        for(Index j2=packet_cols4; j2<cols; j2++)

          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)

            const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];

            const RhsScalar* blB = &blockB[j2*strideB+offsetB];

            for(Index k=0; k<peeled_kc; k+=pk)

#define EIGEN_GEBGP_ONESTEP(K) \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B_0); \
              traits.madd(A1, B_0, C4, B_0); \
              traits.madd(A2, B_0, C8, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");

              blB += pk*RhsProgress;
              blA += pk*3*Traits::LhsProgress;

            for(Index k=peeled_kc; k<depth; k++)

              blA += 3*Traits::LhsProgress;

#undef EIGEN_GEBGP_ONESTEP

            R0 = r0.loadPacket(0 * Traits::ResPacketSize);
            R1 = r0.loadPacket(1 * Traits::ResPacketSize);
            R2 = r0.loadPacket(2 * Traits::ResPacketSize);
            traits.acc(C0, alphav, R0);
            traits.acc(C4, alphav, R1);
            traits.acc(C8, alphav, R2);
            r0.storePacket(0 * Traits::ResPacketSize, R0);
            r0.storePacket(1 * Traits::ResPacketSize, R1);
            r0.storePacket(2 * Traits::ResPacketSize, R2);
  //---------- Process 2*LhsProgress rows at once ----------
  if(mr>=2*Traits::LhsProgress)

    Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));

    for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)

      Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);

      for(Index j2=0; j2<packet_cols4; j2+=nr)

        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)

          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];

          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);

          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

          for(Index k=0; k<peeled_kc; k+=pk)

#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__  ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
#else
  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
#endif
#define EIGEN_GEBGP_ONESTEP(K) \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
            traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
            traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
            traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
            traits.madd(A0, B_0, C0, T0); \
            traits.madd(A1, B_0, C4, B_0); \
            traits.madd(A0, B1, C1, T0); \
            traits.madd(A1, B1, C5, B1); \
            traits.madd(A0, B2, C2, T0); \
            traits.madd(A1, B2, C6, B2); \
            traits.madd(A0, B3, C3, T0); \
            traits.madd(A1, B3, C7, B3); \
            EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");

            blB += pk*4*RhsProgress;
            blA += pk*(2*Traits::LhsProgress);
          for(Index k=peeled_kc; k<depth; k++)

            blB += 4*RhsProgress;
            blA += 2*Traits::LhsProgress;

#undef EIGEN_GEBGP_ONESTEP

          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C1, alphav, R2);
          traits.acc(C5, alphav, R3);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(0 * Traits::ResPacketSize, R2);
          r1.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C7, alphav, R3);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
        for(Index j2=packet_cols4; j2<cols; j2++)

          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)

            const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];

            r0.prefetch(prefetch_res_offset);

            const RhsScalar* blB = &blockB[j2*strideB+offsetB];

            for(Index k=0; k<peeled_kc; k+=pk)

#define EIGEN_GEBGP_ONESTEP(K) \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B1); \
              traits.madd(A1, B_0, C4, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");

              blB += pk*RhsProgress;
              blA += pk*2*Traits::LhsProgress;

            for(Index k=peeled_kc; k<depth; k++)

              blA += 2*Traits::LhsProgress;

#undef EIGEN_GEBGP_ONESTEP

            R0 = r0.loadPacket(0 * Traits::ResPacketSize);
            R1 = r0.loadPacket(1 * Traits::ResPacketSize);
            traits.acc(C0, alphav, R0);
            traits.acc(C4, alphav, R1);
            r0.storePacket(0 * Traits::ResPacketSize, R0);
            r0.storePacket(1 * Traits::ResPacketSize, R1);
  //---------- Process 1*LhsProgress rows at once ----------
  if(mr>=1*Traits::LhsProgress)

    for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)

      for(Index j2=0; j2<packet_cols4; j2+=nr)

        const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];

        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

        for(Index k=0; k<peeled_kc; k+=pk)

#define EIGEN_GEBGP_ONESTEP(K) \
          EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
          EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
          traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
          traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
          traits.madd(A0, B_0, C0, B_0); \
          traits.madd(A0, B1, C1, B1); \
          traits.madd(A0, B2, C2, B2); \
          traits.madd(A0, B3, C3, B3); \
          EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4");

          blB += pk*4*RhsProgress;
          blA += pk*1*LhsProgress;
        for(Index k=peeled_kc; k<depth; k++)

          blB += 4*RhsProgress;
          blA += 1*LhsProgress;

#undef EIGEN_GEBGP_ONESTEP

        R0 = r0.loadPacket(0 * Traits::ResPacketSize);
        R1 = r1.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        r0.storePacket(0 * Traits::ResPacketSize, R0);
        r1.storePacket(0 * Traits::ResPacketSize, R1);

        R0 = r2.loadPacket(0 * Traits::ResPacketSize);
        R1 = r3.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C2, alphav, R0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0 * Traits::ResPacketSize, R0);
        r3.storePacket(0 * Traits::ResPacketSize, R1);
      // remaining columns (cols % 4), 1*LhsProgress rows at once
      for(Index j2=packet_cols4; j2<cols; j2++)

        const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];

        const RhsScalar* blB = &blockB[j2*strideB+offsetB];

        for(Index k=0; k<peeled_kc; k+=pk)

#define EIGEN_GEBGP_ONESTEP(K) \
          EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
          EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
          traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
          traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
          traits.madd(A0, B_0, C0, B_0); \
          EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1");

          blB += pk*RhsProgress;
          blA += pk*1*Traits::LhsProgress;

        for(Index k=peeled_kc; k<depth; k++)

          blA += 1*Traits::LhsProgress;

#undef EIGEN_GEBGP_ONESTEP

        R0 = r0.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0 * Traits::ResPacketSize, R0);
  //---------- Process the remaining rows one at a time ----------
  for(Index j2=0; j2<packet_cols4; j2+=nr)

    for(Index i=peeled_mc1; i<rows; i+=1)

      const LhsScalar* blA = &blockA[i*strideA+offsetA];

      const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

      // vectorized path using the swapped (RHS-major) traits
      if( (SwappedTraits::LhsProgress % 4) == 0 &&
          (SwappedTraits::LhsProgress <= 8) &&
          (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) )

        const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
        const Index endk  = (depth/spk)*spk;
        const Index endk4 = (depth/(spk*4))*(spk*4);

        for(; k<endk4; k+=4*spk)

          straits.madd(A0,B_0,C0,B_0);
          straits.madd(A1,B_1,C1,B_1);

          straits.madd(A0,B_0,C2,B_0);
          straits.madd(A1,B_1,C3,B_1);

          blB += 4*SwappedTraits::LhsProgress;

        for(; k<endk; k+=spk)

          straits.madd(A0,B_0,C0,B_0);

          blB += SwappedTraits::LhsProgress;

        if(SwappedTraits::LhsProgress==8)

          SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
          SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);

          straits.madd(a0,b0,c0,b0);
          straits.acc(c0, alphav, R);

          res.scatterPacket(i, j2, R);

          SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
          straits.acc(C0, alphav, R);
          res.scatterPacket(i, j2, R);

      // scalar fallback path
        ResScalar C0(0), C1(0), C2(0), C3(0);

        for(Index k=0; k<depth; k++)

          CJMADD(cj,A0,B_0,C0, B_0);
          CJMADD(cj,A0,B_1,C1, B_1);

          CJMADD(cj,A0,B_0,C2, B_0);
          CJMADD(cj,A0,B_1,C3, B_1);

        res(i, j2 + 0) += alpha * C0;
        res(i, j2 + 1) += alpha * C1;
        res(i, j2 + 2) += alpha * C2;
        res(i, j2 + 3) += alpha * C3;

  // remaining columns of the remaining rows
  for(Index j2=packet_cols4; j2<cols; j2++)

    for(Index i=peeled_mc1; i<rows; i+=1)

      const LhsScalar* blA = &blockA[i*strideA+offsetA];

      const RhsScalar* blB = &blockB[j2*strideB+offsetB];
      for(Index k=0; k<depth; k++)

        LhsScalar A0  = blA[k];
        RhsScalar B_0 = blB[k];
        CJMADD(cj, A0, B_0, C0, B_0);

      res(i, j2) += alpha * C0;
// packing of the LHS, column-major case
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>

  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );

  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
                         : Pack2>1             ? (rows/Pack2)*Pack2 : 0;
  // pack rows in chunks of 3*PacketSize
  if(Pack1>=3*PacketSize)

    for(; i<peeled_mc3; i+=3*PacketSize)

      if(PanelMode) count += (3*PacketSize) * offset;

      for(Index k=0; k<depth; k++)

        A = lhs.loadPacket(i+0*PacketSize, k);
        B = lhs.loadPacket(i+1*PacketSize, k);
        C = lhs.loadPacket(i+2*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;

      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);

  // pack rows in chunks of 2*PacketSize
  if(Pack1>=2*PacketSize)

    for(; i<peeled_mc2; i+=2*PacketSize)

      if(PanelMode) count += (2*PacketSize) * offset;

      for(Index k=0; k<depth; k++)

        A = lhs.loadPacket(i+0*PacketSize, k);
        B = lhs.loadPacket(i+1*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;

      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);

  // pack rows in chunks of 1*PacketSize
  if(Pack1>=1*PacketSize)

    for(; i<peeled_mc1; i+=1*PacketSize)

      if(PanelMode) count += (1*PacketSize) * offset;

      for(Index k=0; k<depth; k++)

        A = lhs.loadPacket(i+0*PacketSize, k);
        pstore(blockA+count, cj.pconj(A));

      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);

  // pack rows in chunks of Pack2 (< PacketSize)
  if(Pack2<PacketSize && Pack2>1)

    for(; i<peeled_mc0; i+=Pack2)

      if(PanelMode) count += Pack2 * offset;

      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(i+w, k));

      if(PanelMode) count += Pack2 * (stride-offset-depth);

  // pack the remaining rows one by one
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
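// Illustrative note (not in the original source): after packing, blockA stores the LHS as a
// sequence of panels, each Pack1 (or PacketSize) rows wide and `depth` columns deep, laid out
// contiguously so the kernel above can stream through it with aligned packet loads. A scalar
// sketch of the idea, with hypothetical names:
//
//   for (Index p = 0; p < rows; p += panel_rows)            // one panel of the LHS
//     for (Index k = 0; k < depth; ++k)                     // one column of that panel
//       for (Index r = 0; r < panel_rows; ++r)
//         blockA[count++] = lhs(p + r, k);                  // panel rows become contiguous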
// packing of the LHS, row-major case
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>

  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));

  Index remaining_rows = rows-i;
  Index peeled_mc = i+(remaining_rows/pack)*pack;
  for(; i<peeled_mc; i+=pack)

    if(PanelMode) count += pack * offset;

    const Index peeled_k = (depth/PacketSize)*PacketSize;

    if(pack>=PacketSize)

      for(; k<peeled_k; k+=PacketSize)

        // transpose PacketSize x PacketSize tiles on the fly
        for (Index m = 0; m < pack; m += PacketSize)

          for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);

          for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));

        count += PacketSize*pack;

      for(; w<pack-3; w+=4)

          b(cj(lhs(i+w+1, k))),
          c(cj(lhs(i+w+2, k))),
          d(cj(lhs(i+w+3, k)));
        blockA[count++] = a;
        blockA[count++] = b;
        blockA[count++] = c;
        blockA[count++] = d;

        blockA[count++] = cj(lhs(i+w, k));

    if(PanelMode) count += pack * (stride-offset-depth);

  if(pack<Pack2 && (pack+PacketSize)!=Pack2)

  // remaining rows one by one
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
// packing of the RHS, column-major case
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));

  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;

  const Index peeled_k = (depth/PacketSize)*PacketSize;

  // pack columns in groups of 4
  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)

    if(PanelMode) count += 4 * offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
    const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
    const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
    const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

    if((PacketSize%4)==0)

      for(; k<peeled_k; k+=PacketSize) {

        kernel.packet[0]            = dm0.loadPacket(k);
        kernel.packet[1%PacketSize] = dm1.loadPacket(k);
        kernel.packet[2%PacketSize] = dm2.loadPacket(k);
        kernel.packet[3%PacketSize] = dm3.loadPacket(k);

        pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
        pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
        pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
        pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
        count+=4*PacketSize;
      }

      blockB[count+0] = cj(dm0(k));
      blockB[count+1] = cj(dm1(k));
      blockB[count+2] = cj(dm2(k));
      blockB[count+3] = cj(dm3(k));

    if(PanelMode) count += 4 * (stride-offset-depth);

  // copy the remaining columns one at a time
  for(Index j2=packet_cols4; j2<cols; ++j2)

    if(PanelMode) count += offset;

    for(Index k=0; k<depth; k++)

      blockB[count] = cj(dm0(k));

    if(PanelMode) count += (stride-offset-depth);
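// Illustrative note (not in the original source): the packed RHS interleaves nr (here 4)
// columns so that one packed "row" holds the nr coefficients the kernel broadcasts together.
// A scalar sketch of the layout, with hypothetical names:
//
//   for (Index j = 0; j < cols; j += 4)             // one group of 4 columns
//     for (Index k = 0; k < depth; ++k)
//       for (Index c = 0; c < 4; ++c)
//         blockB[count++] = rhs(k, j + c);          // 4 columns interleaved per k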
// packing of the RHS, row-major case
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));

  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;

  // pack columns in groups of 4
  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)

    if(PanelMode) count += 4 * offset;
    for(Index k=0; k<depth; k++)

      if (PacketSize==4) {
        Packet A = rhs.loadPacket(k, j2);
        pstoreu(blockB+count, cj.pconj(A));
        count += PacketSize;
      } else {
        blockB[count+0] = cj(dm0(0));
        blockB[count+1] = cj(dm0(1));
        blockB[count+2] = cj(dm0(2));
        blockB[count+3] = cj(dm0(3));
      }

    if(PanelMode) count += 4 * (stride-offset-depth);

  // copy the remaining columns one at a time
  for(Index j2=packet_cols4; j2<cols; ++j2)

    if(PanelMode) count += offset;

    for(Index k=0; k<depth; k++)

      blockB[count] = cj(rhs(k, j2));

    if(PanelMode) count += stride-offset-depth;
// l1CacheSize()
  std::ptrdiff_t l1, l2, l3;

// l2CacheSize()
  std::ptrdiff_t l1, l2, l3;

// l3CacheSize()
  std::ptrdiff_t l1, l2, l3;

#endif // EIGEN_GENERAL_BLOCK_PANEL_H