#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

namespace Eigen {
namespace internal {

template<typename _LhsScalar, typename _RhsScalar,
         bool _ConjLhs=false, bool _ConjRhs=false,
         int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
class gebp_traits;
#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)

#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)

#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)

#if EIGEN_ARCH_i386_OR_x86_64
// ... (per-architecture defaultL1/L2/L3CacheSize constants elided) ...
#endif

#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE

inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
{
  // ... (static CacheSizes initialization and the SetAction branch elided;
  // on GetAction the cached values are reported:) ...
  *l1 = m_cacheSizes.m_l1;
  *l2 = m_cacheSizes.m_l2;
  *l3 = m_cacheSizes.m_l3;
}
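// Usage sketch (illustrative, not part of the header's own docs): on targets
// where the cache sizes cannot be queried from the CPU, the fallback defaults
// can be overridden at compile time before including any Eigen header, e.g.
//
//   // g++ -DEIGEN_DEFAULT_L1_CACHE_SIZE=32768 -DEIGEN_DEFAULT_L2_CACHE_SIZE=1048576 ...
//   #include <Eigen/Dense>
//
// The EIGEN_SET_DEFAULT_L*_CACHE_SIZE(val) helpers above expand either to the
// user-supplied macro or to the built-in default `val`.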
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;

  std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
  #ifdef EIGEN_VECTORIZE_AVX512
  // With AVX512 the blocking size k would otherwise become very small in order
  // to keep one lhs panel within L1, which hurts reuse of the accumulation
  // registers; enlarging the assumed L1 compensates for that.
  l1 *= 4;
  #endif

  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
      kr = 8,
      mr = Traits::mr,
      nr = Traits::nr
    };
    // Increasing k gives us more time to prefetch the content of the "C"
    // registers. However once the latency is hidden there is no point in
    // increasing the value of k, so we cap it at 320 (determined experimentally).
    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
    if (k_cache < k) {
      k = k_cache - (k_cache % kr);
      eigen_internal_assert(k > 0);
    }

    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread) {
      // Don't exceed the capacity of the l2 cache.
      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
      n = n_cache - (n_cache % nr);
      eigen_internal_assert(n > 0);
    } else {
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
    }

    if (l3 > l2) {
      // l3 is shared between all cores, so we give each thread its own chunk of l3.
      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
      const Index m_per_thread = numext::div_ceil(m, num_threads);
      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
        m = m_cache - (m_cache % mr);
        eigen_internal_assert(m > 0);
      } else {
        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
      }
    }
  }
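  // Worked example (illustrative): with float scalars, a 4-wide packet
  // (e.g. SSE), Traits::mr == 12, Traits::nr == 4 and KcFactor == 1:
  //   kdiv = 12*4 + 4*4 = 64 bytes per unit of k, ksub = 12*4*4 = 192 bytes,
  // so for a 32 KB L1 the cap is min((32768-192)/64, 320) = min(509, 320) = 320,
  // already a multiple of kr == 8, hence k = 320.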
  else {
    // In unit tests we do not want to use extra large matrices,
    // so we reduce the cache size to check the blocking strategy is not flawed
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    // ... (reduced l1/l2/l3 values used by the unit tests elided) ...
#endif

    typedef typename Traits::ResScalar ResScalar;
    enum {
      k_peeling = 8,
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
    };

    // ---- 1st level of blocking on L1, yields kc ----
    // kc is chosen so that an mr x kc horizontal panel of the lhs plus a
    // kc x nr vertical panel of the rhs fit within L1, rounded down to a
    // multiple of the peeling level.
    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    const Index old_k = k;
    if(k>max_kc)
    {
      // We are really blocking on the third dimension:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the result.
      k = (k%max_kc)==0 ? max_kc
                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    }
    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864; // == 1.5 MB
#endif
    // Here, nc is chosen such that a kc x nc block of the rhs fits within half
    // of L2; the other half is implicitly reserved for the result and lhs.
    Index max_nc;
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
    {
      // L1 blocking
      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    }
    else
    {
      // L2 blocking
      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
    }
    // WARNING Below, we assume that Traits::nr is a power of two.
    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    if(n>nc)
    {
      // We are really blocking over the columns:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the packed lhs.
      n = (n%nc)==0 ? nc
                    : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
    }
    // ---- 3rd level of blocking, yields mc ----
    Index problem_size = k*n*sizeof(LhsScalar);
    Index actual_lm = actual_l2;
    Index max_mc = m;
    if(problem_size<=1024)
    {
      // problem is small enough to keep in L1
      // Let's choose m such that the lhs block fits in 1/3 of L1
      actual_lm = l1;
    }
    else if(l3!=0 && problem_size<=32768)
    {
      // we have both L2 and L3, and the problem is small enough to be kept in L2
      // Let's choose m such that the lhs block fits in 1/3 of L2
      actual_lm = l2;
      max_mc = (numext::mini<Index>)(576,max_mc);
    }
    Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
    if (mc > Traits::mr) mc -= mc % Traits::mr;
    else if (mc==0) return;
    m = (m%mc)==0 ? mc
                  : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
  }
}
template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#else
  EIGEN_UNUSED_VARIABLE(k)
  EIGEN_UNUSED_VARIABLE(m)
  EIGEN_UNUSED_VARIABLE(n)
#endif
  return false;
}

/** \brief Computes the blocking parameters for a m x k times k x n matrix product. */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
}

template<typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
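// Usage sketch (illustrative, not something the header documents itself): the
// blocking sizes Eigen would pick for, say, a 4096^3 float product can be
// inspected through this entry point:
//
//   #include <Eigen/Core>
//   Eigen::Index k = 4096, m = 4096, n = 4096;
//   Eigen::internal::computeProductBlockingSizes<float,float>(k, m, n);
//   // on return, k/m/n hold kc/mc/nc: the depth, row and column block sizes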
template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
struct RhsPanelHelper {
 private:
  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
 public:
  typedef typename conditional<(remaining_registers>=4), RhsPacketx4, RhsPacket>::type type;
};

template <typename Packet>
struct QuadPacket
{
  Packet B_0, B1, B2, B3;
  const Packet& get(const FixedInt<0>&) const { return B_0; }
  const Packet& get(const FixedInt<1>&) const { return B1; }
  const Packet& get(const FixedInt<2>&) const { return B2; }
  const Packet& get(const FixedInt<3>&) const { return B3; }
};

template <int N, typename T1, typename T2, typename T3>
struct packet_conditional { typedef T3 type; };

template <typename T1, typename T2, typename T3>
struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };

template <typename T1, typename T2, typename T3>
struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                              \
  typedef typename packet_conditional<packet_size,                                     \
                                      typename packet_traits<name ## Scalar>::type,    \
                                      typename packet_traits<name ## Scalar>::half,    \
                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  prefix ## name ## Packet

#define PACKET_DECL_COND(name, packet_size)                                             \
  typedef typename packet_conditional<packet_size,                                     \
                                      typename packet_traits<name ## Scalar>::type,    \
                                      typename packet_traits<name ## Scalar>::half,    \
                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  name ## Packet

#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size)                             \
  typedef typename packet_conditional<packet_size,                                     \
                                      typename packet_traits<Scalar>::type,            \
                                      typename packet_traits<Scalar>::half,            \
                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
  prefix ## ScalarPacket

#define PACKET_DECL_COND_SCALAR(packet_size)                                            \
  typedef typename packet_conditional<packet_size,                                     \
                                      typename packet_traits<Scalar>::type,            \
                                      typename packet_traits<Scalar>::half,            \
                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
  ScalarPacket

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits
{
  // ... (scalar/packet typedefs and most enum constants elided) ...
  enum {
    // ...
    // register block size along the M direction (currently, this one cannot be modified)
    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
    // ...
    LhsProgress = LhsPacketSize,
    // ...
  };
  // ...

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    loadRhs(b, dest);
  }
  // ...
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = ploadquad<RhsPacket>(b);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = pload<LhsPacketType>(a);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploadu<LhsPacketType>(a);
  }

  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
  {
    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
    // It would be a lot cleaner to call pmadd all the time, but unfortunately
    // not all compilers map the fused-madd path optimally.
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c = cj.pmadd(a,b,c);
#else
    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
#endif
  }

  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

  template<typename ResPacketHalf>
  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
  {
    r = pmadd(c,alpha,r);
  }
};
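// Contract sketch (illustrative, not part of the header): the micro kernels
// below drive a gebp_traits instance roughly as follows to accumulate one
// LhsProgress x 1 block of the result, with a/b pointing into packed panels:
//
//   AccPacket c; LhsPacket A0; RhsPacket B_0, T0;
//   traits.initAcc(c);                      // c = 0
//   for (Index p = 0; p < depth; ++p) {
//     traits.loadLhs(a, A0);                // aligned load from packed lhs
//     traits.loadRhs(b, B_0);               // broadcast one rhs coefficient
//     traits.madd(A0, B_0, c, T0, fix<0>);  // c += A0 * B_0 (fma if available)
//     a += LhsProgress; b += RhsProgress;
//   }
//   traits.acc(c, alphav, r);               // r += alpha * c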
template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
{
  // ... (scalar/packet typedefs elided) ...
  enum {
    // ...
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    mr = 3*LhsPacketSize,
#else
    // ...
#endif
    LhsProgress = LhsPacketSize,
    // ...
  };
  // ...

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }
  // ...
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
  {
    // FIXME we can do better!
    // what we want here is a ploadheight
    RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
    dest = ploadquad<RhsPacket>(tmp);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
  {
    eigen_internal_assert(RhsPacketSize<=8);
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploadu<LhsPacketType>(a);
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a.v,b,c.v);
#else
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }

  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }

  template <typename ResPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
  {
    conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
    r = cj.pmadd(c,alpha,r);
  }
};
template<typename Packet>
struct DoublePacket
{
  Packet first;
  Packet second;
};

template<typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Packet>& b)
{
  DoublePacket<Packet> res;
  res.first  = padd(a.first,  b.first);
  res.second = padd(a.second, b.second);
  return res;
}

// note that for DoublePacket the "4" in "downto4" is in terms of complex
// coefficients, i.e. 8 real coefficients
template<typename Packet>
const DoublePacket<Packet>&
predux_half_dowto4(const DoublePacket<Packet>& a,
                   typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
{
  return a;
}

template<typename Packet>
DoublePacket<typename unpacket_traits<Packet>::half>
predux_half_dowto4(const DoublePacket<Packet>& a,
                   typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
{
  // reduce the two halves independently
  DoublePacket<typename unpacket_traits<Packet>::half> res;
  res.first  = predux_half_dowto4(a.first);
  res.second = predux_half_dowto4(a.second);
  return res;
}

template<typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
{
  dest.first  = pset1<RealPacket>(numext::real(*b));
  dest.second = pset1<RealPacket>(numext::imag(*b));
}

template<typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
{
  // yes, that's pretty hackish :(
  typedef typename NumTraits<Scalar>::Real RealScalar;
  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
  dest.first  = ploadquad<RealPacket>(r);
  dest.second = ploadquad<RealPacket>(i);
}
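// Note (summary of the scheme used below): for complex*complex products the
// rhs coefficient b is broadcast in split form, dest.first = real(b) and
// dest.second = imag(b). madd() then accumulates a*real(b) into c.first and
// a*imag(b) into c.second, and only acc() recombines the two halves (via
// pcplxflip/pconj, according to ConjLhs/ConjRhs) into a complex result packet.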
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize>
{
  // ... (scalar/packet typedefs elided) ...
  enum {
    // ...
    LhsProgress = ResPacketSize,
    // ...
  };
  // ...

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
  {
    dest = pset1<ScalarPacket>(*b);
  }

  template<typename RealPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
  {
    dest.first  = pset1<RealPacketType>(numext::real(*b));
    dest.second = pset1<RealPacketType>(numext::imag(*b));
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
  {
    loadRhs(b, dest.B_0);
    loadRhs(b + 1, dest.B1);
    loadRhs(b + 2, dest.B2);
    loadRhs(b + 3, dest.B3);
  }

  template<typename RealPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
  {
    loadRhs(b, dest);
  }
  // ... (remaining updateRhs/loadRhsQuad/loadLhs variants elided) ...

  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
  EIGEN_STRONG_INLINE
  typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
  {
    c.first  = padd(pmul(a,b.first),  c.first);
    c.second = padd(pmul(a,b.second), c.second);
  }

  template<typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
  {
    c = cj.pmadd(a,b,c);
  }

  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  template<typename RealPacketType, typename ResPacketType>
  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
  {
    // assemble c
    ResPacketType tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacketType(c.second)));
      tmp = padd(ResPacketType(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacketType(c.second)));
      tmp = padd(ResPacketType(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacketType(c.second));
      tmp = padd(pconj(ResPacketType(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacketType(c.second));
      tmp = psub(pconj(ResPacketType(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
// lhs = real, rhs = complex
template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize>
{
  // ... (packet typedefs via the PACKET_DECL_COND_* macros elided) ...

#undef PACKET_DECL_COND_SCALAR_PREFIX
#undef PACKET_DECL_COND_PREFIX
#undef PACKET_DECL_COND_SCALAR
#undef PACKET_DECL_COND

  enum {
    // ...
    LhsProgress = ResPacketSize,
    // ...
  };
  // ...
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }
  // ...
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = ploadquad<RhsPacket>(b);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploaddup<LhsPacketType>(a);
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a,b.v,c.v);
#else
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }

  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }

  template <typename ResPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
  {
    conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
    r = cj.pmadd(alpha,c,r);
  }
};
/* optimized General packed Block * packed Panel product kernel */
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;

  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;
  typedef typename Traits::RhsPacketx4 RhsPacketx4;

  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;

  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
  typedef typename SwappedTraits::ResScalar SResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  // ... (half and quarter packet typedefs, SResPacketHalfSize/QuarterSize elided) ...
  typedef typename DataMapper::LinearMapper LinearMapper;

  enum {
    Vectorizable       = Traits::Vectorizable,
    LhsProgress        = Traits::LhsProgress,
    LhsProgressHalf    = HalfTraits::LhsProgress,
    LhsProgressQuarter = QuarterTraits::LhsProgress,
    RhsProgress        = Traits::RhsProgress,
    RhsProgressHalf    = HalfTraits::RhsProgress,
    RhsProgressQuarter = QuarterTraits::RhsProgress,
    ResPacketSize      = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
         int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
struct last_row_process_16_packets
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;

  typedef typename Traits::ResScalar ResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  EIGEN_STRONG_INLINE void operator()(const DataMapper& /*res*/, SwappedTraits& /*straits*/, const LhsScalar* /*blA*/,
                                      const RhsScalar* /*blB*/, Index /*depth*/, const Index /*endk*/, Index /*i*/, Index /*j2*/,
                                      ResScalar /*alpha*/, SAccPacket& /*C0*/)
  {
    // generic case: nothing to do, handled by the callers' other branches
  }
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16>
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;

  typedef typename Traits::ResScalar ResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
                                      const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
                                      ResScalar alpha, SAccPacket& C0)
  {
    typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
    typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
    typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
    typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;

    SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
    SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);

    if (depth - endk > 0)
    {
      // We have to handle the last row(s) of the rhs, which
      // correspond to a half-packet
      SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));

      for (Index kk = endk; kk < depth; kk++)
      {
        SLhsPacketQuarter a0;
        SRhsPacketQuarter b0;
        straits.loadLhsUnaligned(blB, a0);
        straits.loadRhs(blA, b0);
        straits.madd(a0,b0,c0,b0, fix<0>);
        blB += SwappedTraits::LhsProgress/4;
        blA += 1;
      }
      straits.acc(c0, alphav, R);
    }
    else
    {
      straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
    }
    res.scatterPacket(i, j2, R);
  }
};
// Helper class reducing code duplication between the 1 packet / half packet /
// quarter packet row paths of the gebp kernel.
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
         typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket,
         typename GEBPTraits, typename LinearMapper, typename DataMapper>
struct lhs_process_one_packet
{
  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;

  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
                                             LhsPacket* A0, RhsPacketx4* rhs_panel, RhsPacket* T0,
                                             AccPacket* C0, AccPacket* C1, AccPacket* C2, AccPacket* C3)
  {
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX4");
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
    traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
    traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
    // works around a GCC register-allocation/spilling issue
    __asm__ ("" : "+x,m" (*A0));
#endif
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX4");
  }
  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
                                      Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
                                      int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
  {
    GEBPTraits traits;

    // loops on each largest micro horizontal panel of lhs (LhsProgress x depth)
    for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
    {
      // loops on each largest micro vertical panel of rhs (depth * nr)
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        // We select a LhsProgress x nr micro block of res
        // which is entirely stored into 1 x nr registers.

        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
        prefetch(&blA[0]);

        // gets res block as register
        AccPacket C0, C1, C2, C3;
        traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
        // To improve instruction scheduling, double the accumulation registers:
        // each of them contains the accumulation of half of the product.
        AccPacket D0, D1, D2, D3;
        traits.initAcc(D0); traits.initAcc(D1); traits.initAcc(D2); traits.initAcc(D3);

        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);

        // performs "inner" products
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        prefetch(&blB[0]);
        LhsPacket A0, A1;

        for(Index k=0; k<peeled_kc; k+=pk)
        {
          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
          RhsPacketx4 rhs_panel;
          RhsPacket T0;

          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);

          blB += pk*4*RhsProgress;
          blA += pk*LhsProgress;

          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
        }
        C0 = padd(C0,D0);
        C1 = padd(C1,D1);
        C2 = padd(C2,D2);
        C3 = padd(C3,D3);

        // process remaining peeled loop
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacketx4 rhs_panel;
          RhsPacket T0;
          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          blB += 4*RhsProgress;
          blA += LhsProgress;
        }

        ResPacket R0, R1;
        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = r0.template loadPacket<ResPacket>(0);
        R1 = r1.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        r0.storePacket(0, R0);
        r1.storePacket(0, R1);

        R0 = r2.template loadPacket<ResPacket>(0);
        R1 = r3.template loadPacket<ResPacket>(0);
        traits.acc(C2, alphav, R0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0, R0);
        r3.storePacket(0, R1);
      }

      // Deal with remaining columns of the rhs
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        // One column at a time
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
        prefetch(&blA[0]);

        // gets res block as register
        AccPacket C0;
        traits.initAcc(C0);

        LinearMapper r0 = res.getLinearMapper(i, j2);

        // performs "inner" products
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        LhsPacket A0;

        for(Index k= 0; k<peeled_kc; k+=pk)
        {
          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
          RhsPacket B_0;

#define EIGEN_GEBGP_ONESTEP(K)                                              \
  do {                                                                      \
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1");  \
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");     \
    traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0);                 \
    traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                           \
    traits.madd(A0, B_0, C0, B_0, fix<0>);                                  \
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1");    \
  } while (false);

          EIGEN_GEBGP_ONESTEP(0);
          EIGEN_GEBGP_ONESTEP(1);
          EIGEN_GEBGP_ONESTEP(2);
          EIGEN_GEBGP_ONESTEP(3);
          EIGEN_GEBGP_ONESTEP(4);
          EIGEN_GEBGP_ONESTEP(5);
          EIGEN_GEBGP_ONESTEP(6);
          EIGEN_GEBGP_ONESTEP(7);

          blB += pk*RhsProgress;
          blA += pk*LhsProgress;

          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
        }

        // process remaining peeled loop
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacket B_0;
          EIGEN_GEBGP_ONESTEP(0);
          blB += RhsProgress;
          blA += LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP
        ResPacket R0;
        ResPacket alphav = pset1<ResPacket>(alpha);
        R0 = r0.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0, R0);
      }
    }
  }
};
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
         typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket,
         typename GEBPTraits, typename LinearMapper, typename DataMapper>
struct lhs_process_fraction_of_packet
  : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
{
  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
                                             LhsPacket* A0, RhsPacket* B_0, RhsPacket* B1, RhsPacket* B2, RhsPacket* B3,
                                             AccPacket* C0, AccPacket* C1, AccPacket* C2, AccPacket* C3)
  {
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
    traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
    traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
    traits.madd(*A0, *B_0, *C0, *B_0);
    traits.madd(*A0, *B1,  *C1, *B1);
    traits.madd(*A0, *B2,  *C2, *B2);
    traits.madd(*A0, *B3,  *C3, *B3);
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
  }
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
               Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  Traits traits;
  SwappedTraits straits;

  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
  const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
  const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
  const Index peeled_kc = depth & ~(pk-1);
  const int prefetch_res_offset = 32/sizeof(ResScalar);
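  // Worked example (illustrative): assume AVX-512 floats, i.e. LhsProgress==16
  // and mr==48. For rows==100:
  //   peeled_mc3        = (100/48)*48           = 96   -> rows [0,96) use the 3pX4 kernel
  //   peeled_mc2        = 96 + ((100-96)/32)*32 = 96   -> no 2p rows
  //   peeled_mc1        = 96 + (4/16)*16        = 96   -> no 1p rows
  //   peeled_mc_half    = 96 + (4/8)*8          = 96   -> no half-packet rows
  //   peeled_mc_quarter = 96 + (4/4)*4          = 100  -> rows [96,100) use the quarter kernel
  // so only the last four rows fall back to the narrower kernels.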
  //---------- Process 3 * LhsProgress rows at once ----------
  // This corresponds to 3*LhsProgress x nr register blocks; usually only
  // worthwhile with FMA.
  if(mr>=3*Traits::LhsProgress)
  {
    // Loop on each largest micro horizontal panel of the lhs
    // (3*Traits::LhsProgress x depth) and on each largest micro vertical panel
    // of the rhs (depth * nr). If depth is small, extend the panels so they
    // still fill L1.
    const Index l1 = defaultL1CacheSize; // in practice we do not query the actual cache here
    const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
    for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
    {
      const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          // We select a 3*Traits::LhsProgress x nr micro block of res
          // which is entirely stored into 3 x nr registers.

          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
          AccPacket C0, C1, C2,  C3,
                    C4, C5, C6,  C7,
                    C8, C9, C10, C11;
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2);  traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6);  traits.initAcc(C7);
          traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
          r0.prefetch(0); r1.prefetch(0); r2.prefetch(0); r3.prefetch(0);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          prefetch(&blB[0]);
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
            // 15 registers are taken (12 for acc, 2 for lhs).
            RhsPanel15 rhs_panel;
            RhsPacket T0;
            LhsPacket A2;
#if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
            // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
            // without this workaround A0, A1, and A2 are loaded in the same register,
            // which is not good for pipelining
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__  ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
#else
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
#endif
#define EIGEN_GEBP_ONESTEP(K)                                               \
  do {                                                                      \
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4");              \
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");     \
    internal::prefetch(blA + (3 * K + 16) * LhsProgress);                   \
    if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                                \
      internal::prefetch(blB + (4 * K + 16) * RhsProgress);                 \
    } /* Bug 953 */                                                         \
    traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                    \
    traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                    \
    traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                    \
    EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND                               \
    traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel);         \
    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                             \
    traits.madd(A1, rhs_panel, C4, T0, fix<0>);                             \
    traits.madd(A2, rhs_panel, C8, T0, fix<0>);                             \
    traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel);       \
    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                             \
    traits.madd(A1, rhs_panel, C5, T0, fix<1>);                             \
    traits.madd(A2, rhs_panel, C9, T0, fix<1>);                             \
    traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel);       \
    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                             \
    traits.madd(A1, rhs_panel, C6, T0, fix<2>);                             \
    traits.madd(A2, rhs_panel, C10, T0, fix<2>);                            \
    traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel);       \
    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                             \
    traits.madd(A1, rhs_panel, C7, T0, fix<3>);                             \
    traits.madd(A2, rhs_panel, C11, T0, fix<3>);                            \
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");                \
  } while (false)

            internal::prefetch(blB);
            EIGEN_GEBP_ONESTEP(0);
            EIGEN_GEBP_ONESTEP(1);
            EIGEN_GEBP_ONESTEP(2);
            EIGEN_GEBP_ONESTEP(3);
            EIGEN_GEBP_ONESTEP(4);
            EIGEN_GEBP_ONESTEP(5);
            EIGEN_GEBP_ONESTEP(6);
            EIGEN_GEBP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPanel15 rhs_panel;
            RhsPacket T0;
            LhsPacket A2;
            EIGEN_GEBP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 3*Traits::LhsProgress;
          }

#undef EIGEN_GEBP_ONESTEP

          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C2,  alphav, R0);
          traits.acc(C6,  alphav, R1);
          traits.acc(C10, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C3,  alphav, R0);
          traits.acc(C7,  alphav, R1);
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }

      // Deal with remaining columns of the rhs
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          // One column at a time
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
          AccPacket C0, C4, C8;
          traits.initAcc(C0);
          traits.initAcc(C4);
          traits.initAcc(C8);

          LinearMapper r0 = res.getLinearMapper(i, j2);
          r0.prefetch(0);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsPacket A0, A1, A2;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
            RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K)                                              \
  do {                                                                      \
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");              \
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");     \
    traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                    \
    traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                    \
    traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                    \
    traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                       \
    traits.madd(A0, B_0, C0, B_0, fix<0>);                                  \
    traits.madd(A1, B_0, C4, B_0, fix<0>);                                  \
    traits.madd(A2, B_0, C8, B_0, fix<0>);                                  \
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");                \
  } while (false)

            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += int(pk) * int(RhsProgress);
            blA += int(pk) * 3 * int(Traits::LhsProgress);

            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
          }

          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += RhsProgress;
            blA += 3*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
    }
  }
  //---------- Process 2 * LhsProgress rows at once ----------
  if(mr>=2*Traits::LhsProgress)
  {
    const Index l1 = defaultL1CacheSize; // in practice we do not query the actual cache here
    Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));

    for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
    {
      Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          // We select a 2*Traits::LhsProgress x nr micro block of res
          // which is entirely stored into 2 x nr registers.

          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
          AccPacket C0, C1, C2, C3,
                    C4, C5, C6, C7;
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          prefetch(&blB[0]);
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
            RhsPacketx4 rhs_panel;
            RhsPacket T0;

#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__  ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
#else
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
#endif
#define EIGEN_GEBGP_ONESTEP(K)                                      \
  do {                                                              \
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");      \
    traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);            \
    traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);            \
    traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel);     \
    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                     \
    traits.madd(A1, rhs_panel, C4, T0, fix<0>);                     \
    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                     \
    traits.madd(A1, rhs_panel, C5, T0, fix<1>);                     \
    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                     \
    traits.madd(A1, rhs_panel, C6, T0, fix<2>);                     \
    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                     \
    traits.madd(A1, rhs_panel, C7, T0, fix<3>);                     \
    EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                             \
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");        \
  } while (false)

            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*(2*Traits::LhsProgress);

            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacketx4 rhs_panel;
            RhsPacket T0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1, R2, R3;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C1, alphav, R2);
          traits.acc(C5, alphav, R3);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(0 * Traits::ResPacketSize, R2);
          r1.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C7, alphav, R3);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
        }
      }

      // Deal with remaining columns of the rhs
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          // One column at a time
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
          AccPacket C0, C4;
          traits.initAcc(C0);
          traits.initAcc(C4);

          LinearMapper r0 = res.getLinearMapper(i, j2);
          r0.prefetch(prefetch_res_offset);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
            RhsPacket B_0, B1;

#define EIGEN_GEBGP_ONESTEP(K)                                              \
  do {                                                                      \
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");              \
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");     \
    traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                          \
    traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                          \
    traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                           \
    traits.madd(A0, B_0, C0, B1, fix<0>);                                   \
    traits.madd(A1, B_0, C4, B_0, fix<0>);                                  \
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");                \
  } while (false)

            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += int(pk) * int(RhsProgress);
            blA += int(pk) * 2 * int(Traits::LhsProgress);

            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
          }

          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, B1;
            EIGEN_GEBGP_ONESTEP(0);
            blB += RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP
          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
        }
      }
    }
  }
  //---------- Process 1 * LhsProgress rows at once ----------
  if(mr>=1*Traits::LhsProgress)
  {
    lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
    p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
  }
  //---------- Process LhsProgressHalf rows at once ----------
  if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
  {
    lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
    p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
  }
  //---------- Process LhsProgressQuarter rows at once ----------
  if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
  {
    lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
    p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
  }
  //---------- Process remaining rows, 1 at a time ----------
  if(peeled_mc_quarter<rows)
  {
    // loop on each panel of the rhs
    for(Index j2=0; j2<packet_cols4; j2+=nr)
    {
      // loop on each row of the lhs (1*LhsProgress x depth)
      for(Index i=peeled_mc_quarter; i<rows; i+=1)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

        // If LhsProgress is 8 or 16, this assumes there is a half or quarter
        // packet, respectively, of the same size as nr (currently 4) for the
        // return type.
        if ((SwappedTraits::LhsProgress % 4) == 0 &&
            (SwappedTraits::LhsProgress<=16) &&
            (SwappedTraits::LhsProgress!=8  || SResPacketHalfSize==nr) &&
            (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
        {
          SAccPacket C0, C1, C2, C3;
          straits.initAcc(C0);
          straits.initAcc(C1);
          straits.initAcc(C2);
          straits.initAcc(C3);

          const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
          const Index endk  = (depth/spk)*spk;
          const Index endk4 = (depth/(spk*4))*(spk*4);

          Index k=0;
          for(; k<endk4; k+=4*spk)
          {
            SLhsPacket A0,A1;
            SRhsPacket B_0,B_1;

            straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);

            straits.loadRhsQuad(blA+0*spk, B_0);
            straits.loadRhsQuad(blA+1*spk, B_1);
            straits.madd(A0,B_0,C0,B_0, fix<0>);
            straits.madd(A1,B_1,C1,B_1, fix<0>);

            straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
            straits.loadRhsQuad(blA+2*spk, B_0);
            straits.loadRhsQuad(blA+3*spk, B_1);
            straits.madd(A0,B_0,C2,B_0, fix<0>);
            straits.madd(A1,B_1,C3,B_1, fix<0>);

            blB += 4*SwappedTraits::LhsProgress;
            blA += 4*spk;
          }
          C0 = padd(padd(C0,C1),padd(C2,C3));
          for(; k<endk; k+=spk)
          {
            SLhsPacket A0;
            SRhsPacket B_0;

            straits.loadLhsUnaligned(blB, A0);
            straits.loadRhsQuad(blA, B_0);
            straits.madd(A0,B_0,C0,B_0, fix<0>);

            blB += SwappedTraits::LhsProgress;
            blA += spk;
          }
          if(SwappedTraits::LhsProgress==8)
          {
            // Special case where we have to first reduce the accumulation register C0
            SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
            SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);

            if(depth-endk>0)
            {
              // We have to handle the last row of the rhs which corresponds to a half-packet
              SLhsPacketHalf a0;
              SRhsPacketHalf b0;
              straits.loadLhsUnaligned(blB, a0);
              straits.loadRhs(blA, b0);
              SAccPacketHalf c0 = predux_half_dowto4(C0);
              straits.madd(a0,b0,c0,b0, fix<0>);
              straits.acc(c0, alphav, R);
            }
            else
            {
              straits.acc(predux_half_dowto4(C0), alphav, R);
            }
            res.scatterPacket(i, j2, R);
          }
          else if (SwappedTraits::LhsProgress==16)
          {
            // Special case where we have to first reduce the accumulation
            // register C0; handled in template form so that LhsProgress < 16
            // paths still compile.
            last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
            p(res, straits, blA, blB, depth, endk, i, j2, alpha, C0);
          }
          else
          {
            SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
            SResPacket alphav = pset1<SResPacket>(alpha);
            straits.acc(C0, alphav, R);
            res.scatterPacket(i, j2, R);
          }
        }
        else // scalar path
        {
          // get a 1 x 4 res block as registers
          ResScalar C0(0), C1(0), C2(0), C3(0);

          for(Index k=0; k<depth; k++)
          {
            LhsScalar A0;
            RhsScalar B_0, B_1;

            A0 = blA[k];

            B_0 = blB[0];
            B_1 = blB[1];
            C0 = cj.pmadd(A0,B_0,C0);
            C1 = cj.pmadd(A0,B_1,C1);

            B_0 = blB[2];
            B_1 = blB[3];
            C2 = cj.pmadd(A0,B_0,C2);
            C3 = cj.pmadd(A0,B_1,C3);

            blB += 4;
          }
          res(i, j2 + 0) += alpha * C0;
          res(i, j2 + 1) += alpha * C1;
          res(i, j2 + 2) += alpha * C2;
          res(i, j2 + 3) += alpha * C3;
        }
      }
    }
    // remaining columns
    for(Index j2=packet_cols4; j2<cols; j2++)
    {
      // loop on each row of the lhs (1*LhsProgress x depth)
      for(Index i=peeled_mc_quarter; i<rows; i+=1)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);
        // gets a 1 x 1 res block as registers
        ResScalar C0(0);
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<depth; k++)
        {
          LhsScalar A0 = blA[k];
          RhsScalar B_0 = blB[k];
          C0 = cj.pmadd(A0, B_0, C0);
        }
        res(i, j2) += alpha * C0;
      }
    }
  }
}
// Lhs packing, optimized for column major lhs
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename unpacket_traits<Packet>::half HalfPacket;
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
  enum { PacketSize = unpacket_traits<Packet>::size,
         HalfPacketSize = unpacket_traits<HalfPacket>::size,
         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
         HasHalf = (int)HalfPacketSize < (int)PacketSize,
         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index count = 0;

  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
  const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
  const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
  const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
  const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
                         : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;

  Index i=0;

  // Pack 3 packets
  if(Pack1>=3*PacketSize)
  {
    for(; i<peeled_mc3; i+=3*PacketSize)
    {
      if(PanelMode) count += (3*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A, B, C;
        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
        C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
      }
      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack 2 packets
  if(Pack1>=2*PacketSize)
  {
    for(; i<peeled_mc2; i+=2*PacketSize)
    {
      if(PanelMode) count += (2*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A, B;
        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
      }
      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack 1 packets
  if(Pack1>=1*PacketSize)
  {
    for(; i<peeled_mc1; i+=1*PacketSize)
    {
      if(PanelMode) count += (1*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A;
        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
        pstore(blockA+count, cj.pconj(A));
        count+=PacketSize;
      }
      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack half packets
  if(HasHalf && Pack1>=HalfPacketSize)
  {
    for(; i<peeled_mc_half; i+=HalfPacketSize)
    {
      if(PanelMode) count += (HalfPacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        HalfPacket A;
        A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
        pstoreu(blockA+count, cj.pconj(A));
        count+=HalfPacketSize;
      }
      if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
    }
  }
  // Pack quarter packets
  if(HasQuarter && Pack1>=QuarterPacketSize)
  {
    for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
    {
      if(PanelMode) count += (QuarterPacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        QuarterPacket A;
        A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
        pstoreu(blockA+count, cj.pconj(A));
        count+=QuarterPacketSize;
      }
      if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
    }
  }
  // Pack2 may be *smaller* than PacketSize -- that happens for products like
  // real * complex, where we have to go half the progress on the lhs.
  // In these cases, Pack2 is usually 1.
  if(Pack2<PacketSize && Pack2>1)
  {
    for(; i<peeled_mc0; i+=last_lhs_progress)
    {
      if(PanelMode) count += last_lhs_progress * offset;

      for(Index k=0; k<depth; k++)
        for(Index w=0; w<last_lhs_progress; w++)
          blockA[count++] = cj(lhs(i+w, k));

      if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
    }
  }
  // Pack scalars
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
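// Layout sketch (illustrative): with Pack1==2*PacketSize and PacketSize==4,
// one packed panel stores rows [i, i+8) as
//   A(i+0..i+3, k), A(i+4..i+7, k), A(i+0..i+3, k+1), A(i+4..i+7, k+1), ...
// i.e. PacketSize-wide column fragments, interleaved per k so the kernel can
// stream them with aligned pstore/pload accesses.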
// Lhs packing, optimized for row major lhs
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename unpacket_traits<Packet>::half HalfPacket;
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
  enum { PacketSize = unpacket_traits<Packet>::size,
         HalfPacketSize = unpacket_traits<HalfPacket>::size,
         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
         HasHalf = (int)HalfPacketSize < (int)PacketSize,
         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index count = 0;
  bool gone_half = false, gone_quarter = false, gone_last = false;

  Index i = 0;
  int pack = Pack1;
  int psize = PacketSize;
  while(pack>0)
  {
    Index remaining_rows = rows-i;
    Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
    Index starting_pos = i;
    for(; i<peeled_mc; i+=pack)
    {
      if(PanelMode) count += pack * offset;

      Index k=0;
      if(pack>=psize && psize >= QuarterPacketSize)
      {
        const Index peeled_k = (depth/psize)*psize;
        for(; k<peeled_k; k+=psize)
        {
          for (Index m = 0; m < pack; m += psize)
          {
            if (psize == PacketSize) {
              PacketBlock<Packet> kernel;
              for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
              ptranspose(kernel);
              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
            } else if (HasHalf && psize == HalfPacketSize) {
              gone_half = true;
              PacketBlock<HalfPacket> kernel_half;
              for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
              ptranspose(kernel_half);
              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
            } else if (HasQuarter && psize == QuarterPacketSize) {
              gone_quarter = true;
              PacketBlock<QuarterPacket> kernel_quarter;
              for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
              ptranspose(kernel_quarter);
              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
            }
          }
          count += psize*pack;
        }
      }

      for(; k<depth; k++)
      {
        Index w=0;
        for(; w<pack-3; w+=4)
        {
          Scalar a(cj(lhs(i+w+0, k))),
                 b(cj(lhs(i+w+1, k))),
                 c(cj(lhs(i+w+2, k))),
                 d(cj(lhs(i+w+3, k)));
          blockA[count++] = a;
          blockA[count++] = b;
          blockA[count++] = c;
          blockA[count++] = d;
        }
        for(; w<pack; ++w)
          blockA[count++] = cj(lhs(i+w, k));
      }

      if(PanelMode) count += pack * (stride-offset-depth);
    }

    pack -= psize;
    Index left = rows - i;
    if (pack <= 0) {
      if (!gone_last &&
          (starting_pos == i || left >= psize/2 || left >= psize/4) &&
          ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
           (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
        psize /= 2;
        pack = psize;
        continue;
      }
      // Pack2 may be *smaller* than PacketSize -- that happens for products
      // like real * complex, where we have to go half the progress on the lhs.
      if (Pack2 < PacketSize && !gone_last) {
        gone_last = true;
        psize = pack = left & ~1;
      }
    }
  }

  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
// copy a complete panel of the rhs
// this version is optimized for column major matrices
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;
  const Index peeled_k = (depth/PacketSize)*PacketSize;
  // ... (nr>=8 path elided) ...
  if(nr>=4)
  {
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

      Index k=0;
      if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2??
      {
        for(; k<peeled_k; k+=PacketSize) {
          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
          kernel.packet[0           ] = dm0.template loadPacket<Packet>(k);
          kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
          kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
          kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
          ptranspose(kernel);
          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
          count+=4*PacketSize;
        }
      }
      for(; k<depth; k++)
      {
        blockB[count+0] = cj(dm0(k));
        blockB[count+1] = cj(dm1(k));
        blockB[count+2] = cj(dm2(k));
        blockB[count+3] = cj(dm3(k));
        count += 4;
      }
      // skip what we have after
      if(PanelMode) count += 4 * (stride-offset-depth);
    }
  }

  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(dm0(k));
      count += 1;
    }
    if(PanelMode) count += (stride-offset-depth);
  }
}
// this version is optimized for row major matrices
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename unpacket_traits<Packet>::half HalfPacket;
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size,
         HalfPacketSize = unpacket_traits<HalfPacket>::size,
         QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
  {
    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
    EIGEN_UNUSED_VARIABLE(stride);
    EIGEN_UNUSED_VARIABLE(offset);
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
    Index count = 0;
    // ... (nr>=8 path elided) ...
    if(nr>=4)
    {
      for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
      {
        // skip what we have before
        if(PanelMode) count += 4 * offset;
        for(Index k=0; k<depth; k++)
        {
          if (PacketSize==4) {
            Packet A = rhs.template loadPacket<Packet>(k, j2);
            pstoreu(blockB+count, cj.pconj(A));
            count += PacketSize;
          } else if (HasHalf && HalfPacketSize==4) {
            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
            pstoreu(blockB+count, cj.pconj(A));
            count += HalfPacketSize;
          } else if (HasQuarter && QuarterPacketSize==4) {
            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
            pstoreu(blockB+count, cj.pconj(A));
            count += QuarterPacketSize;
          } else {
            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
            blockB[count+0] = cj(dm0(0));
            blockB[count+1] = cj(dm0(1));
            blockB[count+2] = cj(dm0(2));
            blockB[count+3] = cj(dm0(3));
            count += 4;
          }
        }
        // skip what we have after
        if(PanelMode) count += 4 * (stride-offset-depth);
      }
    }
    // copy the remaining columns one at a time (nr==1)
    for(Index j2=packet_cols4; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(rhs(k, j2));
        count += 1;
      }
      if(PanelMode) count += stride-offset-depth;
    }
  }
};
} // end namespace internal

/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l2;
}

/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l3CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l3;
}

/** Set the cpu L1, L2 and L3 cache sizes (in bytes).
  * These values are used to adjust the size of the blocks
  * for the algorithms working per blocks.
  * \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
}

} // end namespace Eigen
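// Usage sketch (illustrative, not part of the header's own docs): the cache
// sizes driving the blocking heuristic can be inspected and overridden at
// runtime, e.g.
//
//   std::ptrdiff_t l1 = Eigen::l1CacheSize();            // current L1 estimate, in bytes
//   Eigen::setCpuCacheSizes(32*1024, 512*1024, 4*1024*1024);
//
// Subsequent calls to computeProductBlockingSizes() will then use the new values.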
#endif // EIGEN_GENERAL_BLOCK_PANEL_H