#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H
template<typename _LhsScalar, typename _RhsScalar,
         bool _ConjLhs=false, bool _ConjRhs=false,
         int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
class gebp_traits;
#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
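// Note: a user may override the per-architecture defaults below by defining,
// e.g., EIGEN_DEFAULT_L2_CACHE_SIZE before including Eigen; in that case
// EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) ignores val and uses the override.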
#if EIGEN_ARCH_i386_OR_x86_64
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
#endif
#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;

  // Query the cache sizes of the host machine.
  std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
#ifdef EIGEN_VECTORIZE_AVX512
  // With AVX512's large packets, a tiny kc (needed to keep one lhs panel in L1)
  // hurts performance; enlarge the effective L1 size to compensate.
  l1 *= 4;
#endif
  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
      kr = 8,
      mr = Traits::mr,
      nr = Traits::nr
    };
    // Increasing k gives more time to prefetch the content of the "C"
    // registers, but past the point where the latency is hidden a larger k
    // buys nothing, so cap it at 320.
    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
    if (k_cache < k) {
      k = k_cache - (k_cache % kr);
      eigen_internal_assert(k > 0);
    }
    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread) {
      // Don't exceed the capacity of the l2 cache.
      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
      n = n_cache - (n_cache % nr);
      eigen_internal_assert(n > 0);
    } else {
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
    }
    if (l3 > l2) {
      // l3 is shared among all cores, so give each thread its own chunk of l3.
      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
      const Index m_per_thread = numext::div_ceil(m, num_threads);
      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
        m = m_cache - (m_cache % mr);
        eigen_internal_assert(m > 0);
      } else {
        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
      }
    }
  }
  else {
    // In unit tests, shrink the cache sizes to stress the blocking strategy.
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    l1 = 9*1024; l2 = 32*1024; l3 = 512*1024;
#endif
    typedef typename Traits::ResScalar ResScalar;
    enum {
      k_peeling = 8,
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
    };
    // ---- 1st level of blocking on L1, yields kc ----
    // kc is chosen so that an mr x kc panel of the lhs plus a kc x nr panel of
    // the rhs both fit in L1, rounded down to a multiple of the peeling factor.
    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    const Index old_k = k;
    if(k > max_kc)
    {
      // We are really blocking on the third dimension:
      // -> reduce blocking size to make sure the last block is as large as
      //    possible while keeping the same number of sweeps over the result.
      k = (k%max_kc)==0 ? max_kc
        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    }
    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864; // 1.5 MB
#endif
    // A kc x nc block of the rhs should fit within half of L2; the other half
    // is left for the result and the lhs coefficients.
    Index max_nc;
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
    {
      // L1 blocking
      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    }
    else
    {
      // L2 blocking
      max_nc = (3*actual_l2) / (2*2*max_kc*sizeof(RhsScalar));
    }
    // WARNING Below, we assume that Traits::nr is a power of two.
    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    if(n > nc)
    {
      // We are really blocking over the columns:
      // -> reduce blocking size to make sure the last block is as large as
      //    possible while keeping the same number of sweeps over the packed lhs.
      n = (n%nc)==0 ? nc
                    : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
    }
    // ---- 3rd level of blocking, yields mc ----
    Index problem_size = k*n*sizeof(LhsScalar);
    Index actual_lm = actual_l2;
    Index max_mc = m;
    if(problem_size <= 1024)
    {
      // Problem is small enough to be kept in L1: choose m so that the lhs
      // block fits in 1/3 of L1.
      actual_lm = l1;
    }
    else if(l3!=0 && problem_size <= 32768)
    {
      // We have both L2 and L3, and the problem is small enough to be kept in
      // L2: choose m so that the lhs block fits in 1/3 of L2.
      actual_lm = l2;
      max_mc = (numext::mini<Index>)(576, max_mc);
    }
    Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
    if (mc > Traits::mr) mc -= mc % Traits::mr;
    else if (mc==0) return;
    m = (m%mc)==0 ? mc
                  : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
  }
}
template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#else
  EIGEN_UNUSED_VARIABLE(k)
  EIGEN_UNUSED_VARIABLE(m)
  EIGEN_UNUSED_VARIABLE(n)
#endif
  return false;
}
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
}
template<typename LhsScalar, typename RhsScalar, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
// Selects RhsPacketx4 when enough SIMD registers remain, RhsPacket otherwise.
template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
struct RhsPanelHelper {
 private:
  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
 public:
  typedef typename conditional<(remaining_registers>=4), RhsPacketx4, RhsPacket>::type type;
};

// Bundles four rhs packets; get(fix<N>) returns the N-th one at compile time.
template <typename Packet>
struct QuadPacket {
  Packet B_0, B1, B2, B3;
  const Packet& get(const FixedInt<0>&) const { return B_0; }
  const Packet& get(const FixedInt<1>&) const { return B1; }
  const Packet& get(const FixedInt<2>&) const { return B2; }
  const Packet& get(const FixedInt<3>&) const { return B3; }
};
// Primary template and specializations selecting the full, half, or quarter
// packet type according to the requested GEBP packet size.
template <int N, typename T1, typename T2, typename T3>
struct packet_conditional { typedef T3 type; };

template <typename T1, typename T2, typename T3>
struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };

template <typename T1, typename T2, typename T3>
struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                \
  typedef typename packet_conditional<packet_size,                       \
                                      typename packet_traits<name ## Scalar>::type, \
                                      typename packet_traits<name ## Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  prefix ## name ## Packet
#define PACKET_DECL_COND(name, packet_size)                               \
  typedef typename packet_conditional<packet_size,                       \
                                      typename packet_traits<name ## Scalar>::type, \
                                      typename packet_traits<name ## Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  name ## Packet
#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size)               \
  typedef typename packet_conditional<packet_size,                       \
                                      typename packet_traits<Scalar>::type, \
                                      typename packet_traits<Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
  prefix ## ScalarPacket
#define PACKET_DECL_COND_SCALAR(packet_size)                              \
  typedef typename packet_conditional<packet_size,                       \
                                      typename packet_traits<Scalar>::type, \
                                      typename packet_traits<Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
  ScalarPacket
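// Illustrative expansion: PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize)
// typedefs _LhsPacket to the full packet type of LhsScalar, to its half, or to
// its quarter, according to whether _PacketSize selects GEBPPacketFull,
// GEBPPacketHalf, or the quarter variant.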
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits
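  // Overview of the traits contract used by the kernels below: loadLhs/loadRhs
  // move packed coefficients into SIMD registers, madd performs c += a*b (a
  // single fused instruction when EIGEN_HAS_SINGLE_INSTRUCTION_MADD is
  // defined), and acc merges an accumulator into the result scaled by alpha.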
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketType&) const
  {
    // nothing to do: the whole rhs panel was already loaded by loadRhs
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = ploadquad<RhsPacket>(b);
  }
  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = pload<LhsPacketType>(a);
  }
  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploadu<LhsPacketType>(a);
  }
  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
  {
    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c = cj.pmadd(a,b,c);
#else
    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
#endif
  }
  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }
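  // The LaneIdType argument (fix<0>..fix<3>) statically selects which of the
  // four broadcast rhs coefficients of a RhsPacketx4 enters the multiply-add,
  // avoiding any runtime lane indexing.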
  template<typename ResPacketHalf>
  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
  {
    r = pmadd(c,alpha,r);
  }
template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
  {
    // duplicate the two rhs coefficients pairwise before the quad-load
    RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
    dest = ploadquad<RhsPacket>(tmp);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
  {
    dest = pset1<RhsPacket>(*b);
  }
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }
  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploadu<LhsPacketType>(a);
  }
  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a.v,b,c.v);
#else
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
#endif
  }
  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }
  template <typename ResPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
  {
    conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
    r = cj.pmadd(c,alpha,r);
  }
template<typename Packet>
struct DoublePacket
{
  Packet first;   // accumulates the real-part products
  Packet second;  // accumulates the imaginary-part products
};

template<typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
{
  DoublePacket<Packet> res;
  res.first  = padd(a.first, b.first);
  res.second = padd(a.second, b.second);
  return res;
}

template<typename Packet>
const DoublePacket<Packet>&
predux_half_dowto4(const DoublePacket<Packet> &a,
                   typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
{
  return a;
}

template<typename Packet>
DoublePacket<typename unpacket_traits<Packet>::half>
predux_half_dowto4(const DoublePacket<Packet> &a,
                   typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
template<typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
{
  dest.first  = pset1<RealPacket>(numext::real(*b));
  dest.second = pset1<RealPacket>(numext::imag(*b));
}

template<typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
{
  // duplicate the two complex coefficients pairwise into the real and
  // imaginary quad-packets
  typedef typename NumTraits<Scalar>::Real RealScalar;
  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
  dest.first  = ploadquad<RealPacket>(r);
  dest.second = ploadquad<RealPacket>(i);
}
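// How the split works (sketch): with a = x+iy interleaved in a real packet and
// b = c+id broadcast into dest.first = (c,c,...) and dest.second = (d,d,...),
// the kernel accumulates a*first = (xc,yc,...) and a*second = (xd,yd,...);
// the complex result (xc-yd) + i(yc+xd) is only assembled once per block, in
// the acc() member of the complex traits below.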
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
  {
    dest = pset1<ScalarPacket>(*b);
  }
  template<typename RealPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
  {
    dest.first  = pset1<RealPacketType>(numext::real(*b));
    dest.second = pset1<RealPacketType>(numext::imag(*b));
  }

  template<typename RealPacketType>
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
  {
    loadQuadToDoublePacket(b, dest);
  }
  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = pload<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
  }
  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
  {
    c.first  = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }
  template<typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
  {
    c = cj.pmadd(a,b,c);
  }
  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }
  template<typename RealPacketType, typename ResPacketType>
  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
  {
    // assemble c
    ResPacketType tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacketType(c.second)));
      tmp = padd(ResPacketType(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacketType(c.second)));
      tmp = padd(ResPacketType(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacketType(c.second));
      tmp = padd(pconj(ResPacketType(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacketType(c.second));
      tmp = psub(pconj(ResPacketType(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }
template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize>
#undef PACKET_DECL_COND_SCALAR_PREFIX
#undef PACKET_DECL_COND_PREFIX
#undef PACKET_DECL_COND_SCALAR
#undef PACKET_DECL_COND
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = ploadquad<RhsPacket>(b);
  }
  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploaddup<LhsPacketType>(a);
  }
  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a,b.v,c.v);
#else
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
#endif
  }
  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }
  template <typename ResPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
  {
    conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
    r = cj.pmadd(alpha,c,r);
  }
/* optimized general packed block * packed panel product kernel */
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  EIGEN_DONT_INLINE
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
         bool ConjugateLhs, bool ConjugateRhs,
         int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
struct last_row_process_16_packets
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
         bool ConjugateLhs, bool ConjugateRhs>
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16>
      SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
      SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);

      if (depth - endk > 0)
      {
        // handle the last row(s) of the rhs, which correspond to a quarter-packet
        SLhsPacketQuarter a0;
        SRhsPacketQuarter b0;
        for (Index kk = endk; kk < depth; kk++)
        {
          straits.loadLhsUnaligned(blB, a0);
          straits.loadRhs(blA, b0);
          straits.madd(a0,b0,c0,b0, fix<0>);
          blB += SwappedTraits::LhsProgress/4;
          blA += 1;
        }
        straits.acc(c0, alphav, R);
      }
      res.scatterPacket(i, j2, R);
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
         typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket,
         typename GEBPTraits, typename LinearMapper, typename DataMapper>
struct lhs_process_one_packet
  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;

  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB,
                                             GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel,
                                             RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
  {
    // One k-step of a 1-packet x 4-columns micro kernel: one lhs packet is
    // multiplied against four broadcast rhs coefficients.
    traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
    traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
    // work around a GCC register-allocation issue by pinning A0
    __asm__ ("" : "+x,m" (*A0));
#endif
  }
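  // Each step consumes one lhs packet against a panel of four rhs
  // coefficients, updating four independent accumulators; the caller
  // interleaves two such streams (the C and D accumulators below) to hide
  // the multiply-add latency.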
  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
                                      Index peelStart, Index peelEnd, Index strideA, Index strideB,
                                      Index offsetA, Index offsetB, int prefetch_res_offset,
                                      Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
  {
    for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
    {
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
        AccPacket C0, C1, C2, C3;
        AccPacket D0, D1, D2, D3;
        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        for(Index k=0; k<peeled_kc; k+=pk)
        {
          RhsPacketx4 rhs_panel;
          RhsPacket T0;
          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          blB += pk*4*RhsProgress;
          blA += pk*LhsProgress;
        }
        C0 = padd(C0,D0);
        C1 = padd(C1,D1);
        C2 = padd(C2,D2);
        C3 = padd(C3,D3);
        // process the remaining (non-peeled) k steps
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacketx4 rhs_panel;
          RhsPacket T0;
          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          blB += 4*RhsProgress;
          blA += LhsProgress;
        }
        ResPacket R0, R1;
        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = r0.template loadPacket<ResPacket>(0);
        R1 = r1.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        r0.storePacket(0, R0);
        r1.storePacket(0, R1);

        R0 = r2.template loadPacket<ResPacket>(0);
        R1 = r3.template loadPacket<ResPacket>(0);
        traits.acc(C2, alphav, R0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0, R0);
        r3.storePacket(0, R1);
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
        LinearMapper r0 = res.getLinearMapper(i, j2);

        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<peeled_kc; k+=pk)
        {
#define EIGEN_GEBGP_ONESTEP(K)                                                        \
          do {                                                                        \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1");    \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");       \
            traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0);                   \
            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                             \
            traits.madd(A0, B_0, C0, B_0, fix<0>);                                    \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1");      \
          } while(false);
          blB += pk*RhsProgress;
          blA += pk*LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP
        ResPacket alphav = pset1<ResPacket>(alpha);
        R0 = r0.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0, R0);
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
         typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket,
         typename GEBPTraits, typename LinearMapper, typename DataMapper>
struct lhs_process_fraction_of_packet
  : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB,
                                             GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1,
                                             RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1,
                                             AccPacket *C2, AccPacket *C3)
  {
    traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
    traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
    traits.madd(*A0, *B_0, *C0, *B_0, fix<0>);
    traits.madd(*A0, *B1,  *C1, *B1,  fix<0>);
    traits.madd(*A0, *B2,  *C2, *B2,  fix<0>);
    traits.madd(*A0, *B3,  *C3, *B3,  fix<0>);
  }
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
               Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
  const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
  const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;

  const int prefetch_res_offset = 32/sizeof(ResScalar);
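  // The rows are processed by a cascade of progressively narrower kernels:
  // 3*LhsProgress, 2*LhsProgress, 1*LhsProgress, then half and quarter
  // packets, and finally one scalar row at a time, so each row is handled by
  // the widest kernel that still fits.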
  //---------- Process 3 * LhsProgress rows at once ----------
  if(mr>=3*Traits::LhsProgress)
  {
    const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
    for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
    {
      const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];

          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          for(Index k=0; k<peeled_kc; k+=pk)
          {
#if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
// GCC < 9 on arm64 mis-allocates registers in the loop below; this empty asm
// pins A0..A2 and avoids catastrophic spilling.
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
#else
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
#endif
#define EIGEN_GEBP_ONESTEP(K)                                                 \
            do {                                                              \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4");      \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              internal::prefetch(blA + (3 * K + 16) * LhsProgress);           \
              if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                        \
                internal::prefetch(blB + (4 * K + 16) * RhsProgress);         \
              }                                                               \
              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);            \
              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);            \
              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);            \
              EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND                       \
              traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                     \
              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                     \
              traits.madd(A2, rhs_panel, C8, T0, fix<0>);                     \
              traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                     \
              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                     \
              traits.madd(A2, rhs_panel, C9, T0, fix<1>);                     \
              traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                     \
              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                     \
              traits.madd(A2, rhs_panel, C10, T0, fix<2>);                    \
              traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                     \
              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                     \
              traits.madd(A2, rhs_panel, C11, T0, fix<3>);                    \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");        \
            } while(false)
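          // Register budget of one 3pX4 step (sketch): 12 accumulators
          // (C0..C11) and 3 lhs packets (A0..A2) plus the rhs panel occupy
          // almost the entire SIMD register file, hence the register
          // allocation workaround above.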
          blB += pk*4*RhsProgress;
          blA += pk*3*Traits::LhsProgress;

          blB += 4*RhsProgress;
          blA += 3*Traits::LhsProgress;
#undef EIGEN_GEBP_ONESTEP
          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);
          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);
          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C10, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);
          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C3, alphav, R0);
          traits.acc(C7, alphav, R1);
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];

          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          for(Index k=0; k<peeled_kc; k+=pk)
          {
#define EIGEN_GEBGP_ONESTEP(K)                                                \
            do {                                                              \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");      \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);            \
              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);            \
              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);            \
              traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);               \
              traits.madd(A0, B_0, C0, B_0, fix<0>);                          \
              traits.madd(A1, B_0, C4, B_0, fix<0>);                          \
              traits.madd(A2, B_0, C8, B_0, fix<0>);                          \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");        \
            } while(false)
          blB += int(pk) * int(RhsProgress);
          blA += int(pk) * 3 * int(Traits::LhsProgress);

          blA += 3*Traits::LhsProgress;
#undef EIGEN_GEBGP_ONESTEP
          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);
  //---------- Process 2 * LhsProgress rows at once ----------
  if(mr>=2*Traits::LhsProgress)
  {
    Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
    for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
    {
      Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);

          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          for(Index k=0; k<peeled_kc; k+=pk)
          {
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
#else
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
#endif
#define EIGEN_GEBGP_ONESTEP(K)                                                \
            do {                                                              \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");      \
              traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);            \
              traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);            \
              traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel);     \
              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                     \
              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                     \
              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                     \
              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                     \
              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                     \
              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                     \
              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                     \
              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                     \
              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                             \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");        \
            } while(false)
          blB += pk*4*RhsProgress;
          blA += pk*(2*Traits::LhsProgress);

          blB += 4*RhsProgress;
          blA += 2*Traits::LhsProgress;
#undef EIGEN_GEBGP_ONESTEP
          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C1, alphav, R2);
          traits.acc(C5, alphav, R3);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(0 * Traits::ResPacketSize, R2);
          r1.storePacket(1 * Traits::ResPacketSize, R3);
          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C7, alphav, R3);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];

          r0.prefetch(prefetch_res_offset);

          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          for(Index k=0; k<peeled_kc; k+=pk)
          {
#define EIGEN_GEBGP_ONESTEP(K)                                                \
            do {                                                              \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");      \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                  \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                  \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                   \
              traits.madd(A0, B_0, C0, B1, fix<0>);                           \
              traits.madd(A1, B_0, C4, B_0, fix<0>);                          \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");        \
            } while(false)
          blB += int(pk) * int(RhsProgress);
          blA += int(pk) * 2 * int(Traits::LhsProgress);

          blA += 2*Traits::LhsProgress;
#undef EIGEN_GEBGP_ONESTEP
          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
  //---------- Process 1 * LhsProgress rows at once ----------
  if(mr>=1*Traits::LhsProgress)
  {
    lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
    p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
  }
  //---------- Process LhsProgressHalf rows at once ----------
  if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
  {
    lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
    p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
  }
  //---------- Process LhsProgressQuarter rows at once ----------
  if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
  {
    lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
    p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
  }
  //---------- Process remaining rows, one at a time ----------
  if(peeled_mc_quarter<rows)
  {
    // loop on each panel of the rhs
    for(Index j2=0; j2<packet_cols4; j2+=nr)
    {
      // loop on each remaining row of the lhs
      for(Index i=peeled_mc_quarter; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];

        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
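        // For these leftover rows the kernel swaps the roles of lhs and rhs
        // (the SwappedTraits/"straits" path below): one result coefficient
        // becomes a dot product that can still be vectorized along depth.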
        if ((SwappedTraits::LhsProgress % 4) == 0 &&
            (SwappedTraits::LhsProgress<=16) &&
            (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
            (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
        {
          const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
          for(; k<endk4; k+=4*spk)
          {
            straits.madd(A0,B_0,C0,B_0, fix<0>);
            straits.madd(A1,B_1,C1,B_1, fix<0>);

            straits.madd(A0,B_0,C2,B_0, fix<0>);
            straits.madd(A1,B_1,C3,B_1, fix<0>);

            blB += 4*SwappedTraits::LhsProgress;
          }
          for(; k<endk; k+=spk)
          {
            straits.madd(A0,B_0,C0,B_0, fix<0>);

            blB += SwappedTraits::LhsProgress;
          }
          if(SwappedTraits::LhsProgress==8)
          {
            SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
            SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);

            straits.madd(a0,b0,c0,b0, fix<0>);
            straits.acc(c0, alphav, R);

            res.scatterPacket(i, j2, R);
          }
          else if (SwappedTraits::LhsProgress==16)
          {
            straits.acc(C0, alphav, R);
            res.scatterPacket(i, j2, R);
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];

        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsScalar A0 = blA[k];
          RhsScalar B_0 = blB[k];
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename unpacket_traits<Packet>::half HalfPacket;
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
  enum { PacketSize = unpacket_traits<Packet>::size,
         HalfPacketSize = unpacket_traits<HalfPacket>::size,
         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
         HasHalf = (int)HalfPacketSize < (int)PacketSize,
         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};

  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index count = 0;
  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
  const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
  const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
  const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
  const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
                         : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;

  Index i = 0;
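  // The peeled_mc* bounds mirror the kernel's 3/2/1/half/quarter packet
  // cascade: each panel of rows is packed contiguously, one depth-column at a
  // time, so the kernels can stream the lhs coefficients linearly.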
  // Pack 3 packets
  if(Pack1>=3*PacketSize)
  {
    for(; i<peeled_mc3; i+=3*PacketSize)
    {
      if(PanelMode) count += (3*PacketSize) * offset;
        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
        C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
  // Pack 2 packets
  if(Pack1>=2*PacketSize)
  {
    for(; i<peeled_mc2; i+=2*PacketSize)
    {
      if(PanelMode) count += (2*PacketSize) * offset;
        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
  // Pack 1 packet
  if(Pack1>=1*PacketSize)
  {
    for(; i<peeled_mc1; i+=1*PacketSize)
    {
      if(PanelMode) count += (1*PacketSize) * offset;
        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
        pstore(blockA+count, cj.pconj(A));
      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
  // Pack half packets
  if(HasHalf && Pack1>=HalfPacketSize)
  {
    for(; i<peeled_mc_half; i+=HalfPacketSize)
    {
      if(PanelMode) count += (HalfPacketSize) * offset;
        A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
        pstoreu(blockA+count, cj.pconj(A));
        count+=HalfPacketSize;
      if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
  // Pack quarter packets
  if(HasQuarter && Pack1>=QuarterPacketSize)
  {
    for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
    {
      if(PanelMode) count += (QuarterPacketSize) * offset;
        A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
        pstoreu(blockA+count, cj.pconj(A));
        count+=QuarterPacketSize;
      if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
  if(Pack2<PacketSize && Pack2>1)
  {
    for(; i<peeled_mc0; i+=last_lhs_progress)
    {
      if(PanelMode) count += last_lhs_progress * offset;
        for(Index w=0; w<last_lhs_progress; w++)
          blockA[count++] = cj(lhs(i+w, k));
      if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename unpacket_traits<Packet>::half HalfPacket;
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
  enum { PacketSize = unpacket_traits<Packet>::size,
         HalfPacketSize = unpacket_traits<HalfPacket>::size,
         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
         HasHalf = (int)HalfPacketSize < (int)PacketSize,
         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
  bool gone_half = false, gone_quarter = false, gone_last = false;
  int psize = PacketSize;
    Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
    for(; i<peeled_mc; i+=pack)
    {
      if(PanelMode) count += pack * offset;
      if(pack>=psize && psize >= QuarterPacketSize)
      {
        for(; k<peeled_k; k+=psize)
        {
          for (Index m = 0; m < pack; m += psize)
          {
            if (psize == PacketSize) {
              PacketBlock<Packet> kernel;
              for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
              ptranspose(kernel);
              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
            } else if (HasHalf && psize == HalfPacketSize) {
              gone_half = true;
              PacketBlock<HalfPacket> kernel_half;
              for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
              ptranspose(kernel_half);
              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
            } else if (HasQuarter && psize == QuarterPacketSize) {
              gone_quarter = true;
              PacketBlock<QuarterPacket> kernel_quarter;
              for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
              ptranspose(kernel_quarter);
              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
            }
          }
          count += psize*pack;
        }
        for(; w<pack-3; w+=4)
        {
          const Scalar& a(cj(lhs(i+w+0, k))),
                        b(cj(lhs(i+w+1, k))),
                        c(cj(lhs(i+w+2, k))),
                        d(cj(lhs(i+w+3, k)));
          blockA[count++] = a;
          blockA[count++] = b;
          blockA[count++] = c;
          blockA[count++] = d;
        }
          blockA[count++] = cj(lhs(i+w, k));
      if(PanelMode) count += pack * (stride-offset-depth);
        (starting_pos == i || left >= psize/2 || left >= psize/4) &&
        ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
         (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
      if (Pack2 < PacketSize && !gone_last) {
        gone_last = true;
        psize = pack = left & ~1;
      }
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;
  const Index peeled_k = (depth/PacketSize)*PacketSize;
  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
  {
    // skip what we have before
    if(PanelMode) count += 4 * offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
    const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
    const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
    const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

    Index k=0;
    if((PacketSize%4)==0)
    {
      for(; k<peeled_k; k+=PacketSize) {
        PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
        kernel.packet[0           ] = dm0.template loadPacket<Packet>(k);
        kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
        kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
        kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
        ptranspose(kernel);
        pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
        pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
        pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
        pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
        count+=4*PacketSize;
      }
    for(; k<depth; k++)
    {
      blockB[count+0] = cj(dm0(k));
      blockB[count+1] = cj(dm1(k));
      blockB[count+2] = cj(dm2(k));
      blockB[count+3] = cj(dm3(k));
      count += 4;
    }
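    // The 4-column panels are stored transposed (via ptranspose above) so
    // that, for each k, the kernel reads the four rhs coefficients of one row
    // as a contiguous quadruplet.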
  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(dm0(k));
      count += 1;
    }
// this version is optimized for row major matrices
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
      for(Index k=0; k<depth; k++)
      {
        if (PacketSize==4) {
          Packet A = rhs.template loadPacket<Packet>(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += PacketSize;
        } else if (HasHalf && HalfPacketSize==4) {
          HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += HalfPacketSize;
        }
        else if (HasQuarter && QuarterPacketSize==4) {
          QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += QuarterPacketSize;
        } else {
          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
          blockB[count+0] = cj(dm0(0));
          blockB[count+1] = cj(dm0(1));
          blockB[count+2] = cj(dm0(2));
          blockB[count+3] = cj(dm0(3));
          count += 4;
        }
    // copy the remaining columns one at a time (nr==1)
    for(Index j2=packet_cols4; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(rhs(k, j2));
        count += 1;
      }
/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l2;
}

/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. */
inline std::ptrdiff_t l3CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l3;
}
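// Illustrative usage (sketch): Eigen::l1CacheSize(), l2CacheSize(), and
// l3CacheSize() report the cache sizes (in bytes) that the blocking heuristic
// above will use on this machine.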
#endif // EIGEN_GENERAL_BLOCK_PANEL_H