11 #ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H 12 #define EIGEN_MATRIX_PRODUCT_ALTIVEC_H 14 #ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK 15 #define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1 22 #if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY) 24 #define EIGEN_ALTIVEC_MMA_ONLY 26 #define EIGEN_ALTIVEC_DISABLE_MMA 32 #if __has_builtin(__builtin_mma_assemble_acc) 33 #define ALTIVEC_MMA_SUPPORT 37 #if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) 53 template<
typename Scalar>
95 16, 17, 18, 19, 20, 21, 22, 23};
99 24, 25, 26, 27, 28, 29, 30, 31};
119 template<
typename Scalar,
typename Index,
int StorageOrder>
122 std::complex<Scalar>
v;
138 template<
typename Scalar,
typename Index,
int StorageOrder,
int N>
144 const Index vectorDelta = vectorSize *
rows;
147 Index rir = 0, rii,
j = 0;
148 for(; j + vectorSize <=
cols; j+=vectorSize)
150 rii = rir + vectorDelta;
154 for(Index k = 0; k < vectorSize; k++)
156 std::complex<Scalar>
v = getAdjointVal<Scalar, Index, StorageOrder>(
i, j + k, rhs);
158 blockBf[rir + k] = v.real();
159 blockBf[rii + k] = v.imag();
169 rii = rir + ((cols -
j) * rows);
176 std::complex<Scalar>
v = getAdjointVal<Scalar, Index, StorageOrder>(
i, k, rhs);
178 blockBf[rir] = v.real();
179 blockBf[rii] = v.imag();
188 template<
typename Scalar,
typename Index,
int StorageOrder>
194 const Index vectorDelta = vectorSize *
depth;
197 Index rir = 0, rii,
j = 0;
198 for(; j + vectorSize <=
rows; j+=vectorSize)
200 rii = rir + vectorDelta;
204 for(Index k = 0; k < vectorSize; k++)
206 std::complex<Scalar>
v = getAdjointVal<Scalar, Index, StorageOrder>(j+k,
i, lhs);
208 blockAf[rir + k] = v.real();
209 blockAf[rii + k] = v.imag();
220 rii = rir + ((rows -
j) * depth);
227 std::complex<Scalar>
v = getAdjointVal<Scalar, Index, StorageOrder>(k,
i, lhs);
229 blockAf[rir] = v.real();
230 blockAf[rii] = v.imag();
239 template<
typename Scalar,
typename Index,
int StorageOrder,
int N>
247 for(;
j +
N*vectorSize <=
cols;
j+=
N*vectorSize)
250 for(; i <
depth; i++)
252 for(
Index k = 0; k <
N*vectorSize; k++)
255 blockB[ri + k] = rhs(
j+k, i);
257 blockB[ri + k] = rhs(i,
j+k);
271 blockB[ri] = rhs(
i, k);
273 blockB[ri] = rhs(k,
i);
280 template<
typename Scalar,
typename Index,
int StorageOrder>
288 for(;
j + vectorSize <=
rows;
j+=vectorSize)
292 for(; i <
depth; i++)
294 for(
Index k = 0; k < vectorSize; k++)
297 blockA[ri + k] = lhs(
j+k, i);
299 blockA[ri + k] = lhs(i,
j+k);
313 blockA[ri] = lhs(k,
i);
315 blockA[ri] = lhs(
i, k);
322 template<
typename Index,
int nr,
int StorageOrder>
327 symm_pack_complex_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride,
rows,
cols, k2);
331 template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
336 symm_pack_complex_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride,
cols,
rows);
342 template<
typename Index,
int nr,
int StorageOrder>
347 symm_pack_complex_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride,
rows,
cols, k2);
351 template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
356 symm_pack_complex_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride,
cols,
rows);
361 template<
typename Index,
int nr,
int StorageOrder>
366 symm_pack_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride,
rows,
cols, k2);
370 template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
375 symm_pack_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride,
cols,
rows);
380 template<
typename Index,
int nr,
int StorageOrder>
385 symm_pack_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride,
rows,
cols, k2);
389 template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
394 symm_pack_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride,
cols,
rows);
409 template<
typename Scalar,
typename Packet,
typename Index>
413 pstore<Scalar>(to + (0 *
size), block.
packet[0]);
414 pstore<Scalar>(to + (1 *
size), block.
packet[1]);
415 pstore<Scalar>(to + (2 *
size), block.
packet[2]);
416 pstore<Scalar>(to + (3 *
size), block.
packet[3]);
419 template<
typename Scalar,
typename Packet,
typename Index>
423 pstore<Scalar>(to + (0 *
size), block.
packet[0]);
424 pstore<Scalar>(to + (1 *
size), block.
packet[1]);
428 template<
typename Scalar,
typename Index,
typename DataMapper,
typename Packet,
typename PacketC,
int StorageOrder,
bool Conjugate,
bool PanelMode,
bool UseLhs>
433 const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
434 Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
438 for(; j + vectorSize <=
rows; j+=vectorSize)
442 rii = rir + vectorDelta;
444 for(; i + vectorSize <=
depth; i+=vectorSize)
450 bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs,
j,
i);
452 bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs,
i,
j);
473 if(((StorageOrder ==
RowMajor) && UseLhs) || (((StorageOrder ==
ColMajor) && !UseLhs)))
479 storeBlock<Scalar, Packet, Index>(blockAt + rir, blockr);
480 storeBlock<Scalar, Packet, Index>(blockAt + rii, blocki);
485 for(; i <
depth; i++)
490 if(((StorageOrder ==
ColMajor) && UseLhs) || (((StorageOrder ==
RowMajor) && !UseLhs)))
493 cblock.
packet[0] = lhs.template loadPacket<PacketC>(j + 0,
i);
494 cblock.
packet[1] = lhs.template loadPacket<PacketC>(j + 2,
i);
496 cblock.
packet[0] = lhs.template loadPacket<PacketC>(
i, j + 0);
497 cblock.
packet[1] = lhs.template loadPacket<PacketC>(
i, j + 2);
500 std::complex<Scalar> lhs0, lhs1;
502 lhs0 = lhs(j + 0, i);
503 lhs1 = lhs(j + 1, i);
505 lhs0 = lhs(j + 2, i);
506 lhs1 = lhs(j + 3, i);
509 lhs0 = lhs(i, j + 0);
510 lhs1 = lhs(i, j + 1);
512 lhs0 = lhs(i, j + 2);
513 lhs1 = lhs(i, j + 3);
526 pstore<Scalar>(blockAt + rir, blockr.
packet[0]);
527 pstore<Scalar>(blockAt + rii, blocki.
packet[0]);
533 rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
538 if(PanelMode) rir += (offset*(rows - j - vectorSize));
539 rii = rir + (((PanelMode) ? stride : depth) * (rows -
j));
547 blockAt[rir] = lhs(k,
i).real();
550 blockAt[rii] = -lhs(k,
i).imag();
552 blockAt[rii] = lhs(k,
i).imag();
554 blockAt[rir] = lhs(
i, k).real();
557 blockAt[rii] = -lhs(
i, k).imag();
559 blockAt[rii] = lhs(
i, k).imag();
571 template<
typename Scalar,
typename Index,
typename DataMapper,
typename Packet,
int StorageOrder,
bool PanelMode,
bool UseLhs>
578 for(;
j + vectorSize <=
rows;
j+=vectorSize)
582 if(PanelMode) ri += vectorSize*
offset;
584 for(; i + vectorSize <=
depth; i+=vectorSize)
589 bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(
block, lhs,
j,
i);
591 bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(
block, lhs,
i,
j);
593 if(((StorageOrder ==
RowMajor) && UseLhs) || ((StorageOrder ==
ColMajor) && !UseLhs))
598 storeBlock<Scalar, Packet, Index>(blockA + ri,
block);
602 for(; i <
depth; i++)
604 if(((StorageOrder ==
RowMajor) && UseLhs) || ((StorageOrder ==
ColMajor) && !UseLhs))
607 blockA[ri+0] = lhs(
j+0, i);
608 blockA[ri+1] = lhs(
j+1, i);
609 blockA[ri+2] = lhs(
j+2, i);
610 blockA[ri+3] = lhs(
j+3, i);
612 blockA[ri+0] = lhs(i,
j+0);
613 blockA[ri+1] = lhs(i,
j+1);
614 blockA[ri+2] = lhs(i,
j+2);
615 blockA[ri+3] = lhs(i,
j+3);
620 lhsV = lhs.template loadPacket<Packet>(
j,
i);
622 lhsV = lhs.template loadPacket<Packet>(
i,
j);
624 pstore<Scalar>(blockA + ri, lhsV);
630 if(PanelMode) ri += vectorSize*(stride - offset -
depth);
635 if(PanelMode) ri += offset*(rows -
j);
643 blockA[ri] = lhs(k,
i);
645 blockA[ri] = lhs(
i, k);
655 template<
typename Index,
typename DataMapper,
int StorageOrder,
bool PanelMode>
663 for(;
j + vectorSize <=
rows;
j+=vectorSize)
667 if(PanelMode) ri += vectorSize*
offset;
669 for(; i + vectorSize <=
depth; i+=vectorSize)
674 block.
packet[0] = lhs.template loadPacket<Packet2d>(
j + 0,
i);
675 block.
packet[1] = lhs.template loadPacket<Packet2d>(j + 1,
i);
679 block.
packet[0] = lhs.template loadPacket<Packet2d>(
j, i + 0);
680 block.
packet[1] = lhs.template loadPacket<Packet2d>(
j, i + 1);
683 storeBlock<double, Packet2d, Index>(blockA + ri,
block);
687 for(; i <
depth; i++)
691 blockA[ri+0] = lhs(
j+0, i);
692 blockA[ri+1] = lhs(
j+1, i);
694 Packet2d lhsV = lhs.template loadPacket<Packet2d>(
j,
i);
701 if(PanelMode) ri += vectorSize*(stride - offset -
depth);
706 if(PanelMode) ri += offset*(rows -
j);
713 blockA[ri] = lhs(k,
i);
722 template<
typename Index,
typename DataMapper,
int StorageOrder,
bool PanelMode>
730 for(;
j + 2*vectorSize <=
cols;
j+=2*vectorSize)
734 if(PanelMode) ri += offset*(2*vectorSize);
736 for(; i + vectorSize <=
depth; i+=vectorSize)
742 block1.
packet[0] = rhs.template loadPacket<Packet2d>(
i,
j + 0);
743 block1.
packet[1] = rhs.template loadPacket<Packet2d>(
i,
j + 1);
744 block2.
packet[0] = rhs.template loadPacket<Packet2d>(
i,
j + 2);
745 block2.
packet[1] = rhs.template loadPacket<Packet2d>(
i,
j + 3);
755 block.
packet[0] = rhs.template loadPacket<Packet2d>(i + 0,
j + 0);
756 block.
packet[1] = rhs.template loadPacket<Packet2d>(i + 0,
j + 2);
757 block.
packet[2] = rhs.template loadPacket<Packet2d>(i + 1,
j + 0);
758 block.
packet[3] = rhs.template loadPacket<Packet2d>(i + 1,
j + 2);
760 storeBlock<double, Packet2d, Index>(blockB + ri,
block);
765 for(; i <
depth; i++)
769 blockB[ri+0] = rhs(i,
j+0);
770 blockB[ri+1] = rhs(i,
j+1);
774 blockB[ri+0] = rhs(i,
j+2);
775 blockB[ri+1] = rhs(i,
j+3);
777 Packet2d rhsV = rhs.template loadPacket<Packet2d>(
i,
j);
782 rhsV = rhs.template loadPacket<Packet2d>(
i,
j + 2);
788 if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
793 if(PanelMode) ri += offset*(cols -
j);
800 blockB[ri] = rhs(
i, k);
809 template<
typename Index,
typename DataMapper,
typename Packet,
typename PacketC,
int StorageOrder,
bool Conjugate,
bool PanelMode>
815 const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
816 Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
817 double* blockAt =
reinterpret_cast<double *
>(blockA);
820 for(; j + vectorSize <=
rows; j+=vectorSize)
824 rii = rir + vectorDelta;
826 for(; i + vectorSize <=
depth; i+=vectorSize)
833 cblock.
packet[0] = lhs.template loadPacket<PacketC>(
j, i + 0);
834 cblock.
packet[1] = lhs.template loadPacket<PacketC>(
j, i + 1);
836 cblock.
packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0);
837 cblock.
packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1);
845 cblock.
packet[0] = lhs.template loadPacket<PacketC>(j + 0,
i);
846 cblock.
packet[1] = lhs.template loadPacket<PacketC>(j + 1,
i);
848 cblock.
packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1);
849 cblock.
packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1);
864 storeBlock<double, Packet, Index>(blockAt + rir, blockr);
865 storeBlock<double, Packet, Index>(blockAt + rii, blocki);
870 for(; i <
depth; i++)
875 cblock.
packet[0] = lhs.template loadPacket<PacketC>(j + 0,
i);
876 cblock.
packet[1] = lhs.template loadPacket<PacketC>(j + 1,
i);
893 rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
898 if(PanelMode) rir += (offset*(rows - j - vectorSize));
899 rii = rir + (((PanelMode) ? stride : depth) * (rows -
j));
906 blockAt[rir] = lhs(k,
i).real();
909 blockAt[rii] = -lhs(k,
i).imag();
911 blockAt[rii] = lhs(k,
i).imag();
922 template<
typename Index,
typename DataMapper,
typename Packet,
typename PacketC,
int StorageOrder,
bool Conjugate,
bool PanelMode>
928 const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
929 Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
930 double* blockBt =
reinterpret_cast<double *
>(blockB);
933 for(; j + 2*vectorSize <=
cols; j+=2*vectorSize)
937 rii = rir + vectorDelta;
939 for(; i <
depth; i++)
944 bload<DataMapper, PacketC, Index, 2, 0, ColMajor>(cblock, rhs,
i,
j);
946 blockr.
packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
947 blockr.
packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
949 blocki.
packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
950 blocki.
packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
958 storeBlock<double, Packet, Index>(blockBt + rir, blockr);
959 storeBlock<double, Packet, Index>(blockBt + rii, blocki);
965 rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
970 if(PanelMode) rir += (offset*(cols - j - 2*vectorSize));
971 rii = rir + (((PanelMode) ? stride : depth) * (cols -
j));
978 blockBt[rir] = rhs(
i, k).real();
981 blockBt[rii] = -rhs(
i, k).imag();
983 blockBt[rii] = rhs(
i, k).imag();
998 template<
typename Packet,
bool NegativeAccumulate>
1001 if(NegativeAccumulate)
1003 acc->
packet[0] = vec_nmsub(lhsV, rhsV[0], acc->
packet[0]);
1004 acc->
packet[1] = vec_nmsub(lhsV, rhsV[1], acc->
packet[1]);
1005 acc->
packet[2] = vec_nmsub(lhsV, rhsV[2], acc->
packet[2]);
1006 acc->
packet[3] = vec_nmsub(lhsV, rhsV[3], acc->
packet[3]);
1008 acc->
packet[0] = vec_madd(lhsV, rhsV[0], acc->
packet[0]);
1009 acc->
packet[1] = vec_madd(lhsV, rhsV[1], acc->
packet[1]);
1010 acc->
packet[2] = vec_madd(lhsV, rhsV[2], acc->
packet[2]);
1011 acc->
packet[3] = vec_madd(lhsV, rhsV[3], acc->
packet[3]);
1015 template<
typename Packet,
bool NegativeAccumulate>
1018 if(NegativeAccumulate)
1020 acc->
packet[0] = vec_nmsub(lhsV, rhsV[0], acc->
packet[0]);
1022 acc->
packet[0] = vec_madd(lhsV, rhsV[0], acc->
packet[0]);
1026 template<
int N,
typename Scalar,
typename Packet,
bool NegativeAccumulate>
1029 Packet lhsV = pload<Packet>(lhs);
1031 pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
1034 template<
typename Scalar,
typename Packet,
typename Index>
1038 lhsV = vec_xl_len((
Scalar *)lhs, remaining_rows *
sizeof(
Scalar));
1043 }
while (++i < remaining_rows);
1047 template<
int N,
typename Scalar,
typename Packet,
typename Index,
bool NegativeAccumulate>
1051 loadPacketRemaining<Scalar, Packet, Index>(lhs, lhsV, remaining_rows);
1053 pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
1057 template<
int N,
typename Packet,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1060 pger_common<Packet, false>(accReal, lhsV, rhsV);
1063 pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
1067 pger_common<Packet, ConjugateLhs == ConjugateRhs>(accReal, lhsVi, rhsVi);
1068 pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
1072 pger_common<Packet, ConjugateLhs>(accImag, lhsVi, rhsV);
1076 template<
int N,
typename Scalar,
typename Packet,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1079 Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
1081 if(!LhsIsReal) lhsVi = ploadLhs<Scalar, Packet>(lhs_ptr_imag);
1084 pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
1087 template<
typename Scalar,
typename Packet,
typename Index,
bool LhsIsReal>
1091 lhsV = vec_xl_len((
Scalar *)lhs_ptr, remaining_rows *
sizeof(
Scalar));
1092 if(!LhsIsReal) lhsVi = vec_xl_len((
Scalar *)lhs_ptr_imag, remaining_rows *
sizeof(
Scalar));
1097 lhsV[
i] = lhs_ptr[
i];
1098 if(!LhsIsReal) lhsVi[
i] = lhs_ptr_imag[
i];
1099 }
while (++i < remaining_rows);
1104 template<
int N,
typename Scalar,
typename Packet,
typename Index,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1108 loadPacketRemaining<Scalar, Packet, Index, LhsIsReal>(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows);
1110 pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
1113 template<
typename Scalar,
typename Packet>
1116 return ploadu<Packet>(lhs);
1120 template<
typename Scalar,
typename Packet>
1129 template<
typename Scalar,
typename Packet>
1136 template<
typename Packet>
1145 template<
typename Packet>
1151 template<
typename Packet>
1160 template<
typename Packet>
1167 template<
typename Packet,
int N>
1170 bscalec_common<Packet>(cReal, aReal, bReal);
1172 bscalec_common<Packet>(cImag, aImag, bReal);
1174 pger_common<Packet, true>(&cReal, bImag, aImag.packet);
1176 pger_common<Packet, false>(&cImag, bImag, aReal.packet);
1179 template<
typename Packet>
1188 template<
typename Packet>
1189 EIGEN_ALWAYS_INLINE void bscalec(
PacketBlock<Packet,4>& aReal,
PacketBlock<Packet,4>& aImag,
const Packet& bReal,
const Packet& bImag,
PacketBlock<Packet,4>& cReal,
PacketBlock<Packet,4>& cImag,
const Packet& pMask)
1191 band<Packet>(aReal, pMask);
1192 band<Packet>(aImag, pMask);
1194 bscalec<Packet,4>(aReal, aImag, bReal, bImag, cReal, cImag);
1198 template<
typename DataMapper,
typename Packet,
typename Index, const Index accCols,
int N,
int StorageOrder>
1202 acc.
packet[0] = res.template loadPacket<Packet>(row + 0, col +
N*accCols);
1203 acc.
packet[1] = res.template loadPacket<Packet>(row + 1, col +
N*accCols);
1204 acc.
packet[2] = res.template loadPacket<Packet>(row + 2, col +
N*accCols);
1205 acc.
packet[3] = res.template loadPacket<Packet>(row + 3, col +
N*accCols);
1207 acc.
packet[0] = res.template loadPacket<Packet>(row +
N*accCols, col + 0);
1208 acc.
packet[1] = res.template loadPacket<Packet>(row +
N*accCols, col + 1);
1209 acc.
packet[2] = res.template loadPacket<Packet>(row +
N*accCols, col + 2);
1210 acc.
packet[3] = res.template loadPacket<Packet>(row +
N*accCols, col + 3);
1215 template<
typename DataMapper,
typename Packet,
typename Index, const Index accCols,
int N,
int StorageOrder>
1219 acc.
packet[0] = res.template loadPacket<Packet>(row + 0, col +
N*accCols);
1220 acc.
packet[1] = res.template loadPacket<Packet>(row + 1, col +
N*accCols);
1221 acc.
packet[2] = res.template loadPacket<Packet>(row + 2, col +
N*accCols);
1222 acc.
packet[3] = res.template loadPacket<Packet>(row + 3, col +
N*accCols);
1223 acc.
packet[4] = res.template loadPacket<Packet>(row + 0, col + (
N+1)*accCols);
1224 acc.
packet[5] = res.template loadPacket<Packet>(row + 1, col + (
N+1)*accCols);
1225 acc.
packet[6] = res.template loadPacket<Packet>(row + 2, col + (
N+1)*accCols);
1226 acc.
packet[7] = res.template loadPacket<Packet>(row + 3, col + (
N+1)*accCols);
1228 acc.
packet[0] = res.template loadPacket<Packet>(row +
N*accCols, col + 0);
1229 acc.
packet[1] = res.template loadPacket<Packet>(row +
N*accCols, col + 1);
1230 acc.
packet[2] = res.template loadPacket<Packet>(row +
N*accCols, col + 2);
1231 acc.
packet[3] = res.template loadPacket<Packet>(row +
N*accCols, col + 3);
1232 acc.
packet[4] = res.template loadPacket<Packet>(row + (
N+1)*accCols, col + 0);
1233 acc.
packet[5] = res.template loadPacket<Packet>(row + (
N+1)*accCols, col + 1);
1234 acc.
packet[6] = res.template loadPacket<Packet>(row + (
N+1)*accCols, col + 2);
1235 acc.
packet[7] = res.template loadPacket<Packet>(row + (
N+1)*accCols, col + 3);
1239 template<
typename DataMapper,
typename Packet,
typename Index, const Index accCols,
int N,
int StorageOrder>
1242 acc.
packet[0] = res.template loadPacket<Packet>(row +
N*accCols, col + 0);
1243 acc.
packet[1] = res.template loadPacket<Packet>(row + (
N+1)*accCols, col + 0);
1252 template<
typename Packet>
1255 if (remaining_rows == 0) {
1256 return pset1<Packet>(
float(0.0));
1258 switch (remaining_rows) {
1259 case 1:
return Packet(mask41);
1260 case 2:
return Packet(mask42);
1261 default:
return Packet(mask43);
1269 if (remaining_rows == 0) {
1276 template<
typename Packet>
1279 band<Packet>(accZ, pMask);
1281 bscale<Packet>(acc, accZ, pAlpha);
1284 template<
typename Packet>
1287 pbroadcast4<Packet>(
a, a0,
a1,
a2,
a3);
1295 a0 = vec_splat(a1, 0);
1296 a1 = vec_splat(a1, 1);
1297 a2 = vec_splat(a3, 0);
1298 a3 = vec_splat(a3, 1);
1304 template<
typename Scalar,
typename Packet,
typename Index>
1309 Index remaining_rows,
1310 Index remaining_cols)
1313 rhsV[0] = pset1<Packet>(rhs_ptr[0]);
1314 pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
1315 lhs_ptr += remaining_rows;
1316 rhs_ptr += remaining_cols;
1319 template<
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accRows>
1321 const DataMapper&
res,
1329 Index remaining_rows,
1330 Index remaining_cols,
1333 const Scalar* rhs_ptr = rhs_base;
1334 const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
1337 bsetzero<Scalar, Packet>(accZero);
1339 Index remaining_depth = (depth & -accRows);
1341 for(; k +
PEEL <= remaining_depth; k+=
PEEL)
1345 for (
int l = 0;
l <
PEEL;
l++) {
1346 MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
1349 for(; k < remaining_depth; k++)
1351 MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
1353 for(; k <
depth; k++)
1356 rhsV[0] = pset1<Packet>(rhs_ptr[0]);
1357 pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
1358 lhs_ptr += remaining_rows;
1359 rhs_ptr += remaining_cols;
1362 accZero.
packet[0] = vec_mul(pAlpha, accZero.
packet[0]);
1363 for(
Index i = 0;
i < remaining_rows;
i++) {
1368 template<
typename Scalar,
typename Packet,
typename Index, const Index accRows>
1373 Index remaining_rows)
1376 pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1377 pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
1378 lhs_ptr += remaining_rows;
1382 template<
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accRows, const Index accCols>
1384 const DataMapper&
res,
1394 Index remaining_rows,
1398 const Scalar* rhs_ptr = rhs_base;
1399 const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
1402 bsetzero<Scalar, Packet>(accZero);
1404 Index remaining_depth = (col + accRows <
cols) ? depth : (depth & -accRows);
1406 for(; k +
PEEL <= remaining_depth; k+=
PEEL)
1410 for (
int l = 0;
l <
PEEL;
l++) {
1411 MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
1414 for(; k < remaining_depth; k++)
1416 MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
1419 if ((remaining_depth == depth) && (rows >= accCols))
1422 acc.
packet[
j] = res.template loadPacket<Packet>(
row, col +
j);
1424 bscale<Packet>(acc, accZero, pAlpha, pMask);
1425 res.template storePacketBlock<Packet,4>(
row,
col, acc);
1427 for(; k <
depth; k++)
1430 pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1431 pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
1432 lhs_ptr += remaining_rows;
1440 for(
Index i = 0;
i < remaining_rows;
i++) {
1447 #define MICRO_UNROLL(func) \ 1448 func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) 1450 #define MICRO_UNROLL_WORK(func, func2, peel) \ 1451 MICRO_UNROLL(func2); \ 1452 func(0,peel) func(1,peel) func(2,peel) func(3,peel) \ 1453 func(4,peel) func(5,peel) func(6,peel) func(7,peel) 1455 #define MICRO_LOAD_ONE(iter) \ 1456 if (unroll_factor > iter) { \ 1457 lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \ 1458 lhs_ptr##iter += accCols; \ 1460 EIGEN_UNUSED_VARIABLE(lhsV##iter); \ 1463 #define MICRO_WORK_ONE(iter, peel) \ 1464 if (unroll_factor > iter) { \ 1465 pger_common<Packet, false>(&accZero##iter, lhsV##iter, rhsV##peel); \ 1468 #define MICRO_TYPE_PEEL4(func, func2, peel) \ 1469 if (PEEL > peel) { \ 1470 Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ 1471 pbroadcast4<Packet>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ 1472 MICRO_UNROLL_WORK(func, func2, peel) \ 1474 EIGEN_UNUSED_VARIABLE(rhsV##peel); \ 1477 #define MICRO_TYPE_PEEL1(func, func2, peel) \ 1478 if (PEEL > peel) { \ 1479 Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ 1480 rhsV##peel[0] = pset1<Packet>(rhs_ptr[remaining_cols * peel]); \ 1481 MICRO_UNROLL_WORK(func, func2, peel) \ 1483 EIGEN_UNUSED_VARIABLE(rhsV##peel); \ 1486 #define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \ 1487 Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ 1488 func(func1,func2,0); func(func1,func2,1); \ 1489 func(func1,func2,2); func(func1,func2,3); \ 1490 func(func1,func2,4); func(func1,func2,5); \ 1491 func(func1,func2,6); func(func1,func2,7); \ 1492 func(func1,func2,8); func(func1,func2,9); 1494 #define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \ 1496 func(func1,func2,0); 1498 #define MICRO_ONE_PEEL4 \ 1499 MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ 1500 rhs_ptr += (accRows * PEEL); 1502 #define MICRO_ONE4 \ 1503 
MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ 1506 #define MICRO_ONE_PEEL1 \ 1507 MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ 1508 rhs_ptr += (remaining_cols * PEEL); 1510 #define MICRO_ONE1 \ 1511 MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ 1512 rhs_ptr += remaining_cols; 1514 #define MICRO_DST_PTR_ONE(iter) \ 1515 if (unroll_factor > iter) { \ 1516 bsetzero<Scalar, Packet>(accZero##iter); \ 1518 EIGEN_UNUSED_VARIABLE(accZero##iter); \ 1521 #define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE) 1523 #define MICRO_SRC_PTR_ONE(iter) \ 1524 if (unroll_factor > iter) { \ 1525 lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ 1527 EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ 1530 #define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE) 1532 #define MICRO_PREFETCH_ONE(iter) \ 1533 if (unroll_factor > iter) { \ 1534 EIGEN_POWER_PREFETCH(lhs_ptr##iter); \ 1537 #define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE) 1539 #define MICRO_STORE_ONE(iter) \ 1540 if (unroll_factor > iter) { \ 1541 acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \ 1542 acc.packet[1] = res.template loadPacket<Packet>(row + iter*accCols, col + 1); \ 1543 acc.packet[2] = res.template loadPacket<Packet>(row + iter*accCols, col + 2); \ 1544 acc.packet[3] = res.template loadPacket<Packet>(row + iter*accCols, col + 3); \ 1545 bscale<Packet>(acc, accZero##iter, pAlpha); \ 1546 res.template storePacketBlock<Packet,4>(row + iter*accCols, col, acc); \ 1549 #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE) 1551 #define MICRO_COL_STORE_ONE(iter) \ 1552 if (unroll_factor > iter) { \ 1553 acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \ 1554 bscale<Packet>(acc, accZero##iter, pAlpha); \ 1555 res.template storePacketBlock<Packet,1>(row + iter*accCols, col, acc); \ 1558 #define MICRO_COL_STORE 
MICRO_UNROLL(MICRO_COL_STORE_ONE) 1560 template<
int unroll_factor,
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accRows, const Index accCols>
1562 const DataMapper&
res,
1572 const Scalar* rhs_ptr = rhs_base;
1574 PacketBlock<Packet,4> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
1587 for(; k <
depth; k++)
1593 row += unroll_factor*accCols;
1596 template<
int unroll_factor,
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accCols>
1598 const DataMapper&
res,
1606 Index remaining_cols,
1609 const Scalar* rhs_ptr = rhs_base;
1611 PacketBlock<Packet,1> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
1624 for(; k <
depth; k++)
1630 row += unroll_factor*accCols;
1633 template<
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accCols>
1635 const DataMapper&
res,
1644 Index remaining_cols,
1647 #define MAX_UNROLL 6 1649 gemm_unrolled_col_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, remaining_cols, pAlpha);
1651 switch( (rows-row)/accCols ) {
1654 gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, remaining_cols, pAlpha);
1659 gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, remaining_cols, pAlpha);
1664 gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, remaining_cols, pAlpha);
1669 gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, remaining_cols, pAlpha);
1674 gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, remaining_cols, pAlpha);
1679 gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, remaining_cols, pAlpha);
1684 gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, remaining_cols, pAlpha);
1696 template<
typename Scalar,
typename Index,
typename Packet,
typename RhsPacket,
typename DataMapper, const Index accRows, const Index accCols>
1697 EIGEN_STRONG_INLINE void gemm(
const DataMapper&
res,
const Scalar* blockA,
const Scalar* blockB,
Index rows,
Index depth,
Index cols,
Scalar alpha,
Index strideA,
Index strideB,
Index offsetA,
Index offsetB)
1699 const Index remaining_rows = rows % accCols;
1700 const Index remaining_cols = cols % accRows;
1702 if( strideA == -1 ) strideA =
depth;
1703 if( strideB == -1 ) strideB =
depth;
1706 const Packet pMask = bmask<Packet>((
const int)(remaining_rows));
1709 for(; col + accRows <=
cols; col += accRows)
1711 const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
1712 const Scalar* lhs_base = blockA;
1715 #define MAX_UNROLL 6 1717 gemm_unrolled_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accRows, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, pAlpha);
1719 switch( (rows-row)/accCols ) {
1722 gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, pAlpha);
1727 gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, pAlpha);
1732 gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, pAlpha);
1737 gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, pAlpha);
1742 gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, pAlpha);
1747 gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, pAlpha);
1752 gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, pAlpha);
1760 if(remaining_rows > 0)
1762 gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col,
rows,
cols, remaining_rows, pAlpha, pMask);
1766 if(remaining_cols > 0)
1768 const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
1769 const Scalar* lhs_base = blockA;
1771 for(; col <
cols; col++)
1775 gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
rows,
col, remaining_cols, pAlpha);
1777 if (remaining_rows > 0)
1779 gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(
res, lhs_base, rhs_base,
depth, strideA, offsetA,
row,
col, remaining_rows, remaining_cols, pAlpha);
// NOTE(review): this listing fuses the original file's line numbers into the text and omits some
// lines, including the one that names this function template. The name is presumably
// MICRO_COMPLEX_EXTRA_COL (a call with the same template argument list appears in this file) - confirm.
//
// Helper macros defined on the next line:
//   accColsC     - half of accCols.
//   advanceRows  - 1 when LhsIsReal is true, otherwise 2.
//   advanceCols  - 1 when RhsIsReal is true, otherwise 2.
//   PEEL_COMPLEX - the constant 3.
//
// Visible body: a single depth step. One rhs real value (and one imaginary value unless RhsIsReal)
// is broadcast with pset1 and passed to pgerc<1, ...> together with the lhs pointers and the
// accReal/accImag accumulators. Afterwards the lhs pointers move forward by remaining_rows and the
// rhs pointers by remaining_cols. The pointers are taken by reference, so the caller sees the advance.
1786 #define accColsC (accCols / 2) 1787 #define advanceRows ((LhsIsReal) ? 1 : 2) 1788 #define advanceCols ((RhsIsReal) ? 1 : 2) 1791 #define PEEL_COMPLEX 3 1793 template<
typename Scalar,
typename Packet,
typename Index, const Index accRows,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1795 const Scalar* &lhs_ptr_real,
const Scalar* &lhs_ptr_imag,
1796 const Scalar* &rhs_ptr_real,
const Scalar* &rhs_ptr_imag,
1798 Index remaining_rows,
1799 Index remaining_cols)
1801 Packet rhsV[1], rhsVi[1];
1802 rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
// The imaginary broadcast is skipped when the rhs is real-valued.
1803 if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
1804 pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
// Step both operands to the next depth element.
1805 lhs_ptr_real += remaining_rows;
1806 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1808 rhs_ptr_real += remaining_cols;
1809 if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
// NOTE(review): the line naming this function template is missing from this listing; it is
// presumably gemm_complex_extra_col (gemm_complex calls a function of that name with this template
// argument list) - confirm. Several parameter lines, local declarations, braces and loop headers
// are missing as well.
//
// Visible behaviour: accumulates a product into accReal/accImag for the case where both rows and
// columns are leftovers (remaining_rows, remaining_cols), scales it by (pAlphaReal, pAlphaImag)
// and adds it into res at (row, col).
1813 template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1815 const DataMapper&
res,
1824 Index remaining_rows,
1825 Index remaining_cols,
1826 const Packet& pAlphaReal,
1827 const Packet& pAlphaImag)
// The rhs imaginary pointer is set only for a complex rhs, at remaining_cols*strideB past the real part.
1829 const Scalar* rhs_ptr_real = rhs_base;
1830 const Scalar* rhs_ptr_imag;
1831 if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB;
// The lhs imaginary pointer is set only for a complex lhs, at remaining_rows*strideA past the real part.
1833 const Scalar* lhs_ptr_real = lhs_base +
advanceRows*row*strideA + remaining_rows*offsetA;
1834 const Scalar* lhs_ptr_imag;
1835 if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
1841 bsetzero<Scalar, Packet>(accReal);
1842 bsetzero<Scalar, Packet>(accImag);
// depth & -accRows clears the low bits of depth; when accRows is a power of two this is depth
// rounded down to a multiple of accRows.
1844 Index remaining_depth = (depth & -accRows);
// NOTE(review): the lines between the statement above and the call below are not shown, so the
// loop that encloses this first call is unknown - confirm against the full file.
1857 MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
1860 for(; k < remaining_depth; k++)
1862 MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
// Depth steps from remaining_depth up to depth use the pgerc overload that also takes
// remaining_rows (note the extra Index template argument).
1865 for(; k <
depth; k++)
1867 Packet rhsV[1], rhsVi[1];
1868 rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
1869 if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
1870 pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
1871 lhs_ptr_real += remaining_rows;
1872 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1873 rhs_ptr_real += remaining_cols;
1874 if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
// Scale by alpha, then combine the real/imag accumulators into complex packets acc0/acc1.
1877 bscalec<Packet,1>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
1878 bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
// A float scalar with exactly one leftover row is accumulated through the scalar accessor res(row, col).
1880 if ((
sizeof(
Scalar) ==
sizeof(
float)) && (remaining_rows == 1))
1882 res(row + 0, col + 0) += pfirst<Packetc>(acc0.packet[0]);
// NOTE(review): presumably the else branch (the `else` line is not shown) - load the existing
// packet from res, add it to acc0 and store the sum back.
1884 acc0.packet[0] += res.template loadPacket<Packetc>(row + 0, col + 0);
1885 res.template storePacketBlock<Packetc,1>(row + 0, col + 0, acc0);
// NOTE(review): the line naming this function template is missing from this listing; it is
// presumably MICRO_COMPLEX_EXTRA_ROW (a call with the same template argument list appears in this
// file) - confirm.
//
// Visible body: a single depth step for leftover rows. Four rhs real values (and four imaginary
// values unless RhsIsReal) are broadcast with pbroadcast4_old and passed to pgerc<4, ...> with the
// lhs pointers and the accReal/accImag accumulators. The lhs pointers then advance by
// remaining_rows and the rhs pointers by accRows; all four pointers are taken by reference.
1892 template<
typename Scalar,
typename Packet,
typename Index, const Index accRows,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1894 const Scalar* &lhs_ptr_real,
const Scalar* &lhs_ptr_imag,
1895 const Scalar* &rhs_ptr_real,
const Scalar* &rhs_ptr_imag,
1897 Index remaining_rows)
1899 Packet rhsV[4], rhsVi[4];
1900 pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
// The imaginary broadcast is skipped when the rhs is real-valued.
1901 if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
1902 pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
// Step both operands to the next depth element.
1903 lhs_ptr_real += remaining_rows;
1904 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1906 rhs_ptr_real += accRows;
1907 if(!RhsIsReal) rhs_ptr_imag += accRows;
// NOTE(review): the line naming this function template is missing from this listing; it is
// presumably gemm_complex_extra_row (gemm_complex calls a function of that name with this template
// argument list) - confirm. Several parameter lines, local declarations, braces, the `else` and the
// loop over j are missing as well.
//
// Visible behaviour: accumulates the product for the leftover rows (remaining_rows) of a column
// panel that is accRows wide, scales it by (pAlphaReal, pAlphaImag) and adds it into res.
1911 template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1913 const DataMapper&
res,
1924 Index remaining_rows,
1925 const Packet& pAlphaReal,
1926 const Packet& pAlphaImag,
// The rhs imaginary pointer is set only for a complex rhs, at accRows*strideB past the real part.
1929 const Scalar* rhs_ptr_real = rhs_base;
1930 const Scalar* rhs_ptr_imag;
1931 if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB;
// The lhs imaginary pointer is set only for a complex lhs, at remaining_rows*strideA past the real part.
1933 const Scalar* lhs_ptr_real = lhs_base +
advanceRows*row*strideA + remaining_rows*offsetA;
1934 const Scalar* lhs_ptr_imag;
1935 if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
1942 bsetzero<Scalar, Packet>(accReal);
1943 bsetzero<Scalar, Packet>(accImag);
// The whole depth is handled by the MICRO_COMPLEX_EXTRA_ROW steps below unless this is the last
// full column panel (col + accRows >= cols); in that case only depth & -accRows steps are, and the
// remainder is handled by the pgerc loop further down.
1945 Index remaining_depth = (col + accRows <
cols) ? depth : (depth & -accRows);
// NOTE(review): the lines between the statement above and the call below are not shown, so the
// loop that encloses this first call is unknown - confirm against the full file.
1958 MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
1961 for(; k < remaining_depth; k++)
1963 MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
// Path taken when no depth remainder is left and rows >= accCols: load the result tile, scale with
// the bscalec overload that takes pMask, couple with the loaded tile, and store two 4-packet blocks
// (at row and at row + accColsC).
1966 if ((remaining_depth == depth) && (rows >= accCols))
1968 bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes,
res,
row,
col);
1969 bscalec<Packet>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
1970 bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1);
1971 res.template storePacketBlock<Packetc,4>(row + 0,
col, acc0);
1972 res.template storePacketBlock<Packetc,4>(row +
accColsC,
col, acc1);
// NOTE(review): presumably the else branch (the `else` line is not shown). Depth steps from
// remaining_depth up to depth use the pgerc overload that also takes remaining_rows.
1974 for(; k <
depth; k++)
1976 Packet rhsV[4], rhsVi[4];
1977 pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1978 if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
1979 pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
1980 lhs_ptr_real += remaining_rows;
1981 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1982 rhs_ptr_real += accRows;
1983 if(!RhsIsReal) rhs_ptr_imag += accRows;
// Scale by alpha, then combine the real/imag accumulators into complex packets acc0/acc1.
1986 bscalec<Packet,4>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
1987 bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
// A float scalar with exactly one leftover row is accumulated through the scalar accessor, once
// per column index j (the loop over j is on lines not shown).
1989 if ((
sizeof(
Scalar) ==
sizeof(
float)) && (remaining_rows == 1))
1992 res(row + 0, col +
j) += pfirst<Packetc>(acc0.packet[
j]);
// NOTE(review): presumably the else branch - per column j, add the loaded result packet to
// acc0.packet[j] and store the sum through the temporary block acc2.
1997 acc2.
packet[0] = res.template loadPacket<Packetc>(row + 0, col +
j) + acc0.packet[
j];
1998 res.template storePacketBlock<Packetc,1>(row + 0, col +
j, acc2);
// NOTE(review): in this listing the MICRO_COMPLEX_* macro family is fused onto the next four
// physical lines, with the original line numbers embedded after each backslash and some guard
// lines (e.g. the `if(!LhsIsReal)` / `else` lines) missing. Summary of what the visible macro text does:
//   MICRO_COMPLEX_UNROLL(func)            - expands func for iter = 0..4.
//   MICRO_COMPLEX_UNROLL_WORK             - runs func2 for iter 0..4, then func(iter, peel) for iter 0..4.
//   MICRO_COMPLEX_LOAD_ONE(iter)          - when unroll_factor > iter, loads lhsV/lhsVi with ploadLhs
//                                           and advances the matching lhs pointers by accCols.
//   MICRO_COMPLEX_WORK_ONE4 / _ONE1       - when unroll_factor > iter, call pgerc_common<4, ...> / <1, ...>.
//   MICRO_COMPLEX_TYPE_PEEL4 / _PEEL1     - when PEEL_COMPLEX > peel, broadcast the rhs values for that
//                                           peel step (pbroadcast4_old / pset1) and run the unrolled work.
//   MICRO_COMPLEX_UNROLL_TYPE_PEEL / _ONE - declare the rhsV*/rhsVi* arrays and invoke func for
//                                           peel 0..9 / for peel 0 only.
//   MICRO_COMPLEX_ONE_PEEL4 / ONE4 / ONE_PEEL1 / ONE1 - one (peeled) depth step followed by the rhs
//                                           pointer advance (accRows or remaining_cols per step).
//   MICRO_COMPLEX_DST_PTR / SRC_PTR / PREFETCH - zero the accumulators, compute the lhs pointers,
//                                           prefetch the lhs pointers, per active iter.
//   MICRO_COMPLEX_STORE / COL_STORE       - bload the result tile, bscalec<Packet,4> / <Packet,1>,
//                                           bcouple, then two storePacketBlock calls per active iter.
// The function template that starts at the trailing `template<` is presumably
// gemm_complex_unrolled_iteration (gemm_complex calls a function of that name with this template
// argument list); its name line is not shown - confirm.
2007 #define MICRO_COMPLEX_UNROLL(func) \ 2008 func(0) func(1) func(2) func(3) func(4) 2010 #define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ 2011 MICRO_COMPLEX_UNROLL(func2); \ 2012 func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel) 2014 #define MICRO_COMPLEX_LOAD_ONE(iter) \ 2015 if (unroll_factor > iter) { \ 2016 lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \ 2017 lhs_ptr_real##iter += accCols; \ 2019 lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \ 2020 lhs_ptr_imag##iter += accCols; \ 2022 EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ 2025 EIGEN_UNUSED_VARIABLE(lhsV##iter); \ 2026 EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ 2029 #define MICRO_COMPLEX_WORK_ONE4(iter, peel) \ 2030 if (unroll_factor > iter) { \ 2031 pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ 2034 #define MICRO_COMPLEX_WORK_ONE1(iter, peel) \ 2035 if (unroll_factor > iter) { \ 2036 pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ 2039 #define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \ 2040 if (PEEL_COMPLEX > peel) { \ 2041 Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ 2042 Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ 2043 pbroadcast4_old<Packet>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ 2045 pbroadcast4_old<Packet>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ 2047 EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ 2049 MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ 2051 EIGEN_UNUSED_VARIABLE(rhsV##peel); \ 2052 EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ 2055 #define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \ 2056 if (PEEL_COMPLEX > peel) { \ 2057 Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ 2058 Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ 2059 
rhsV##peel[0] = pset1<Packet>(rhs_ptr_real[remaining_cols * peel]); \ 2061 rhsVi##peel[0] = pset1<Packet>(rhs_ptr_imag[remaining_cols * peel]); \ 2063 EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ 2065 MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ 2067 EIGEN_UNUSED_VARIABLE(rhsV##peel); \ 2068 EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ 2071 #define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \ 2072 Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ 2073 Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \ 2074 func(func1,func2,0); func(func1,func2,1); \ 2075 func(func1,func2,2); func(func1,func2,3); \ 2076 func(func1,func2,4); func(func1,func2,5); \ 2077 func(func1,func2,6); func(func1,func2,7); \ 2078 func(func1,func2,8); func(func1,func2,9); 2080 #define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \ 2081 Packet rhsV0[M], rhsVi0[M];\ 2082 func(func1,func2,0); 2084 #define MICRO_COMPLEX_ONE_PEEL4 \ 2085 MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \ 2086 rhs_ptr_real += (accRows * PEEL_COMPLEX); \ 2087 if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX); 2089 #define MICRO_COMPLEX_ONE4 \ 2090 MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \ 2091 rhs_ptr_real += accRows; \ 2092 if(!RhsIsReal) rhs_ptr_imag += accRows; 2094 #define MICRO_COMPLEX_ONE_PEEL1 \ 2095 MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ 2096 rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \ 2097 if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX); 2099 #define MICRO_COMPLEX_ONE1 \ 2100 MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ 2101 rhs_ptr_real += remaining_cols; \ 2102 if(!RhsIsReal) 
rhs_ptr_imag += remaining_cols; 2104 #define MICRO_COMPLEX_DST_PTR_ONE(iter) \ 2105 if (unroll_factor > iter) { \ 2106 bsetzero<Scalar, Packet>(accReal##iter); \ 2107 bsetzero<Scalar, Packet>(accImag##iter); \ 2109 EIGEN_UNUSED_VARIABLE(accReal##iter); \ 2110 EIGEN_UNUSED_VARIABLE(accImag##iter); \ 2113 #define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE) 2115 #define MICRO_COMPLEX_SRC_PTR_ONE(iter) \ 2116 if (unroll_factor > iter) { \ 2117 lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ 2119 lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ 2121 EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ 2124 EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ 2125 EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ 2128 #define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE) 2130 #define MICRO_COMPLEX_PREFETCH_ONE(iter) \ 2131 if (unroll_factor > iter) { \ 2132 EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ 2134 EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ 2138 #define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE) 2140 #define MICRO_COMPLEX_STORE_ONE(iter) \ 2141 if (unroll_factor > iter) { \ 2142 bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \ 2143 bscalec<Packet,4>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ 2144 bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \ 2145 res.template storePacketBlock<Packetc,4>(row + iter*accCols + 0, col, acc0); \ 2146 res.template storePacketBlock<Packetc,4>(row + iter*accCols + accColsC, col, acc1); \ 2149 #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE) 2151 #define MICRO_COMPLEX_COL_STORE_ONE(iter) \ 2152 if (unroll_factor > iter) { \ 2153 bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \ 2154 bscalec<Packet,1>(accReal##iter, accImag##iter, 
pAlphaReal, pAlphaImag, taccReal, taccImag); \ 2155 bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \ 2156 res.template storePacketBlock<Packetc,1>(row + iter*accCols + 0, col, acc0); \ 2157 res.template storePacketBlock<Packetc,1>(row + iter*accCols + accColsC, col, acc1); \ 2160 #define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE) 2162 template<
int unroll_factor,
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
2164 const DataMapper&
res,
2173 const Packet& pAlphaReal,
2174 const Packet& pAlphaImag)
2176 const Scalar* rhs_ptr_real = rhs_base;
2177 const Scalar* rhs_ptr_imag;
// NOTE(review): the guard for this assignment is on a line not shown (presumably `if(!RhsIsReal)`).
2179 rhs_ptr_imag = rhs_base + accRows*strideB;
// The lhs pointers for unrolled iterations 0..3 start out as NULL; the code that assigns them is
// on lines not shown.
2183 const Scalar* lhs_ptr_real0 =
NULL, * lhs_ptr_imag0 =
NULL, * lhs_ptr_real1 =
NULL, * lhs_ptr_imag1 =
NULL;
2184 const Scalar* lhs_ptr_real2 =
NULL, * lhs_ptr_imag2 =
NULL, * lhs_ptr_real3 =
NULL, * lhs_ptr_imag3 =
NULL;
// Loop up to depth; its body is on lines not shown.
2206 for(; k <
depth; k++)
// row is advanced past the unroll_factor tiles (of accCols rows each) handled by this call.
2212 row += unroll_factor*accCols;
// NOTE(review): the line naming this function template is missing from this listing; it is
// presumably gemm_complex_unrolled_col_iteration (a function of that name is called in this file
// with this template argument list) - confirm. Most of the body (accumulator setup, the per-step
// work and the store) is on lines not shown.
//
// Visible behaviour: sets up the rhs pointers for a leftover-column step (imaginary part at
// remaining_cols*strideB past the real part), declares the lhs pointers for unrolled iterations
// 0..3, loops k up to depth, and finally advances row by unroll_factor*accCols.
2215 template<
int unroll_factor,
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
2217 const DataMapper&
res,
2226 Index remaining_cols,
2227 const Packet& pAlphaReal,
2228 const Packet& pAlphaImag)
2230 const Scalar* rhs_ptr_real = rhs_base;
2231 const Scalar* rhs_ptr_imag;
// NOTE(review): the guard for this assignment is on a line not shown (presumably `if(!RhsIsReal)`).
2233 rhs_ptr_imag = rhs_base + remaining_cols*strideB;
// The lhs pointers start out as NULL; the code that assigns them is on lines not shown.
2237 const Scalar* lhs_ptr_real0 =
NULL, * lhs_ptr_imag0 =
NULL, * lhs_ptr_real1 =
NULL, * lhs_ptr_imag1 =
NULL;
2238 const Scalar* lhs_ptr_real2 =
NULL, * lhs_ptr_imag2 =
NULL, * lhs_ptr_real3 =
NULL, * lhs_ptr_imag3 =
NULL;
// Loop up to depth; its body is on lines not shown.
2260 for(; k <
depth; k++)
// row is advanced past the unroll_factor tiles (of accCols rows each) handled by this call.
2266 row += unroll_factor*accCols;
// NOTE(review): the line naming this function template is missing from this listing; it is
// presumably gemm_complex_unrolled_col (gemm_complex calls a function of that name with this
// template argument list) - confirm. The loop header around the first call and the `case`, `break`
// and `#endif` lines of the switch are not shown.
//
// Visible behaviour: for one leftover column, first calls gemm_complex_unrolled_col_iteration with
// the maximum unroll factor (MAX_COMPLEX_UNROLL, defined as 3 here), then switches on the number of
// full accCols-row tiles still left, (rows-row)/accCols, to call it once more with a smaller unroll
// factor. With MAX_COMPLEX_UNROLL == 3 the `#if MAX_COMPLEX_UNROLL > 4` and `> 3` sections are
// compiled out, so only the unroll-factor 2 and 1 calls remain.
2269 template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
2271 const DataMapper&
res,
2281 Index remaining_cols,
2282 const Packet& pAlphaReal,
2283 const Packet& pAlphaImag)
2285 #define MAX_COMPLEX_UNROLL 3 2287 gemm_complex_unrolled_col_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, remaining_cols, pAlphaReal, pAlphaImag);
// Dispatch on the number of full tiles that remain.
2289 switch( (rows-row)/accCols ) {
2290 #if MAX_COMPLEX_UNROLL > 4 2292 gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, remaining_cols, pAlphaReal, pAlphaImag);
2295 #if MAX_COMPLEX_UNROLL > 3 2297 gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, remaining_cols, pAlphaReal, pAlphaImag);
2300 #if MAX_COMPLEX_UNROLL > 2 2302 gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, remaining_cols, pAlphaReal, pAlphaImag);
2305 #if MAX_COMPLEX_UNROLL > 1 2307 gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, remaining_cols, pAlphaReal, pAlphaImag);
// gemm_complex: kernel driver for products where at least one operand is complex. It accumulates
// alpha * (packed lhs block) * (packed rhs block) into res.
//
// NOTE(review): in this listing the `#undef MAX_COMPLEX_UNROLL` that opens the next line belongs to
// the preceding function; gemm_complex starts at the `template<` after it. Braces, the `case`,
// `break` and `#endif` lines, and the declarations of blockA, blockB, row, col and rhs_base are on
// lines not shown (blockA/blockB are presumably Scalar views of blockAc/blockBc) - confirm.
//
// Parameters visible in the signature:
//   res                - destination mapper the result is accumulated into.
//   blockAc, blockBc   - packed lhs / rhs blocks.
//   rows, depth, cols  - problem dimensions.
//   alpha              - complex scale factor; its real and imaginary parts are broadcast separately.
//   strideA, strideB   - packed-block strides; -1 means "use depth".
//   offsetA, offsetB   - offsets into the packed blocks.
//
// Visible flow:
//   1. Column panels of accRows columns: call gemm_complex_unrolled_iteration with
//      MAX_COMPLEX_UNROLL (3), switch on (rows-row)/accCols for the smaller unroll factors
//      (only 2 and 1 survive the `#if` guards when the maximum is 3), then hand leftover rows to
//      gemm_complex_extra_row.
//   2. Leftover columns (cols % accRows), one column at a time: gemm_complex_unrolled_col for the
//      full row tiles and gemm_complex_extra_col for leftover rows.
2313 #undef MAX_COMPLEX_UNROLL 2316 template<
typename LhsScalar,
typename RhsScalar,
typename Scalarc,
typename Scalar,
typename Index,
typename Packet,
typename Packetc,
typename RhsPacket,
typename DataMapper, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
2317 EIGEN_STRONG_INLINE void gemm_complex(
const DataMapper&
res,
const LhsScalar* blockAc,
const RhsScalar* blockBc,
Index rows,
Index depth,
Index cols, Scalarc
alpha,
Index strideA,
Index strideB,
Index offsetA,
Index offsetB)
// Rows / columns that do not fill a complete accCols x accRows tile.
2319 const Index remaining_rows = rows % accCols;
2320 const Index remaining_cols = cols % accRows;
// A stride of -1 is replaced by depth.
2322 if( strideA == -1 ) strideA =
depth;
2323 if( strideB == -1 ) strideB =
depth;
// Broadcast the two parts of alpha, and build the mask passed to gemm_complex_extra_row from
// the leftover row count.
2325 const Packet pAlphaReal = pset1<Packet>(alpha.real());
2326 const Packet pAlphaImag = pset1<Packet>(alpha.imag());
2327 const Packet pMask = bmask<Packet>((
const int)(remaining_rows));
// Full column panels, accRows columns per iteration.
2333 for(; col + accRows <=
cols; col += accRows)
2336 const Scalar* lhs_base = blockA;
2339 #define MAX_COMPLEX_UNROLL 3 2341 gemm_complex_unrolled_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, pAlphaReal, pAlphaImag);
// Dispatch on the number of full row tiles that remain.
2343 switch( (rows-row)/accCols ) {
2344 #if MAX_COMPLEX_UNROLL > 4 2346 gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, pAlphaReal, pAlphaImag);
2349 #if MAX_COMPLEX_UNROLL > 3 2351 gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, pAlphaReal, pAlphaImag);
2354 #if MAX_COMPLEX_UNROLL > 2 2356 gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, pAlphaReal, pAlphaImag);
2359 #if MAX_COMPLEX_UNROLL > 1 2361 gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, pAlphaReal, pAlphaImag);
// Leftover rows of this column panel.
2367 #undef MAX_COMPLEX_UNROLL 2369 if(remaining_rows > 0)
2371 gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col,
rows,
cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
// Leftover columns, processed one column per iteration.
2375 if(remaining_cols > 0)
2377 const Scalar* rhs_base = blockB +
advanceCols*col*strideB + remaining_cols*offsetB;
2378 const Scalar* lhs_base = blockA;
2380 for(; col <
cols; col++)
2384 gemm_complex_unrolled_col<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
rows,
col, remaining_cols, pAlphaReal, pAlphaImag);
2386 if (remaining_rows > 0)
2388 gemm_complex_extra_col<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(
res, lhs_base, rhs_base,
depth, strideA, offsetA, strideB,
row,
col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag);
// NOTE(review): the lines naming this specialization are missing from this listing. Visible: two
// template headers (Index, DataMapper, Pack1, Pack2, Packet, Conjugate, PanelMode) and one body
// statement forwarding (blockA, lhs, depth, rows, stride, offset) to a callable named pack.
// Presumably a gemm_pack_lhs struct specialization followed by its out-of-line operator();
// the scalar type and storage order cannot be read from here - confirm against the full file.
2402 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2408 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2413 pack(blockA, lhs, depth, rows, stride, offset);
// NOTE(review): the lines naming this specialization are missing from this listing. Visible: two
// template headers (Index, DataMapper, Pack1, Pack2, Packet, Conjugate, PanelMode) and one body
// statement forwarding (blockA, lhs, depth, rows, stride, offset) to a callable named pack.
// Presumably a second gemm_pack_lhs struct specialization followed by its out-of-line operator();
// the scalar type and storage order cannot be read from here - confirm against the full file.
2416 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2422 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2427 pack(blockA, lhs, depth, rows, stride, offset);
// Compiled only when EIGEN_ALTIVEC_USE_CUSTOM_PACK is non-zero (the matching #endif is on a line
// not shown).
// NOTE(review): the lines naming this specialization are missing from this listing. Visible: two
// template headers (Index, DataMapper, nr, Conjugate, PanelMode) and one body statement forwarding
// (blockB, rhs, depth, cols, stride, offset) to a callable named pack. Presumably a gemm_pack_rhs
// struct specialization followed by its out-of-line operator(); the scalar type and storage order
// cannot be read from here - confirm against the full file.
2430 #if EIGEN_ALTIVEC_USE_CUSTOM_PACK 2431 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2437 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2442 pack(blockB, rhs, depth, cols, stride, offset);
// NOTE(review): the lines naming this specialization are missing from this listing. Visible: two
// template headers (Index, DataMapper, nr, Conjugate, PanelMode) and one body statement forwarding
// (blockB, rhs, depth, cols, stride, offset) to a callable named pack. Presumably a second
// gemm_pack_rhs struct specialization followed by its out-of-line operator(); the scalar type and
// storage order cannot be read from here - confirm against the full file.
2445 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2451 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2456 pack(blockB, rhs, depth, cols, stride, offset);
// NOTE(review): the lines naming this specialization are missing from this listing. Visible: two
// template headers (Index, DataMapper, Pack1, Pack2, Packet, Conjugate, PanelMode) and one body
// statement forwarding (blockA, lhs, depth, rows, stride, offset) to a callable named pack.
// Presumably another gemm_pack_lhs struct specialization followed by its out-of-line operator();
// the scalar type and storage order cannot be read from here - confirm against the full file.
2460 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2466 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2471 pack(blockA, lhs, depth, rows, stride, offset);
// NOTE(review): the lines naming this specialization are missing from this listing. Visible: two
// template headers (Index, DataMapper, Pack1, Pack2, Packet, Conjugate, PanelMode) and one body
// statement forwarding (blockA, lhs, depth, rows, stride, offset) to a callable named pack.
// Presumably another gemm_pack_lhs struct specialization followed by its out-of-line operator();
// the scalar type and storage order cannot be read from here - confirm against the full file.
2474 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2480 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2485 pack(blockA, lhs, depth, rows, stride, offset);
// Lhs packing operator() for std::complex<float> data: the visible definition takes a
// std::complex<float>* destination (blockA), the lhs mapper, depth, rows, stride and offset, and
// forwards all of them unchanged to a callable named pack.
// NOTE(review): the struct line, the class-qualifier line before `::operator()` and the declaration
// of pack are not shown; presumably a gemm_pack_lhs<std::complex<float>, ...> specialization whose
// storage order cannot be read from here - confirm against the full file.
2488 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2494 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2496 ::operator()(std::complex<float>* blockA,
const DataMapper& lhs, Index
depth, Index
rows, Index stride, Index
offset)
2499 pack(blockA, lhs, depth, rows, stride,
offset);
// gemm_pack_lhs specialization for std::complex<float> with ColMajor storage (both are visible in
// the out-of-line definition below).
// First fragment: the in-class declaration of operator(), where stride and offset default to 0.
// Second fragment: the out-of-line definition, which forwards (blockA, lhs, depth, rows, stride,
// offset) unchanged to a callable named pack.
// NOTE(review): the struct line and the declaration of pack are on lines not shown.
2502 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2505 void operator()(std::complex<float>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index
offset=0);
2508 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2509 void gemm_pack_lhs<std::complex<float>,
Index, DataMapper, Pack1, Pack2,
Packet,
ColMajor, Conjugate, PanelMode>
2510 ::operator()(std::complex<float>* blockA,
const DataMapper& lhs, Index
depth, Index
rows, Index stride, Index
offset)
2513 pack(blockA, lhs, depth, rows, stride,
offset);
// Compiled only when EIGEN_ALTIVEC_USE_CUSTOM_PACK is non-zero (the matching #endif is on a line
// not shown).
// Rhs packing operator() for float data: the visible declaration takes a float* destination
// (blockB), the rhs mapper, depth and cols, with stride and offset defaulting to 0. The visible
// body statement forwards (blockB, rhs, depth, cols, stride, offset) to a callable named pack.
// NOTE(review): the struct line, the out-of-line signature and the declaration of pack are not
// shown; presumably a gemm_pack_rhs<float, ...> specialization whose storage order cannot be read
// from here - confirm against the full file.
2516 #if EIGEN_ALTIVEC_USE_CUSTOM_PACK 2517 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2520 void operator()(
float* blockB,
const DataMapper& rhs, Index depth, Index
cols, Index stride=0, Index
offset=0);
2523 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2528 pack(blockB, rhs, depth, cols, stride, offset);
// Rhs packing operator() for float data: the visible declaration takes a float* destination
// (blockB), the rhs mapper, depth and cols, with stride and offset defaulting to 0. The visible
// body statement forwards (blockB, rhs, depth, cols, stride, offset) to a callable named pack.
// NOTE(review): the struct line, the out-of-line signature and the declaration of pack are not
// shown; presumably a second gemm_pack_rhs<float, ...> specialization whose storage order cannot
// be read from here - confirm against the full file.
2531 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2534 void operator()(
float* blockB,
const DataMapper& rhs, Index depth, Index
cols, Index stride=0, Index
offset=0);
2537 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2542 pack(blockB, rhs, depth, cols, stride, offset);
// Rhs packing operator() for std::complex<float> data.
// First fragment: the in-class declaration, where stride and offset default to 0.
// Second fragment: the out-of-line definition, which forwards (blockB, rhs, depth, cols, stride,
// offset) unchanged to a callable named pack.
// NOTE(review): the struct line, the class-qualifier line before `::operator()` and the declaration
// of pack are not shown; presumably a gemm_pack_rhs<std::complex<float>, ...> specialization whose
// storage order cannot be read from here - confirm against the full file.
2546 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2549 void operator()(std::complex<float>* blockB,
const DataMapper& rhs, Index depth, Index
cols, Index stride=0, Index
offset=0);
2552 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2554 ::operator()(std::complex<float>* blockB,
const DataMapper& rhs, Index
depth, Index
cols, Index stride, Index
offset)
2557 pack(blockB, rhs, depth, cols, stride,
offset);
// gemm_pack_rhs specialization for std::complex<float> with RowMajor storage (both are visible in
// the out-of-line definition below).
// First fragment: the in-class declaration of operator(), where stride and offset default to 0.
// Second fragment: the out-of-line definition, which forwards (blockB, rhs, depth, cols, stride,
// offset) unchanged to a callable named pack.
// NOTE(review): the struct line and the declaration of pack are on lines not shown.
2560 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2563 void operator()(std::complex<float>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index
offset=0);
2566 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2567 void gemm_pack_rhs<std::complex<float>,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2568 ::operator()(std::complex<float>* blockB,
const DataMapper& rhs, Index
depth, Index
cols, Index stride, Index
offset)
2571 pack(blockB, rhs, depth, cols, stride,
offset);
// Lhs packing operator() for std::complex<double> data.
// First fragment: the in-class declaration, where stride and offset default to 0.
// Second fragment: the out-of-line definition, which forwards (blockA, lhs, depth, rows, stride,
// offset) unchanged to a callable named pack.
// NOTE(review): the struct line, the class-qualifier line before `::operator()` and the declaration
// of pack are not shown; presumably a gemm_pack_lhs<std::complex<double>, ...> specialization whose
// storage order cannot be read from here - confirm against the full file.
2574 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2577 void operator()(std::complex<double>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index
offset=0);
2580 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2582 ::operator()(std::complex<double>* blockA,
const DataMapper& lhs, Index
depth, Index
rows, Index stride, Index
offset)
2585 pack(blockA, lhs, depth, rows, stride,
offset);
// gemm_pack_lhs specialization for std::complex<double> with ColMajor storage (both are visible in
// the out-of-line definition below).
// First fragment: the in-class declaration of operator(), where stride and offset default to 0.
// Second fragment: the out-of-line definition, which forwards (blockA, lhs, depth, rows, stride,
// offset) unchanged to a callable named pack.
// NOTE(review): the struct line and the declaration of pack are on lines not shown.
2588 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2591 void operator()(std::complex<double>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index
offset=0);
2594 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2595 void gemm_pack_lhs<std::complex<double>,
Index, DataMapper, Pack1, Pack2,
Packet,
ColMajor, Conjugate, PanelMode>
2596 ::operator()(std::complex<double>* blockA,
const DataMapper& lhs, Index
depth, Index
rows, Index stride, Index
offset)
2599 pack(blockA, lhs, depth, rows, stride,
offset);
// Rhs packing operator() for std::complex<double> data.
// First fragment: the in-class declaration, where stride and offset default to 0.
// Second fragment: the out-of-line definition, which forwards (blockB, rhs, depth, cols, stride,
// offset) unchanged to a callable named pack.
// NOTE(review): the struct line, the class-qualifier line before `::operator()` and the declaration
// of pack are not shown; presumably a gemm_pack_rhs<std::complex<double>, ...> specialization whose
// storage order cannot be read from here - confirm against the full file.
2602 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2605 void operator()(std::complex<double>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index
offset=0);
2608 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2610 ::operator()(std::complex<double>* blockB,
const DataMapper& rhs, Index
depth, Index
cols, Index stride, Index
offset)
2613 pack(blockB, rhs, depth, cols, stride,
offset);
// gemm_pack_rhs specialization for std::complex<double> with RowMajor storage (both are visible in
// the out-of-line definition below).
// First fragment: the in-class declaration of operator(), where stride and offset default to 0.
// Second fragment: the out-of-line definition, which forwards (blockB, rhs, depth, cols, stride,
// offset) unchanged to a callable named pack.
// NOTE(review): the struct line and the declaration of pack are on lines not shown.
2616 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2619 void operator()(std::complex<double>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index
offset=0);
2622 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2623 void gemm_pack_rhs<std::complex<double>,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2624 ::operator()(std::complex<double>* blockB,
const DataMapper& rhs, Index
depth, Index
cols, Index stride, Index
offset)
2627 pack(blockB, rhs, depth, cols, stride,
offset);
// Kernel entry point for float lhs x float rhs blocks (blockA and blockB are both `const float*`,
// alpha is a float).
// First fragment: the in-class declaration of operator(); strideA/strideB default to -1 and
// offsetA/offsetB default to 0.
// Second fragment: the out-of-line definition. It selects an implementation through the function
// pointer gemm_function and then calls it with all arguments unchanged:
//   - EIGEN_ALTIVEC_MMA_ONLY defined: always gemmMMA.
//   - otherwise, ALTIVEC_MMA_SUPPORT defined and EIGEN_ALTIVEC_DISABLE_MMA not defined: gemmMMA when
//     __builtin_cpu_supports reports both "arch_3_1" and "mma" at run time; the gemm assignment that
//     follows is presumably the else branch.
//   - the last gemm assignment is presumably the #else (no MMA support compiled in) case.
// NOTE(review): the struct line, the first lines of the out-of-line signature, and the `else`,
// `#else` and `#endif` lines are not shown; presumably gebp_kernel<float, float, ...> - confirm.
2631 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2637 void operator()(
const DataMapper&
res,
const float* blockA,
const float* blockB,
2638 Index rows, Index depth, Index cols,
float alpha,
2639 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
2642 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2645 Index rows, Index depth, Index cols,
float alpha,
2646 Index strideA, Index strideB, Index offsetA, Index offsetB)
// Pointer to whichever implementation is selected below; its parameter list mirrors operator().
2650 void (*gemm_function)(
const DataMapper&,
const float*,
const float*,
Index,
Index,
Index,
float,
Index,
Index,
Index,
Index);
2652 #ifdef EIGEN_ALTIVEC_MMA_ONLY 2654 gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2655 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) 2656 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2657 gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2660 gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2663 gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
// Run the selected implementation.
2665 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// Kernel entry point for std::complex<float> lhs x std::complex<float> rhs blocks (alpha is a
// std::complex<float>).
// First fragment: the in-class declaration of operator(); strideA/strideB default to -1 and
// offsetA/offsetB default to 0.
// Second fragment: the out-of-line definition. It selects an implementation through the function
// pointer gemm_function and then calls it with all arguments unchanged. Every candidate is
// instantiated with the last two template arguments (LhsIsReal, RhsIsReal in gemm_complex) set to
// false, false:
//   - EIGEN_ALTIVEC_MMA_ONLY defined: always gemm_complexMMA.
//   - otherwise, ALTIVEC_MMA_SUPPORT defined and EIGEN_ALTIVEC_DISABLE_MMA not defined:
//     gemm_complexMMA when __builtin_cpu_supports reports both "arch_3_1" and "mma" at run time; the
//     gemm_complex assignment that follows is presumably the else branch.
//   - the last gemm_complex assignment is presumably the #else (no MMA support compiled in) case.
// NOTE(review): the struct line, the class-qualifier line, part of the signature, the tail of the
// function-pointer type, and the `else`, `#else` and `#endif` lines are not shown; presumably
// gebp_kernel<std::complex<float>, std::complex<float>, ...> - confirm.
2668 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2675 void operator()(
const DataMapper&
res,
const std::complex<float>* blockA,
const std::complex<float>* blockB,
2676 Index rows, Index depth, Index cols, std::complex<float>
alpha,
2677 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
2680 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2682 ::operator()(
const DataMapper&
res,
const std::complex<float>* blockA,
const std::complex<float>* blockB,
2684 Index strideA, Index strideB, Index offsetA, Index offsetB)
// Pointer to whichever implementation is selected below.
2688 void (*gemm_function)(
const DataMapper&,
const std::complex<float>*,
const std::complex<float>*,
2691 #ifdef EIGEN_ALTIVEC_MMA_ONLY 2693 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>,
float,
Index,
Packet,
Packetc,
RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2694 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) 2695 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2696 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>,
float,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2699 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>,
float,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2702 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>,
float,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
// Run the selected implementation.
2704 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// Kernel entry point for float lhs x std::complex<float> rhs blocks (alpha is a
// std::complex<float>).
// First fragment: the in-class declaration of operator(); strideA/strideB default to -1 and
// offsetA/offsetB default to 0.
// Second fragment: the out-of-line definition. It selects an implementation through the function
// pointer gemm_function and then calls it with all arguments unchanged. Every candidate is
// instantiated with the last two template arguments (LhsIsReal, RhsIsReal in gemm_complex) set to
// true, false:
//   - EIGEN_ALTIVEC_MMA_ONLY defined: always gemm_complexMMA.
//   - otherwise, ALTIVEC_MMA_SUPPORT defined and EIGEN_ALTIVEC_DISABLE_MMA not defined:
//     gemm_complexMMA when __builtin_cpu_supports reports both "arch_3_1" and "mma" at run time; the
//     gemm_complex assignment that follows is presumably the else branch.
//   - the last gemm_complex assignment is presumably the #else (no MMA support compiled in) case.
// NOTE(review): the struct line, the class-qualifier line, part of the signature, the tail of the
// function-pointer type, and the `else`, `#else` and `#endif` lines are not shown; presumably
// gebp_kernel<float, std::complex<float>, ...> - confirm.
2707 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2714 void operator()(
const DataMapper& res,
const float* blockA,
const std::complex<float>* blockB,
2715 Index rows, Index depth, Index cols, std::complex<float> alpha,
2716 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
2719 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2721 ::operator()(
const DataMapper&
res,
const float* blockA,
const std::complex<float>* blockB,
2723 Index strideA, Index strideB, Index offsetA, Index offsetB)
// Pointer to whichever implementation is selected below.
2727 void (*gemm_function)(
const DataMapper&,
const float*,
const std::complex<float>*,
2729 #ifdef EIGEN_ALTIVEC_MMA_ONLY 2731 gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>,
float,
Index,
Packet,
Packetc,
RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2732 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) 2733 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2734 gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>,
float,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2737 gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>,
float,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2740 gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>,
float,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
// Run the selected implementation.
2742 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// Template header followed by an operator() declaration that takes a complex
// float blockA and a real float blockB, with a complex float alpha.
// strideA and strideB default to -1; offsetA and offsetB default to 0.
// NOTE(review): listing lines 2746-2751 (presumably the struct header and its
// typedefs) are not present in this listing.
2745 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2752 void operator()(
const DataMapper& res,
const std::complex<float>* blockA,
const float* blockB,
2753 Index rows, Index depth, Index cols, std::complex<float> alpha,
2754 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
// Out-of-line definition of operator() for
// gebp_kernel<std::complex<float>, float, ...>: complex float blockA, real
// float blockB. It picks a GEMM implementation, stores its address in a local
// function pointer, and then calls through it with every argument forwarded.
// NOTE(review): the first parameter line (listing line 2760), the braces and
// the else / #else / #endif lines are not present in this listing.
2757 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2758 void gebp_kernel<std::complex<float>,
float,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2759 ::operator()(
const DataMapper&
res,
const std::complex<float>* blockA,
const float* blockB,
2761 Index strideA, Index strideB, Index offsetA, Index offsetB)
// Local pointer to the selected kernel; the tail of its parameter list is
// missing from this listing.
2765 void (*gemm_function)(
const DataMapper&,
const std::complex<float>*,
const float*,
// EIGEN_ALTIVEC_MMA_ONLY defined: gemm_complexMMA is assigned with no run-time
// test. The trailing template arguments are false, true here.
// NOTE(review): presumably these flag the LHS as complex and the RHS as real
// - confirm against the gemm_complex template parameter list.
2767 #ifdef EIGEN_ALTIVEC_MMA_ONLY 2769 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>,
float, std::complex<float>,
float,
Index,
Packet,
Packetc,
RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
// MMA compiled in and not disabled: gemm_complexMMA is assigned only when
// __builtin_cpu_supports reports both arch_3_1 and mma.
// NOTE(review): the test appears to sit in the operator body, so it would be
// evaluated on each call - confirm.
2770 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) 2771 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2772 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>,
float, std::complex<float>,
float,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
// gemm_complex (non-MMA) assignment; presumably the run-time else branch -
// the separating lines are not in this listing.
2775 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>,
float, std::complex<float>,
float,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
// Second gemm_complex assignment; presumably the preprocessor #else branch.
2778 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>,
float, std::complex<float>,
float,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
// Invoke the selected kernel with all arguments passed through unchanged.
2780 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// Partial specialization of gebp_kernel for double LHS and double RHS.
// Declares operator() taking double blockA, double blockB and a double alpha.
// strideA and strideB default to -1; offsetA and offsetB default to 0.
// NOTE(review): listing lines 2785-2788 (presumably the opening brace and
// typedefs) and the closing of the struct are not present in this listing.
2783 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2784 struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2789 void operator()(
const DataMapper& res,
const double* blockA,
const double* blockB,
2790 Index rows, Index depth, Index cols,
double alpha,
2791 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
// Out-of-line operator() taking double blockA, double blockB and a double
// alpha. It picks either gemmMMA or gemm, stores the address in a local
// function pointer, and then calls through it with every argument forwarded.
// NOTE(review): the qualifying class name (listing line 2795), the braces and
// the else / #else / #endif lines are not present in this listing.
2794 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2796 ::operator()(
const DataMapper& res,
const double* blockA,
const double* blockB,
2797 Index rows, Index depth, Index cols,
double alpha,
2798 Index strideA, Index strideB, Index offsetA, Index offsetB)
// Local pointer to the selected kernel; its parameter types mirror the
// operator parameters in order.
2802 void (*gemm_function)(
const DataMapper&,
const double*,
const double*,
Index,
Index,
Index, double,
Index,
Index,
Index,
Index);
// EIGEN_ALTIVEC_MMA_ONLY defined: gemmMMA is assigned with no run-time test.
2804 #ifdef EIGEN_ALTIVEC_MMA_ONLY 2806 gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
// MMA compiled in and not disabled: gemmMMA is assigned only when
// __builtin_cpu_supports reports both arch_3_1 and mma.
// NOTE(review): the test appears to sit in the operator body, so it would be
// evaluated on each call - confirm.
2807 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) 2808 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2809 gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
// gemm (non-MMA) assignment; presumably the run-time else branch - the
// separating lines are not in this listing.
2812 gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
// Second gemm assignment; presumably the preprocessor #else branch.
2815 gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
// Invoke the selected kernel with all arguments passed through unchanged.
2817 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// Template header followed by an operator() declaration that takes complex
// double blockA and complex double blockB, with a complex double alpha.
// strideA and strideB default to -1; offsetA and offsetB default to 0.
// NOTE(review): listing lines 2821-2826 (presumably the struct header and its
// typedefs) are not present in this listing.
2820 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2827 void operator()(
const DataMapper& res,
const std::complex<double>* blockA,
const std::complex<double>* blockB,
2828 Index rows, Index depth, Index cols, std::complex<double> alpha,
2829 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
// Out-of-line operator() taking complex double blockA and complex double
// blockB. It picks a GEMM implementation, stores its address in a local
// function pointer, and then calls through it with every argument forwarded.
// NOTE(review): the qualifying class name (listing line 2833), the first
// parameter line (2835), the braces and the else / #else / #endif lines are
// not present in this listing.
2832 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2834 ::operator()(
const DataMapper&
res,
const std::complex<double>* blockA,
const std::complex<double>* blockB,
2836 Index strideA, Index strideB, Index offsetA, Index offsetB)
// Local pointer to the selected kernel; the tail of its parameter list is
// missing from this listing.
2840 void (*gemm_function)(
const DataMapper&,
const std::complex<double>*,
const std::complex<double>*,
// EIGEN_ALTIVEC_MMA_ONLY defined: gemm_complexMMA is assigned with no run-time
// test. The trailing template arguments are false, false here.
// NOTE(review): presumably these mean neither operand is real - confirm
// against the gemm_complex template parameter list.
2842 #ifdef EIGEN_ALTIVEC_MMA_ONLY 2844 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index,
Packet,
Packetc,
RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
// MMA compiled in and not disabled: gemm_complexMMA is assigned only when
// __builtin_cpu_supports reports both arch_3_1 and mma.
// NOTE(review): the test appears to sit in the operator body, so it would be
// evaluated on each call - confirm.
2845 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) 2846 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2847 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
// gemm_complex (non-MMA) assignment; presumably the run-time else branch -
// the separating lines are not in this listing.
2850 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
// Second gemm_complex assignment; presumably the preprocessor #else branch.
2853 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
// Invoke the selected kernel with all arguments passed through unchanged.
2855 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// Template header followed by an operator() declaration that takes complex
// double blockA and real double blockB, with a complex double alpha.
// strideA and strideB default to -1; offsetA and offsetB default to 0.
// NOTE(review): listing lines 2859-2864 (presumably the struct header and its
// typedefs) are not present in this listing.
2858 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2865 void operator()(
const DataMapper& res,
const std::complex<double>* blockA,
const double* blockB,
2866 Index rows, Index depth, Index cols, std::complex<double> alpha,
2867 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
// Out-of-line definition of operator() for
// gebp_kernel<std::complex<double>, double, ...>: complex double blockA, real
// double blockB. It picks a GEMM implementation, stores its address in a local
// function pointer, and then calls through it with every argument forwarded.
// NOTE(review): the first parameter line (listing line 2873), the braces and
// the else / #else / #endif lines are not present in this listing.
2870 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2871 void gebp_kernel<std::complex<double>, double,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2872 ::operator()(
const DataMapper&
res,
const std::complex<double>* blockA,
const double* blockB,
2874 Index strideA, Index strideB, Index offsetA, Index offsetB)
// Local pointer to the selected kernel; the tail of its parameter list is
// missing from this listing.
2878 void (*gemm_function)(
const DataMapper&,
const std::complex<double>*,
const double*,
// EIGEN_ALTIVEC_MMA_ONLY defined: gemm_complexMMA is assigned with no run-time
// test. The trailing template arguments are false, true here.
// NOTE(review): presumably these flag the LHS as complex and the RHS as real
// - confirm against the gemm_complex template parameter list.
2880 #ifdef EIGEN_ALTIVEC_MMA_ONLY 2882 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
Index,
Packet,
Packetc,
RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
// MMA compiled in and not disabled: gemm_complexMMA is assigned only when
// __builtin_cpu_supports reports both arch_3_1 and mma.
// NOTE(review): the test appears to sit in the operator body, so it would be
// evaluated on each call - confirm.
2883 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) 2884 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2885 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
// gemm_complex (non-MMA) assignment; presumably the run-time else branch -
// the separating lines are not in this listing.
2888 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
// Second gemm_complex assignment; presumably the preprocessor #else branch.
2891 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
// Invoke the selected kernel with all arguments passed through unchanged.
2893 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// Template header followed by an operator() declaration that takes real
// double blockA and complex double blockB, with a complex double alpha.
// strideA and strideB default to -1; offsetA and offsetB default to 0.
// NOTE(review): listing lines 2897-2902 (presumably the struct header and its
// typedefs) are not present in this listing.
2896 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2903 void operator()(
const DataMapper& res,
const double* blockA,
const std::complex<double>* blockB,
2904 Index rows, Index depth, Index cols, std::complex<double> alpha,
2905 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
// Out-of-line operator() taking real double blockA and complex double blockB.
// It picks a GEMM implementation, stores its address in a local function
// pointer, and then calls through it with every argument forwarded.
// NOTE(review): the qualifying class name (listing line 2909), the first
// parameter line (2911), the braces and the else / #else / #endif lines are
// not present in this listing.
2908 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2910 ::operator()(
const DataMapper&
res,
const double* blockA,
const std::complex<double>* blockB,
2912 Index strideA, Index strideB, Index offsetA, Index offsetB)
// Local pointer to the selected kernel; the tail of its parameter list is
// missing from this listing.
2916 void (*gemm_function)(
const DataMapper&,
const double*,
const std::complex<double>*,
// EIGEN_ALTIVEC_MMA_ONLY defined: gemm_complexMMA is assigned with no run-time
// test. The trailing template arguments are true, false here.
// NOTE(review): presumably these flag the LHS as real and the RHS as complex
// - confirm against the gemm_complex template parameter list.
2918 #ifdef EIGEN_ALTIVEC_MMA_ONLY 2920 gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
Index,
Packet,
Packetc,
RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
// MMA compiled in and not disabled: gemm_complexMMA is assigned only when
// __builtin_cpu_supports reports both arch_3_1 and mma.
// NOTE(review): the test appears to sit in the operator body, so it would be
// evaluated on each call - confirm.
2921 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) 2922 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2923 gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
// gemm_complex (non-MMA) assignment; presumably the run-time else branch -
// the separating lines are not in this listing.
2926 gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
// Second gemm_complex assignment; presumably the preprocessor #else branch.
2929 gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double,
Index,
Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
// Invoke the selected kernel with all arguments passed through unchanged.
2931 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2937 #endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
internal::packet_traits< Scalar >::type Packet
EIGEN_STRONG_INLINE void gemm_extra_col(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index row, Index col, Index remaining_rows, Index remaining_cols, const Packet &pAlpha)
static const Packet4i mask43
#define EIGEN_ALWAYS_INLINE
EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex< Scalar > *blockA, const std::complex< Scalar > *_lhs, Index lhsStride, Index cols, Index rows)
EIGEN_ALWAYS_INLINE std::complex< Scalar > getAdjointVal(Index i, Index j, const_blas_data_mapper< std::complex< Scalar >, Index, StorageOrder > &dt)
EIGEN_ALWAYS_INLINE void pger(PacketBlock< Packet, N > *acc, const Scalar *lhs, const Packet *rhsV)
void operator()(float *blockA, const float *_lhs, Index lhsStride, Index cols, Index rows)
#define EIGEN_STRONG_INLINE
#define MICRO_COMPLEX_ONE_PEEL4
void operator()(float *blockB, const float *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
EIGEN_DONT_INLINE void operator()(const DataMapper &res, const LhsScalar *blockA, const RhsScalar *blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0)
__vector unsigned char Packet16uc
quad_traits< double >::rhstype RhsPacket
PacketBlock< vectortype, 4 > type
m m block(1, 0, 2, 2)<< 4
PacketBlock< Packet2d, 2 > rhstype
static const Packet4i mask42
quad_traits< double >::rhstype RhsPacket
EIGEN_STRONG_INLINE void operator()(double *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
EIGEN_ALWAYS_INLINE void pger_common(PacketBlock< Packet, 4 > *acc, const Packet &lhsV, const Packet *rhsV)
#define MICRO_COMPLEX_COL_STORE
EIGEN_ALWAYS_INLINE void band(PacketBlock< Packet, 4 > &acc, const Packet &pMask)
EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex< Scalar > *blockB, const std::complex< Scalar > *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
EIGEN_STRONG_INLINE void operator()(Scalar *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock< Packet, N > &aReal, PacketBlock< Packet, N > &aImag, const Packet &bReal, const Packet &bImag, PacketBlock< Packet, N > &cReal, PacketBlock< Packet, N > &cImag)
EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL(const Scalar *&lhs_ptr_real, const Scalar *&lhs_ptr_imag, const Scalar *&rhs_ptr_real, const Scalar *&rhs_ptr_imag, PacketBlock< Packet, 1 > &accReal, PacketBlock< Packet, 1 > &accImag, Index remaining_rows, Index remaining_cols)
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Namespace containing all symbols from the Eigen library.
EIGEN_ALWAYS_INLINE void pbroadcast4_old< Packet2d >(const double *a, Packet2d &a0, Packet2d &a1, Packet2d &a2, Packet2d &a3)
DerType::Scalar imag(const AutoDiffScalar< DerType > &)
EIGEN_ALWAYS_INLINE Packet2d bmask< Packet2d >(const int remaining_rows)
EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(const Scalar *&lhs_ptr_real, const Scalar *&lhs_ptr_imag, const Scalar *&rhs_ptr_real, const Scalar *&rhs_ptr_imag, PacketBlock< Packet, 4 > &accReal, PacketBlock< Packet, 4 > &accImag, Index remaining_rows)
#define MICRO_COMPLEX_ONE4
EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar *blockA, const Scalar *_lhs, Index lhsStride, Index cols, Index rows)
EIGEN_STRONG_INLINE void gemm_complex_extra_row(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index col, Index rows, Index cols, Index remaining_rows, const Packet &pAlphaReal, const Packet &pAlphaImag, const Packet &pMask)
EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar *blockB, const Scalar *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
PacketBlock< vectortype, 4 > type
EIGEN_ALWAYS_INLINE void pgerc(PacketBlock< Packet, N > *accReal, PacketBlock< Packet, N > *accImag, const Scalar *lhs_ptr, const Scalar *lhs_ptr_imag, const Packet *rhsV, const Packet *rhsVi)
EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock< Packet, N > *accReal, PacketBlock< Packet, N > *accImag, const Packet &lhsV, const Packet &lhsVi, const Packet *rhsV, const Packet *rhsVi)
static const Packet2l mask21
static const Packet16uc p16uc_GETREAL32
#define MAX_COMPLEX_UNROLL
EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows)
EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index &row, Index col, const Packet &pAlphaReal, const Packet &pAlphaImag)
static const Packet4i mask41
#define MICRO_COMPLEX_ONE1
EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock< Packet, 4 > &acc)
EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet) *a, Packet &a0, Packet &a1, Packet &a2, Packet &a3)
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
#define MICRO_COMPLEX_STORE
quad_traits< double >::vectortype Packet
quad_traits< float >::vectortype Packet
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
#define MICRO_COMPLEX_PREFETCH
static const Line3 l(Rot3(), 1, 1)
#define MICRO_COMPLEX_SRC_PTR
void operator()(std::complex< float > *blockA, const std::complex< float > *_lhs, Index lhsStride, Index cols, Index rows)
static const Packet16uc p16uc_GETIMAG32
EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(const Scalar *&lhs_ptr, const Scalar *&rhs_ptr, PacketBlock< Packet, 4 > &accZero, Index remaining_rows)
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
EIGEN_STRONG_INLINE void gemm_extra_row(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index row, Index col, Index rows, Index cols, Index remaining_rows, const Packet &pAlpha, const Packet &pMask)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex< float > *from0, const std::complex< float > *from1)
packet_traits< Scalar >::type vectortype
Array< int, Dynamic, 1 > v
EIGEN_STRONG_INLINE void gemm_complex_extra_col(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index col, Index remaining_rows, Index remaining_cols, const Packet &pAlphaReal, const Packet &pAlphaImag)
quad_traits< double >::rhstype RhsPacket
#define MICRO_COMPLEX_ONE_PEEL1
static const Packet16uc p16uc_GETIMAG64
EIGEN_STRONG_INLINE void gemm_complex(const DataMapper &res, const LhsScalar *blockAc, const RhsScalar *blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
EIGEN_ALWAYS_INLINE void bload(PacketBlock< Packet, 4 > &acc, const DataMapper &res, Index row, Index col)
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h &a, const Packet8h &b)
EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar *lhs)
void operator()(double *blockB, const double *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
EIGEN_STRONG_INLINE Packet2d pload< Packet2d >(const double *from)
EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index &row, Index rows, Index col, Index remaining_cols, const Packet &pAlphaReal, const Packet &pAlphaImag)
EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar *lhs, Packet &lhsV, Index remaining_rows)
EIGEN_ALWAYS_INLINE void bscale(PacketBlock< Packet, 4 > &acc, PacketBlock< Packet, 4 > &accZ, const Packet &pAlpha)
EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock< Packet, 4 > &acc, PacketBlock< Packet, 4 > &accZ, const Packet &pAlpha)
EIGEN_STRONG_INLINE void operator()(std::complex< double > *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
#define EIGEN_POWER_PREFETCH(p)
quad_traits< float >::rhstype RhsPacket
void operator()(std::complex< double > *blockA, const std::complex< double > *_lhs, Index lhsStride, Index cols, Index rows)
EIGEN_STRONG_INLINE void operator()(std::complex< Scalar > *blockA, const DataMapper &lhs, Index depth, Index rows, Index stride, Index offset)
quad_traits< double >::vectortype Packet
EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index &row, Index col, Index remaining_cols, const Packet &pAlpha)
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
static const Packet16uc p16uc_GETREAL64
EIGEN_STRONG_INLINE void gemm(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
EIGEN_STRONG_INLINE Packet2d pset1< Packet2d >(const double &from)
EIGEN_STRONG_INLINE void operator()(std::complex< double > *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset)
quad_traits< double >::rhstype RhsPacket
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet &a)
const AutoDiffScalar< DerType > & real(const AutoDiffScalar< DerType > &x)
internal::enable_if< internal::valid_indexed_view_overload< RowIndices, ColIndices >::value &&internal::traits< typename EIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::ReturnAsIndexedView, typename EIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::type operator()(const RowIndices &rowIndices, const ColIndices &colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
quad_traits< double >::vectortype Packet
void operator()(double *blockA, const double *_lhs, Index lhsStride, Index cols, Index rows)
EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL(const Scalar *&lhs_ptr, const Scalar *&rhs_ptr, PacketBlock< Packet, 1 > &accZero, Index remaining_rows, Index remaining_cols)
EIGEN_ALWAYS_INLINE void storeBlock(Scalar *to, PacketBlock< Packet, 4 > &block)
void operator()(std::complex< double > *blockB, const std::complex< double > *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
EIGEN_STRONG_INLINE void gemm_unrolled_col(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index &row, Index rows, Index col, Index remaining_cols, const Packet &pAlpha)
void operator()(std::complex< float > *blockB, const std::complex< float > *_rhs, Index rhsStride, Index rows, Index cols, Index k2)
#define MICRO_COMPLEX_DST_PTR
EIGEN_STRONG_INLINE void operator()(double *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset)
#define EIGEN_UNUSED_VARIABLE(var)
EIGEN_STRONG_INLINE void gemm_unrolled_iteration(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index &row, Index col, const Packet &pAlpha)
EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index &row, Index col, Index remaining_cols, const Packet &pAlphaReal, const Packet &pAlphaImag)
quad_traits< double >::vectortype Packet