10 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H    11 #define EIGEN_GENERAL_MATRIX_VECTOR_H    30 template<
typename Index, 
typename LhsScalar, 
bool ConjugateLhs, 
typename RhsScalar, 
bool ConjugateRhs, 
int Version>
    52   Index rows, Index cols,
    53   const LhsScalar* lhs, Index lhsStride,
    54   const RhsScalar* rhs, Index rhsIncr,
    56   #ifdef EIGEN_INTERNAL_DEBUGGING
    62 template<
typename Index, 
typename LhsScalar, 
bool ConjugateLhs, 
typename RhsScalar, 
bool ConjugateRhs, 
int Version>
    64   Index rows, Index cols,
    65   const LhsScalar* lhs, Index lhsStride,
    66   const RhsScalar* rhs, Index rhsIncr,
    68   #ifdef EIGEN_INTERNAL_DEBUGGING
    74   #ifdef _EIGEN_ACCUMULATE_PACKETS    75   #error _EIGEN_ACCUMULATE_PACKETS has already been defined    77   #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \    79       padd(pload<ResPacket>(&res[j]), \    81           padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),    ptmp0), \    82                   pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]),   ptmp1)), \    83           padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),    ptmp2), \    84                   pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]),   ptmp3)) )))    89     alpha = numext::conj(alpha);
    91   enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
    92   const Index columnsAtOnce = 4;
    93   const Index peels = 2;
    94   const Index LhsPacketAlignedMask = LhsPacketSize-1;
    95   const Index ResPacketAlignedMask = ResPacketSize-1;
    97   const Index size = rows;
   102   Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
   103   const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
   105   const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
   106   Index alignmentPattern = alignmentStep==0 ? AllAligned
   107                        : alignmentStep==(LhsPacketSize/2) ? EvenAligned
   114   Index skipColumns = 0;
   116   if( (
size_t(lhs)%
sizeof(LhsScalar)) || (
size_t(res)%
sizeof(
ResScalar)) )
   121   else if (LhsPacketSize>1)
   125     while (skipColumns<LhsPacketSize &&
   126           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
   128     if (skipColumns==LhsPacketSize)
   131       alignmentPattern = NoneAligned;
   136       skipColumns = (std::min)(skipColumns,cols);
   141                       || (skipColumns + columnsAtOnce >= cols)
   142                       || LhsPacketSize > size
   143                       || (
size_t(lhs+alignedStart+lhsStride*skipColumns)%
sizeof(
LhsPacket))==0);
   145   else if(Vectorizable)
   149     alignmentPattern = AllAligned;
   152   Index offset1 = (FirstAligned && alignmentStep==1?3:1);
   153   Index offset3 = (FirstAligned && alignmentStep==1?1:3);
   155   Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
   156   for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
   158     RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
   159               ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
   160               ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
   161               ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
   164     const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
   165                     *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
   171       for (Index j=0; j<alignedStart; ++j)
   173         res[j] = cj.pmadd(lhs0[j], 
pfirst(ptmp0), res[j]);
   174         res[j] = cj.pmadd(lhs1[j], 
pfirst(ptmp1), res[j]);
   175         res[j] = cj.pmadd(lhs2[j], 
pfirst(ptmp2), res[j]);
   176         res[j] = cj.pmadd(lhs3[j], 
pfirst(ptmp3), res[j]);
   179       if (alignedSize>alignedStart)
   181         switch(alignmentPattern)
   184             for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
   188             for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
   193             Index j = alignedStart;
   196               LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
   199               A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
   200               A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
   201               A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
   203               for (; j<peeledSize; j+=peels*ResPacketSize)
   205                 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
   206                 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
   207                 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
   209                 A00 = pload<LhsPacket>(&lhs0[j]);
   210                 A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
   211                 T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
   212                 T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
   214                 T0  = pcj.pmadd(A01, ptmp1, T0);
   215                 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
   216                 T0  = pcj.pmadd(A02, ptmp2, T0);
   217                 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
   218                 T0  = pcj.pmadd(A03, ptmp3, T0);
   220                 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
   221                 T1  = pcj.pmadd(A11, ptmp1, T1);
   222                 T1  = pcj.pmadd(A12, ptmp2, T1);
   223                 T1  = pcj.pmadd(A13, ptmp3, T1);
   224                 pstore(&res[j+ResPacketSize],T1);
   227             for (; j<alignedSize; j+=ResPacketSize)
   232             for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
   240     for (Index j=alignedSize; j<size; ++j)
   242       res[j] = cj.pmadd(lhs0[j], 
pfirst(ptmp0), res[j]);
   243       res[j] = cj.pmadd(lhs1[j], 
pfirst(ptmp1), res[j]);
   244       res[j] = cj.pmadd(lhs2[j], 
pfirst(ptmp2), res[j]);
   245       res[j] = cj.pmadd(lhs3[j], 
pfirst(ptmp3), res[j]);
   251   Index start = columnBound;
   254     for (Index k=start; k<end; ++k)
   256       RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
   257       const LhsScalar* lhs0 = lhs + k*lhsStride;
   263         for (Index j=0; j<alignedStart; ++j)
   264           res[j] += cj.pmul(lhs0[j], 
pfirst(ptmp0));
   266         if ((
size_t(lhs0+alignedStart)%
sizeof(
LhsPacket))==0)
   267           for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
   268             pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
   270           for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
   271             pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
   275       for (Index i=alignedSize; i<size; ++i)
   276         res[i] += cj.pmul(lhs0[i], 
pfirst(ptmp0));
   286   } 
while(Vectorizable);
   287   #undef _EIGEN_ACCUMULATE_PACKETS   300 template<
typename Index, 
typename LhsScalar, 
bool ConjugateLhs, 
typename RhsScalar, 
bool ConjugateRhs, 
int Version>
   322   Index rows, Index cols,
   323   const LhsScalar* lhs, Index lhsStride,
   324   const RhsScalar* rhs, Index rhsIncr,
   325   ResScalar* res, Index resIncr,
   329 template<
typename Index, 
typename LhsScalar, 
bool ConjugateLhs, 
typename RhsScalar, 
bool ConjugateRhs, 
int Version>
   331   Index rows, Index cols,
   332   const LhsScalar* lhs, Index lhsStride,
   333   const RhsScalar* rhs, Index rhsIncr,
   339   #ifdef _EIGEN_ACCUMULATE_PACKETS   340   #error _EIGEN_ACCUMULATE_PACKETS has already been defined   343   #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\   344     RhsPacket b = pload<RhsPacket>(&rhs[j]); \   345     ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \   346     ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \   347     ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \   348     ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }   353   enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
   354   const Index rowsAtOnce = 4;
   355   const Index peels = 2;
   356   const Index RhsPacketAlignedMask = RhsPacketSize-1;
   357   const Index LhsPacketAlignedMask = LhsPacketSize-1;
   359   const Index depth = cols;
   365   Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
   366   const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
   368   const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
   369   Index alignmentPattern = alignmentStep==0 ? AllAligned
   370                          : alignmentStep==(LhsPacketSize/2) ? EvenAligned
   379   if( (
sizeof(LhsScalar)!=
sizeof(RhsScalar)) || (
size_t(lhs)%
sizeof(LhsScalar)) || (
size_t(rhs)%
sizeof(RhsScalar)) )
   384   else if (LhsPacketSize>1)
   388     while (skipRows<LhsPacketSize &&
   389            alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
   391     if (skipRows==LhsPacketSize)
   394       alignmentPattern = NoneAligned;
   399       skipRows = (std::min)(skipRows,Index(rows));
   404                       || (skipRows + rowsAtOnce >= rows)
   405                       || LhsPacketSize > depth
   406                       || (
size_t(lhs+alignedStart+lhsStride*skipRows)%
sizeof(
LhsPacket))==0);
   408   else if(Vectorizable)
   412     alignmentPattern = AllAligned;
   415   Index offset1 = (FirstAligned && alignmentStep==1?3:1);
   416   Index offset3 = (FirstAligned && alignmentStep==1?1:3);
   418   Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   419   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   425     const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
   426                     *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
   436       for (Index j=0; j<alignedStart; ++j)
   438         RhsScalar b = rhs[j];
   439         tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
   440         tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
   443       if (alignedSize>alignedStart)
   445         switch(alignmentPattern)
   448             for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
   452             for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
   457             Index j = alignedStart;
   467               A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
   468               A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
   469               A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
   471               for (; j<peeledSize; j+=peels*RhsPacketSize)
   474                 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
   475                 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
   476                 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
   478                 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
   479                 ptmp1 = pcj.pmadd(A01, b, ptmp1);
   480                 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
   481                 ptmp2 = pcj.pmadd(A02, b, ptmp2);
   482                 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
   483                 ptmp3 = pcj.pmadd(A03, b, ptmp3);
   484                 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
   486                 b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
   487                 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
   488                 ptmp1 = pcj.pmadd(A11, b, ptmp1);
   489                 ptmp2 = pcj.pmadd(A12, b, ptmp2);
   490                 ptmp3 = pcj.pmadd(A13, b, ptmp3);
   493             for (; j<alignedSize; j+=RhsPacketSize)
   498             for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
   511     for (Index j=alignedSize; j<depth; ++j)
   513       RhsScalar b = rhs[j];
   514       tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
   515       tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
   517     res[i*resIncr]            += alpha*tmp0;
   518     res[(i+offset1)*resIncr]  += alpha*tmp1;
   519     res[(i+2)*resIncr]        += alpha*tmp2;
   520     res[(i+offset3)*resIncr]  += alpha*tmp3;
   525   Index start = rowBound;
   528     for (Index i=start; i<end; ++i)
   531       ResPacket ptmp0 = pset1<ResPacket>(tmp0);
   532       const LhsScalar* lhs0 = lhs + i*lhsStride;
   535       for (Index j=0; j<alignedStart; ++j)
   536         tmp0 += cj.pmul(lhs0[j], rhs[j]);
   538       if (alignedSize>alignedStart)
   541         if ((
size_t(lhs0+alignedStart)%
sizeof(
LhsPacket))==0)
   542           for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
   543             ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
   545           for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
   546             ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
   552       for (Index j=alignedSize; j<depth; ++j)
   553         tmp0 += cj.pmul(lhs0[j], rhs[j]);
   554       res[i*resIncr] += alpha*tmp0;
   564   } 
while(Vectorizable);
   566   #undef _EIGEN_ACCUMULATE_PACKETS   573 #endif // EIGEN_GENERAL_MATRIX_VECTOR_H 
packet_traits< RhsScalar >::type _RhsPacket
scalar_product_traits< LhsScalar, RhsScalar >::ReturnType ResScalar
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
#define EIGEN_UNUSED_VARIABLE(var)
#define eigen_internal_assert(x)
scalar_product_traits< LhsScalar, RhsScalar >::ReturnType ResScalar
void pstore(Scalar *to, const Packet &from)
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
packet_traits< ResScalar >::type _ResPacket
packet_traits< RhsScalar >::type _RhsPacket
unpacket_traits< Packet >::type pfirst(const Packet &a)
unpacket_traits< Packet >::type predux(const Packet &a)
packet_traits< LhsScalar >::type _LhsPacket
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
packet_traits< LhsScalar >::type _LhsPacket
#define _EIGEN_ACCUMULATE_PACKETS(A0, A13, A2)
#define EIGEN_DONT_INLINE
packet_traits< ResScalar >::type _ResPacket
static Derived::Index first_aligned(const Derived &m)
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
void run(ClassLoader *loader)