10 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H 11 #define EIGEN_GENERAL_MATRIX_VECTOR_H 30 template<
typename Index,
typename LhsScalar,
bool ConjugateLhs,
typename RhsScalar,
bool ConjugateRhs,
int Version>
52 Index rows, Index cols,
53 const LhsScalar* lhs, Index lhsStride,
54 const RhsScalar*
rhs, Index rhsIncr,
56 #ifdef EIGEN_INTERNAL_DEBUGGING
62 template<
typename Index,
typename LhsScalar,
bool ConjugateLhs,
typename RhsScalar,
bool ConjugateRhs,
int Version>
64 Index rows, Index cols,
65 const LhsScalar* lhs, Index lhsStride,
66 const RhsScalar*
rhs, Index rhsIncr,
68 #ifdef EIGEN_INTERNAL_DEBUGGING
74 #ifdef _EIGEN_ACCUMULATE_PACKETS 75 #error _EIGEN_ACCUMULATE_PACKETS has already been defined 77 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ 79 padd(pload<ResPacket>(&res[j]), \ 81 padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \ 82 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \ 83 padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \ 84 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) ))) 91 enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
92 const Index columnsAtOnce = 4;
93 const Index peels = 2;
94 const Index LhsPacketAlignedMask = LhsPacketSize-1;
95 const Index ResPacketAlignedMask = ResPacketSize-1;
97 const Index size = rows;
102 Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
103 const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
105 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
106 Index alignmentPattern = alignmentStep==0 ? AllAligned
107 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
114 Index skipColumns = 0;
116 if( (
size_t(lhs)%
sizeof(LhsScalar)) || (
size_t(res)%
sizeof(
ResScalar)) )
121 else if (LhsPacketSize>1)
125 while (skipColumns<LhsPacketSize &&
126 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
128 if (skipColumns==LhsPacketSize)
131 alignmentPattern = NoneAligned;
136 skipColumns = (std::min)(skipColumns,cols);
141 || (skipColumns + columnsAtOnce >= cols)
142 || LhsPacketSize > size
143 || (
size_t(lhs+alignedStart+lhsStride*skipColumns)%
sizeof(
LhsPacket))==0);
145 else if(Vectorizable)
149 alignmentPattern = AllAligned;
152 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
153 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
155 Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
156 for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
158 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
159 ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
160 ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
161 ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
164 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
165 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
171 for (Index j=0; j<alignedStart; ++j)
173 res[j] = cj.pmadd(lhs0[j],
pfirst(ptmp0), res[j]);
174 res[j] = cj.pmadd(lhs1[j],
pfirst(ptmp1), res[j]);
175 res[j] = cj.pmadd(lhs2[j],
pfirst(ptmp2), res[j]);
176 res[j] = cj.pmadd(lhs3[j],
pfirst(ptmp3), res[j]);
179 if (alignedSize>alignedStart)
181 switch(alignmentPattern)
184 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
188 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
193 Index j = alignedStart;
196 LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
199 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
200 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
201 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
203 for (; j<peeledSize; j+=peels*ResPacketSize)
205 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
206 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
207 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
209 A00 = pload<LhsPacket>(&lhs0[j]);
210 A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
211 T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
212 T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
214 T0 = pcj.pmadd(A01, ptmp1, T0);
215 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
216 T0 = pcj.pmadd(A02, ptmp2, T0);
217 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
218 T0 = pcj.pmadd(A03, ptmp3, T0);
220 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
221 T1 = pcj.pmadd(A11, ptmp1, T1);
222 T1 = pcj.pmadd(A12, ptmp2, T1);
223 T1 = pcj.pmadd(A13, ptmp3, T1);
224 pstore(&res[j+ResPacketSize],T1);
227 for (; j<alignedSize; j+=ResPacketSize)
232 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
240 for (Index j=alignedSize; j<size; ++j)
242 res[j] = cj.pmadd(lhs0[j],
pfirst(ptmp0), res[j]);
243 res[j] = cj.pmadd(lhs1[j],
pfirst(ptmp1), res[j]);
244 res[j] = cj.pmadd(lhs2[j],
pfirst(ptmp2), res[j]);
245 res[j] = cj.pmadd(lhs3[j],
pfirst(ptmp3), res[j]);
251 Index start = columnBound;
254 for (Index k=start; k<end; ++k)
256 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
257 const LhsScalar* lhs0 = lhs + k*lhsStride;
263 for (Index j=0; j<alignedStart; ++j)
264 res[j] += cj.pmul(lhs0[j],
pfirst(ptmp0));
266 if ((
size_t(lhs0+alignedStart)%
sizeof(
LhsPacket))==0)
267 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
268 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
270 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
271 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
275 for (Index i=alignedSize; i<size; ++i)
276 res[i] += cj.pmul(lhs0[i],
pfirst(ptmp0));
286 }
while(Vectorizable);
287 #undef _EIGEN_ACCUMULATE_PACKETS 300 template<
typename Index,
typename LhsScalar,
bool ConjugateLhs,
typename RhsScalar,
bool ConjugateRhs,
int Version>
322 Index rows, Index cols,
323 const LhsScalar* lhs, Index lhsStride,
324 const RhsScalar*
rhs, Index rhsIncr,
325 ResScalar* res, Index resIncr,
329 template<
typename Index,
typename LhsScalar,
bool ConjugateLhs,
typename RhsScalar,
bool ConjugateRhs,
int Version>
331 Index rows, Index cols,
332 const LhsScalar* lhs, Index lhsStride,
333 const RhsScalar*
rhs, Index rhsIncr,
339 #ifdef _EIGEN_ACCUMULATE_PACKETS 340 #error _EIGEN_ACCUMULATE_PACKETS has already been defined 343 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ 344 RhsPacket b = pload<RhsPacket>(&rhs[j]); \ 345 ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \ 346 ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \ 347 ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \ 348 ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); } 353 enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
354 const Index rowsAtOnce = 4;
355 const Index peels = 2;
356 const Index RhsPacketAlignedMask = RhsPacketSize-1;
357 const Index LhsPacketAlignedMask = LhsPacketSize-1;
359 const Index depth = cols;
365 Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
366 const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
368 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
369 Index alignmentPattern = alignmentStep==0 ? AllAligned
370 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
379 if( (
sizeof(LhsScalar)!=
sizeof(RhsScalar)) || (
size_t(lhs)%
sizeof(LhsScalar)) || (
size_t(rhs)%
sizeof(RhsScalar)) )
384 else if (LhsPacketSize>1)
388 while (skipRows<LhsPacketSize &&
389 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
391 if (skipRows==LhsPacketSize)
394 alignmentPattern = NoneAligned;
399 skipRows = (std::min)(skipRows,Index(rows));
404 || (skipRows + rowsAtOnce >= rows)
405 || LhsPacketSize > depth
406 || (
size_t(lhs+alignedStart+lhsStride*skipRows)%
sizeof(
LhsPacket))==0);
408 else if(Vectorizable)
412 alignmentPattern = AllAligned;
415 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
416 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
418 Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
419 for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
425 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
426 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
436 for (Index j=0; j<alignedStart; ++j)
438 RhsScalar b = rhs[j];
439 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
440 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
443 if (alignedSize>alignedStart)
445 switch(alignmentPattern)
448 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
452 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
457 Index j = alignedStart;
467 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
468 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
469 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
471 for (; j<peeledSize; j+=peels*RhsPacketSize)
474 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
475 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
476 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
478 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
479 ptmp1 = pcj.pmadd(A01, b, ptmp1);
480 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
481 ptmp2 = pcj.pmadd(A02, b, ptmp2);
482 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
483 ptmp3 = pcj.pmadd(A03, b, ptmp3);
484 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
486 b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
487 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
488 ptmp1 = pcj.pmadd(A11, b, ptmp1);
489 ptmp2 = pcj.pmadd(A12, b, ptmp2);
490 ptmp3 = pcj.pmadd(A13, b, ptmp3);
493 for (; j<alignedSize; j+=RhsPacketSize)
498 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
511 for (Index j=alignedSize; j<depth; ++j)
513 RhsScalar b = rhs[j];
514 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
515 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
517 res[i*resIncr] += alpha*tmp0;
518 res[(i+offset1)*resIncr] += alpha*tmp1;
519 res[(i+2)*resIncr] += alpha*tmp2;
520 res[(i+offset3)*resIncr] += alpha*tmp3;
525 Index start = rowBound;
528 for (Index i=start; i<end; ++i)
531 ResPacket ptmp0 = pset1<ResPacket>(tmp0);
532 const LhsScalar* lhs0 = lhs + i*lhsStride;
535 for (Index j=0; j<alignedStart; ++j)
536 tmp0 += cj.pmul(lhs0[j], rhs[j]);
538 if (alignedSize>alignedStart)
541 if ((
size_t(lhs0+alignedStart)%
sizeof(
LhsPacket))==0)
542 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
543 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
545 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
546 ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
552 for (Index j=alignedSize; j<depth; ++j)
553 tmp0 += cj.pmul(lhs0[j], rhs[j]);
554 res[i*resIncr] += alpha*tmp0;
564 }
while(Vectorizable);
566 #undef _EIGEN_ACCUMULATE_PACKETS 573 #endif // EIGEN_GENERAL_MATRIX_VECTOR_H
packet_traits< RhsScalar >::type _RhsPacket
scalar_product_traits< LhsScalar, RhsScalar >::ReturnType ResScalar
const AutoDiffScalar< DerType > & conj(const AutoDiffScalar< DerType > &x)
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
#define EIGEN_UNUSED_VARIABLE(var)
iterative scaling algorithm to equilibrate rows and column norms in matrices
#define eigen_internal_assert(x)
scalar_product_traits< LhsScalar, RhsScalar >::ReturnType ResScalar
void pstore(Scalar *to, const Packet &from)
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
packet_traits< ResScalar >::type _ResPacket
packet_traits< RhsScalar >::type _RhsPacket
unpacket_traits< Packet >::type pfirst(const Packet &a)
unpacket_traits< Packet >::type predux(const Packet &a)
void rhs(const real_t *x, real_t *f)
packet_traits< LhsScalar >::type _LhsPacket
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
packet_traits< LhsScalar >::type _LhsPacket
#define _EIGEN_ACCUMULATE_PACKETS(A0, A13, A2)
#define EIGEN_DONT_INLINE
packet_traits< ResScalar >::type _ResPacket
static Derived::Index first_aligned(const Derived &m)
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket