00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
00011 #define EIGEN_GENERAL_MATRIX_VECTOR_H
00012
00013 namespace Eigen {
00014
00015 namespace internal {
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00031 struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
00032 {
00033 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00034
00035 enum {
00036 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
00037 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
00038 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00039 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00040 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
00041 };
00042
00043 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
00044 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
00045 typedef typename packet_traits<ResScalar>::type _ResPacket;
00046
00047 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00048 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00049 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00050
00051 EIGEN_DONT_INLINE static void run(
00052 Index rows, Index cols,
00053 const LhsScalar* lhs, Index lhsStride,
00054 const RhsScalar* rhs, Index rhsIncr,
00055 ResScalar* res, Index
00056 #ifdef EIGEN_INTERNAL_DEBUGGING
00057 resIncr
00058 #endif
00059 , RhsScalar alpha);
00060 };
00061
00062 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00063 EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
00064 Index rows, Index cols,
00065 const LhsScalar* lhs, Index lhsStride,
00066 const RhsScalar* rhs, Index rhsIncr,
00067 ResScalar* res, Index
00068 #ifdef EIGEN_INTERNAL_DEBUGGING
00069 resIncr
00070 #endif
00071 , RhsScalar alpha)
00072 {
00073 eigen_internal_assert(resIncr==1);
00074 #ifdef _EIGEN_ACCUMULATE_PACKETS
00075 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00076 #endif
00077 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
00078 pstore(&res[j], \
00079 padd(pload<ResPacket>(&res[j]), \
00080 padd( \
00081 padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
00082 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
00083 padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
00084 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
00085
00086 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00087 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00088 if(ConjugateRhs)
00089 alpha = numext::conj(alpha);
00090
00091 enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
00092 const Index columnsAtOnce = 4;
00093 const Index peels = 2;
00094 const Index LhsPacketAlignedMask = LhsPacketSize-1;
00095 const Index ResPacketAlignedMask = ResPacketSize-1;
00096
00097 const Index size = rows;
00098
00099
00100
00101 Index alignedStart = internal::first_aligned(res,size);
00102 Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
00103 const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
00104
00105 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
00106 Index alignmentPattern = alignmentStep==0 ? AllAligned
00107 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
00108 : FirstAligned;
00109
00110
00111 const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
00112
00113
00114 Index skipColumns = 0;
00115
00116 if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
00117 {
00118 alignedSize = 0;
00119 alignedStart = 0;
00120 }
00121 else if (LhsPacketSize>1)
00122 {
00123 eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
00124
00125 while (skipColumns<LhsPacketSize &&
00126 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
00127 ++skipColumns;
00128 if (skipColumns==LhsPacketSize)
00129 {
00130
00131 alignmentPattern = NoneAligned;
00132 skipColumns = 0;
00133 }
00134 else
00135 {
00136 skipColumns = (std::min)(skipColumns,cols);
00137
00138 }
00139
00140 eigen_internal_assert( (alignmentPattern==NoneAligned)
00141 || (skipColumns + columnsAtOnce >= cols)
00142 || LhsPacketSize > size
00143 || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
00144 }
00145 else if(Vectorizable)
00146 {
00147 alignedStart = 0;
00148 alignedSize = size;
00149 alignmentPattern = AllAligned;
00150 }
00151
00152 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
00153 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
00154
00155 Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
00156 for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
00157 {
00158 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
00159 ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
00160 ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
00161 ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
00162
00163
00164 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00165 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00166
00167 if (Vectorizable)
00168 {
00169
00170
00171 for (Index j=0; j<alignedStart; ++j)
00172 {
00173 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
00174 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
00175 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
00176 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
00177 }
00178
00179 if (alignedSize>alignedStart)
00180 {
00181 switch(alignmentPattern)
00182 {
00183 case AllAligned:
00184 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00185 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00186 break;
00187 case EvenAligned:
00188 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00189 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00190 break;
00191 case FirstAligned:
00192 {
00193 Index j = alignedStart;
00194 if(peels>1)
00195 {
00196 LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
00197 ResPacket T0, T1;
00198
00199 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
00200 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
00201 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
00202
00203 for (; j<peeledSize; j+=peels*ResPacketSize)
00204 {
00205 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
00206 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
00207 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
00208
00209 A00 = pload<LhsPacket>(&lhs0[j]);
00210 A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
00211 T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
00212 T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
00213
00214 T0 = pcj.pmadd(A01, ptmp1, T0);
00215 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
00216 T0 = pcj.pmadd(A02, ptmp2, T0);
00217 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
00218 T0 = pcj.pmadd(A03, ptmp3, T0);
00219 pstore(&res[j],T0);
00220 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
00221 T1 = pcj.pmadd(A11, ptmp1, T1);
00222 T1 = pcj.pmadd(A12, ptmp2, T1);
00223 T1 = pcj.pmadd(A13, ptmp3, T1);
00224 pstore(&res[j+ResPacketSize],T1);
00225 }
00226 }
00227 for (; j<alignedSize; j+=ResPacketSize)
00228 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00229 break;
00230 }
00231 default:
00232 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00233 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00234 break;
00235 }
00236 }
00237 }
00238
00239
00240 for (Index j=alignedSize; j<size; ++j)
00241 {
00242 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
00243 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
00244 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
00245 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
00246 }
00247 }
00248
00249
00250 Index end = cols;
00251 Index start = columnBound;
00252 do
00253 {
00254 for (Index k=start; k<end; ++k)
00255 {
00256 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
00257 const LhsScalar* lhs0 = lhs + k*lhsStride;
00258
00259 if (Vectorizable)
00260 {
00261
00262
00263 for (Index j=0; j<alignedStart; ++j)
00264 res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
00265
00266 if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
00267 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
00268 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
00269 else
00270 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
00271 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
00272 }
00273
00274
00275 for (Index i=alignedSize; i<size; ++i)
00276 res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
00277 }
00278 if (skipColumns)
00279 {
00280 start = 0;
00281 end = skipColumns;
00282 skipColumns = 0;
00283 }
00284 else
00285 break;
00286 } while(Vectorizable);
00287 #undef _EIGEN_ACCUMULATE_PACKETS
00288 }
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00301 struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
00302 {
00303 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00304
00305 enum {
00306 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
00307 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
00308 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00309 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00310 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
00311 };
00312
00313 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
00314 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
00315 typedef typename packet_traits<ResScalar>::type _ResPacket;
00316
00317 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00318 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00319 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00320
00321 EIGEN_DONT_INLINE static void run(
00322 Index rows, Index cols,
00323 const LhsScalar* lhs, Index lhsStride,
00324 const RhsScalar* rhs, Index rhsIncr,
00325 ResScalar* res, Index resIncr,
00326 ResScalar alpha);
00327 };
00328
00329 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00330 EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
00331 Index rows, Index cols,
00332 const LhsScalar* lhs, Index lhsStride,
00333 const RhsScalar* rhs, Index rhsIncr,
00334 ResScalar* res, Index resIncr,
00335 ResScalar alpha)
00336 {
00337 EIGEN_UNUSED_VARIABLE(rhsIncr);
00338 eigen_internal_assert(rhsIncr==1);
00339 #ifdef _EIGEN_ACCUMULATE_PACKETS
00340 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00341 #endif
00342
00343 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
00344 RhsPacket b = pload<RhsPacket>(&rhs[j]); \
00345 ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
00346 ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
00347 ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
00348 ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
00349
00350 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00351 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00352
00353 enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
00354 const Index rowsAtOnce = 4;
00355 const Index peels = 2;
00356 const Index RhsPacketAlignedMask = RhsPacketSize-1;
00357 const Index LhsPacketAlignedMask = LhsPacketSize-1;
00358
00359 const Index depth = cols;
00360
00361
00362
00363
00364 Index alignedStart = internal::first_aligned(rhs, depth);
00365 Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
00366 const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
00367
00368 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
00369 Index alignmentPattern = alignmentStep==0 ? AllAligned
00370 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
00371 : FirstAligned;
00372
00373
00374 const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
00375
00376
00377 Index skipRows = 0;
00378
00379 if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
00380 {
00381 alignedSize = 0;
00382 alignedStart = 0;
00383 }
00384 else if (LhsPacketSize>1)
00385 {
00386 eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
00387
00388 while (skipRows<LhsPacketSize &&
00389 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
00390 ++skipRows;
00391 if (skipRows==LhsPacketSize)
00392 {
00393
00394 alignmentPattern = NoneAligned;
00395 skipRows = 0;
00396 }
00397 else
00398 {
00399 skipRows = (std::min)(skipRows,Index(rows));
00400
00401 }
00402 eigen_internal_assert( alignmentPattern==NoneAligned
00403 || LhsPacketSize==1
00404 || (skipRows + rowsAtOnce >= rows)
00405 || LhsPacketSize > depth
00406 || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
00407 }
00408 else if(Vectorizable)
00409 {
00410 alignedStart = 0;
00411 alignedSize = depth;
00412 alignmentPattern = AllAligned;
00413 }
00414
00415 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
00416 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
00417
00418 Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
00419 for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
00420 {
00421 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
00422 ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
00423
00424
00425 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00426 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00427
00428 if (Vectorizable)
00429 {
00430
00431 ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
00432 ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
00433
00434
00435
00436 for (Index j=0; j<alignedStart; ++j)
00437 {
00438 RhsScalar b = rhs[j];
00439 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
00440 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
00441 }
00442
00443 if (alignedSize>alignedStart)
00444 {
00445 switch(alignmentPattern)
00446 {
00447 case AllAligned:
00448 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00449 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00450 break;
00451 case EvenAligned:
00452 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00453 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00454 break;
00455 case FirstAligned:
00456 {
00457 Index j = alignedStart;
00458 if (peels>1)
00459 {
00460
00461
00462
00463
00464
00465
00466 LhsPacket A01, A02, A03, A11, A12, A13;
00467 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
00468 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
00469 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
00470
00471 for (; j<peeledSize; j+=peels*RhsPacketSize)
00472 {
00473 RhsPacket b = pload<RhsPacket>(&rhs[j]);
00474 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
00475 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
00476 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
00477
00478 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
00479 ptmp1 = pcj.pmadd(A01, b, ptmp1);
00480 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
00481 ptmp2 = pcj.pmadd(A02, b, ptmp2);
00482 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
00483 ptmp3 = pcj.pmadd(A03, b, ptmp3);
00484 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
00485
00486 b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
00487 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
00488 ptmp1 = pcj.pmadd(A11, b, ptmp1);
00489 ptmp2 = pcj.pmadd(A12, b, ptmp2);
00490 ptmp3 = pcj.pmadd(A13, b, ptmp3);
00491 }
00492 }
00493 for (; j<alignedSize; j+=RhsPacketSize)
00494 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00495 break;
00496 }
00497 default:
00498 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00499 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00500 break;
00501 }
00502 tmp0 += predux(ptmp0);
00503 tmp1 += predux(ptmp1);
00504 tmp2 += predux(ptmp2);
00505 tmp3 += predux(ptmp3);
00506 }
00507 }
00508
00509
00510
00511 for (Index j=alignedSize; j<depth; ++j)
00512 {
00513 RhsScalar b = rhs[j];
00514 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
00515 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
00516 }
00517 res[i*resIncr] += alpha*tmp0;
00518 res[(i+offset1)*resIncr] += alpha*tmp1;
00519 res[(i+2)*resIncr] += alpha*tmp2;
00520 res[(i+offset3)*resIncr] += alpha*tmp3;
00521 }
00522
00523
00524 Index end = rows;
00525 Index start = rowBound;
00526 do
00527 {
00528 for (Index i=start; i<end; ++i)
00529 {
00530 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
00531 ResPacket ptmp0 = pset1<ResPacket>(tmp0);
00532 const LhsScalar* lhs0 = lhs + i*lhsStride;
00533
00534
00535 for (Index j=0; j<alignedStart; ++j)
00536 tmp0 += cj.pmul(lhs0[j], rhs[j]);
00537
00538 if (alignedSize>alignedStart)
00539 {
00540
00541 if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
00542 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
00543 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
00544 else
00545 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
00546 ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
00547 tmp0 += predux(ptmp0);
00548 }
00549
00550
00551
00552 for (Index j=alignedSize; j<depth; ++j)
00553 tmp0 += cj.pmul(lhs0[j], rhs[j]);
00554 res[i*resIncr] += alpha*tmp0;
00555 }
00556 if (skipRows)
00557 {
00558 start = 0;
00559 end = skipRows;
00560 skipRows = 0;
00561 }
00562 else
00563 break;
00564 } while(Vectorizable);
00565
00566 #undef _EIGEN_ACCUMULATE_PACKETS
00567 }
00568
00569 }
00570
00571 }
00572
00573 #endif // EIGEN_GENERAL_MATRIX_VECTOR_H