00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
00026 #define EIGEN_GENERAL_MATRIX_VECTOR_H
00027
00028 namespace internal {
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
00044 struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs>
00045 {
00046 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00047
00048 enum {
00049 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
00050 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
00051 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00052 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00053 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
00054 };
00055
00056 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
00057 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
00058 typedef typename packet_traits<ResScalar>::type _ResPacket;
00059
00060 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00061 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00062 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00063
00064 EIGEN_DONT_INLINE static void run(
00065 Index rows, Index cols,
00066 const LhsScalar* lhs, Index lhsStride,
00067 const RhsScalar* rhs, Index rhsIncr,
00068 ResScalar* res, Index
00069 #ifdef EIGEN_INTERNAL_DEBUGGING
00070 resIncr
00071 #endif
00072 , RhsScalar alpha)
00073 {
00074 eigen_internal_assert(resIncr==1);
00075 #ifdef _EIGEN_ACCUMULATE_PACKETS
00076 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00077 #endif
00078 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
00079 pstore(&res[j], \
00080 padd(pload<ResPacket>(&res[j]), \
00081 padd( \
00082 padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
00083 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
00084 padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
00085 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
00086
00087 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00088 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00089 if(ConjugateRhs)
00090 alpha = conj(alpha);
00091
00092 enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
00093 const Index columnsAtOnce = 4;
00094 const Index peels = 2;
00095 const Index LhsPacketAlignedMask = LhsPacketSize-1;
00096 const Index ResPacketAlignedMask = ResPacketSize-1;
00097 const Index PeelAlignedMask = ResPacketSize*peels-1;
00098 const Index size = rows;
00099
00100
00101
00102 Index alignedStart = first_aligned(res,size);
00103 Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
00104 const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
00105
00106 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
00107 Index alignmentPattern = alignmentStep==0 ? AllAligned
00108 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
00109 : FirstAligned;
00110
00111
00112 const Index lhsAlignmentOffset = first_aligned(lhs,size);
00113
00114
00115 Index skipColumns = 0;
00116
00117 if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
00118 {
00119 alignedSize = 0;
00120 alignedStart = 0;
00121 }
00122 else if (LhsPacketSize>1)
00123 {
00124 eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
00125
00126 while (skipColumns<LhsPacketSize &&
00127 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
00128 ++skipColumns;
00129 if (skipColumns==LhsPacketSize)
00130 {
00131
00132 alignmentPattern = NoneAligned;
00133 skipColumns = 0;
00134 }
00135 else
00136 {
00137 skipColumns = (std::min)(skipColumns,cols);
00138
00139 }
00140
00141 eigen_internal_assert( (alignmentPattern==NoneAligned)
00142 || (skipColumns + columnsAtOnce >= cols)
00143 || LhsPacketSize > size
00144 || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
00145 }
00146 else if(Vectorizable)
00147 {
00148 alignedStart = 0;
00149 alignedSize = size;
00150 alignmentPattern = AllAligned;
00151 }
00152
00153 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
00154 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
00155
00156 Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
00157 for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
00158 {
00159 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
00160 ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
00161 ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
00162 ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
00163
00164
00165 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00166 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00167
00168 if (Vectorizable)
00169 {
00170
00171
00172 for (Index j=0; j<alignedStart; ++j)
00173 {
00174 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
00175 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
00176 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
00177 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
00178 }
00179
00180 if (alignedSize>alignedStart)
00181 {
00182 switch(alignmentPattern)
00183 {
00184 case AllAligned:
00185 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00186 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00187 break;
00188 case EvenAligned:
00189 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00190 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00191 break;
00192 case FirstAligned:
00193 if(peels>1)
00194 {
00195 LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
00196 ResPacket T0, T1;
00197
00198 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
00199 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
00200 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
00201
00202 for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize)
00203 {
00204 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
00205 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
00206 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
00207
00208 A00 = pload<LhsPacket>(&lhs0[j]);
00209 A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
00210 T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
00211 T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
00212
00213 T0 = pcj.pmadd(A01, ptmp1, T0);
00214 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
00215 T0 = pcj.pmadd(A02, ptmp2, T0);
00216 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
00217 T0 = pcj.pmadd(A03, ptmp3, T0);
00218 pstore(&res[j],T0);
00219 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
00220 T1 = pcj.pmadd(A11, ptmp1, T1);
00221 T1 = pcj.pmadd(A12, ptmp2, T1);
00222 T1 = pcj.pmadd(A13, ptmp3, T1);
00223 pstore(&res[j+ResPacketSize],T1);
00224 }
00225 }
00226 for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize)
00227 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00228 break;
00229 default:
00230 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00231 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00232 break;
00233 }
00234 }
00235 }
00236
00237
00238 for (Index j=alignedSize; j<size; ++j)
00239 {
00240 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
00241 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
00242 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
00243 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
00244 }
00245 }
00246
00247
00248 Index end = cols;
00249 Index start = columnBound;
00250 do
00251 {
00252 for (Index k=start; k<end; ++k)
00253 {
00254 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
00255 const LhsScalar* lhs0 = lhs + k*lhsStride;
00256
00257 if (Vectorizable)
00258 {
00259
00260
00261 for (Index j=0; j<alignedStart; ++j)
00262 res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
00263
00264 if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
00265 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
00266 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
00267 else
00268 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
00269 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
00270 }
00271
00272
00273 for (Index i=alignedSize; i<size; ++i)
00274 res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
00275 }
00276 if (skipColumns)
00277 {
00278 start = 0;
00279 end = skipColumns;
00280 skipColumns = 0;
00281 }
00282 else
00283 break;
00284 } while(Vectorizable);
00285 #undef _EIGEN_ACCUMULATE_PACKETS
00286 }
00287 };
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
00300 struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs>
00301 {
00302 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00303
00304 enum {
00305 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
00306 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
00307 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00308 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00309 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
00310 };
00311
00312 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
00313 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
00314 typedef typename packet_traits<ResScalar>::type _ResPacket;
00315
00316 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00317 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00318 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00319
00320 EIGEN_DONT_INLINE static void run(
00321 Index rows, Index cols,
00322 const LhsScalar* lhs, Index lhsStride,
00323 const RhsScalar* rhs, Index rhsIncr,
00324 ResScalar* res, Index resIncr,
00325 ResScalar alpha)
00326 {
00327 EIGEN_UNUSED_VARIABLE(rhsIncr);
00328 eigen_internal_assert(rhsIncr==1);
00329 #ifdef _EIGEN_ACCUMULATE_PACKETS
00330 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00331 #endif
00332
00333 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
00334 RhsPacket b = pload<RhsPacket>(&rhs[j]); \
00335 ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
00336 ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
00337 ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
00338 ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
00339
00340 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00341 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00342
00343 enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
00344 const Index rowsAtOnce = 4;
00345 const Index peels = 2;
00346 const Index RhsPacketAlignedMask = RhsPacketSize-1;
00347 const Index LhsPacketAlignedMask = LhsPacketSize-1;
00348 const Index PeelAlignedMask = RhsPacketSize*peels-1;
00349 const Index depth = cols;
00350
00351
00352
00353
00354 Index alignedStart = first_aligned(rhs, depth);
00355 Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
00356 const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
00357
00358 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
00359 Index alignmentPattern = alignmentStep==0 ? AllAligned
00360 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
00361 : FirstAligned;
00362
00363
00364 const Index lhsAlignmentOffset = first_aligned(lhs,depth);
00365
00366
00367 Index skipRows = 0;
00368
00369 if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
00370 {
00371 alignedSize = 0;
00372 alignedStart = 0;
00373 }
00374 else if (LhsPacketSize>1)
00375 {
00376 eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
00377
00378 while (skipRows<LhsPacketSize &&
00379 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
00380 ++skipRows;
00381 if (skipRows==LhsPacketSize)
00382 {
00383
00384 alignmentPattern = NoneAligned;
00385 skipRows = 0;
00386 }
00387 else
00388 {
00389 skipRows = (std::min)(skipRows,Index(rows));
00390
00391 }
00392 eigen_internal_assert( alignmentPattern==NoneAligned
00393 || LhsPacketSize==1
00394 || (skipRows + rowsAtOnce >= rows)
00395 || LhsPacketSize > depth
00396 || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
00397 }
00398 else if(Vectorizable)
00399 {
00400 alignedStart = 0;
00401 alignedSize = depth;
00402 alignmentPattern = AllAligned;
00403 }
00404
00405 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
00406 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
00407
00408 Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
00409 for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
00410 {
00411 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
00412 ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
00413
00414
00415 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00416 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00417
00418 if (Vectorizable)
00419 {
00420
00421 ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
00422 ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
00423
00424
00425
00426 for (Index j=0; j<alignedStart; ++j)
00427 {
00428 RhsScalar b = rhs[j];
00429 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
00430 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
00431 }
00432
00433 if (alignedSize>alignedStart)
00434 {
00435 switch(alignmentPattern)
00436 {
00437 case AllAligned:
00438 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00439 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00440 break;
00441 case EvenAligned:
00442 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00443 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00444 break;
00445 case FirstAligned:
00446 if (peels>1)
00447 {
00448
00449
00450
00451
00452
00453
00454 LhsPacket A01, A02, A03, A11, A12, A13;
00455 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
00456 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
00457 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
00458
00459 for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize)
00460 {
00461 RhsPacket b = pload<RhsPacket>(&rhs[j]);
00462 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
00463 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
00464 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
00465
00466 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
00467 ptmp1 = pcj.pmadd(A01, b, ptmp1);
00468 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
00469 ptmp2 = pcj.pmadd(A02, b, ptmp2);
00470 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
00471 ptmp3 = pcj.pmadd(A03, b, ptmp3);
00472 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
00473
00474 b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
00475 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
00476 ptmp1 = pcj.pmadd(A11, b, ptmp1);
00477 ptmp2 = pcj.pmadd(A12, b, ptmp2);
00478 ptmp3 = pcj.pmadd(A13, b, ptmp3);
00479 }
00480 }
00481 for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize)
00482 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00483 break;
00484 default:
00485 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00486 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00487 break;
00488 }
00489 tmp0 += predux(ptmp0);
00490 tmp1 += predux(ptmp1);
00491 tmp2 += predux(ptmp2);
00492 tmp3 += predux(ptmp3);
00493 }
00494 }
00495
00496
00497
00498 for (Index j=alignedSize; j<depth; ++j)
00499 {
00500 RhsScalar b = rhs[j];
00501 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
00502 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
00503 }
00504 res[i*resIncr] += alpha*tmp0;
00505 res[(i+offset1)*resIncr] += alpha*tmp1;
00506 res[(i+2)*resIncr] += alpha*tmp2;
00507 res[(i+offset3)*resIncr] += alpha*tmp3;
00508 }
00509
00510
00511 Index end = rows;
00512 Index start = rowBound;
00513 do
00514 {
00515 for (Index i=start; i<end; ++i)
00516 {
00517 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
00518 ResPacket ptmp0 = pset1<ResPacket>(tmp0);
00519 const LhsScalar* lhs0 = lhs + i*lhsStride;
00520
00521
00522 for (Index j=0; j<alignedStart; ++j)
00523 tmp0 += cj.pmul(lhs0[j], rhs[j]);
00524
00525 if (alignedSize>alignedStart)
00526 {
00527
00528 if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
00529 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
00530 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
00531 else
00532 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
00533 ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
00534 tmp0 += predux(ptmp0);
00535 }
00536
00537
00538
00539 for (Index j=alignedSize; j<depth; ++j)
00540 tmp0 += cj.pmul(lhs0[j], rhs[j]);
00541 res[i*resIncr] += alpha*tmp0;
00542 }
00543 if (skipRows)
00544 {
00545 start = 0;
00546 end = skipRows;
00547 skipRows = 0;
00548 }
00549 else
00550 break;
00551 } while(Vectorizable);
00552
00553 #undef _EIGEN_ACCUMULATE_PACKETS
00554 }
00555 };
00556
00557 }
00558
00559 #endif // EIGEN_GENERAL_MATRIX_VECTOR_H