00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
00011 #define EIGEN_GENERAL_MATRIX_VECTOR_H
00012
00013 namespace Eigen {
00014
00015 namespace internal {
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00031 struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
00032 {
00033 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00034
00035 enum {
00036 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
00037 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
00038 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00039 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00040 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
00041 };
00042
00043 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
00044 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
00045 typedef typename packet_traits<ResScalar>::type _ResPacket;
00046
00047 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00048 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00049 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00050
00051 EIGEN_DONT_INLINE static void run(
00052 Index rows, Index cols,
00053 const LhsScalar* lhs, Index lhsStride,
00054 const RhsScalar* rhs, Index rhsIncr,
00055 ResScalar* res, Index
00056 #ifdef EIGEN_INTERNAL_DEBUGGING
00057 resIncr
00058 #endif
00059 , RhsScalar alpha)
00060 {
00061 eigen_internal_assert(resIncr==1);
00062 #ifdef _EIGEN_ACCUMULATE_PACKETS
00063 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00064 #endif
00065 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
00066 pstore(&res[j], \
00067 padd(pload<ResPacket>(&res[j]), \
00068 padd( \
00069 padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
00070 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
00071 padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
00072 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
00073
00074 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00075 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00076 if(ConjugateRhs)
00077 alpha = conj(alpha);
00078
00079 enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
00080 const Index columnsAtOnce = 4;
00081 const Index peels = 2;
00082 const Index LhsPacketAlignedMask = LhsPacketSize-1;
00083 const Index ResPacketAlignedMask = ResPacketSize-1;
00084 const Index size = rows;
00085
00086
00087
00088 Index alignedStart = internal::first_aligned(res,size);
00089 Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
00090 const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
00091
00092 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
00093 Index alignmentPattern = alignmentStep==0 ? AllAligned
00094 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
00095 : FirstAligned;
00096
00097
00098 const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
00099
00100
00101 Index skipColumns = 0;
00102
00103 if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
00104 {
00105 alignedSize = 0;
00106 alignedStart = 0;
00107 }
00108 else if (LhsPacketSize>1)
00109 {
00110 eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
00111
00112 while (skipColumns<LhsPacketSize &&
00113 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
00114 ++skipColumns;
00115 if (skipColumns==LhsPacketSize)
00116 {
00117
00118 alignmentPattern = NoneAligned;
00119 skipColumns = 0;
00120 }
00121 else
00122 {
00123 skipColumns = (std::min)(skipColumns,cols);
00124
00125 }
00126
00127 eigen_internal_assert( (alignmentPattern==NoneAligned)
00128 || (skipColumns + columnsAtOnce >= cols)
00129 || LhsPacketSize > size
00130 || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
00131 }
00132 else if(Vectorizable)
00133 {
00134 alignedStart = 0;
00135 alignedSize = size;
00136 alignmentPattern = AllAligned;
00137 }
00138
00139 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
00140 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
00141
00142 Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
00143 for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
00144 {
00145 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
00146 ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
00147 ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
00148 ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
00149
00150
00151 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00152 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00153
00154 if (Vectorizable)
00155 {
00156
00157
00158 for (Index j=0; j<alignedStart; ++j)
00159 {
00160 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
00161 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
00162 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
00163 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
00164 }
00165
00166 if (alignedSize>alignedStart)
00167 {
00168 switch(alignmentPattern)
00169 {
00170 case AllAligned:
00171 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00172 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00173 break;
00174 case EvenAligned:
00175 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00176 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00177 break;
00178 case FirstAligned:
00179 {
00180 Index j = alignedStart;
00181 if(peels>1)
00182 {
00183 LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
00184 ResPacket T0, T1;
00185
00186 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
00187 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
00188 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
00189
00190 for (; j<peeledSize; j+=peels*ResPacketSize)
00191 {
00192 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
00193 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
00194 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
00195
00196 A00 = pload<LhsPacket>(&lhs0[j]);
00197 A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
00198 T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
00199 T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
00200
00201 T0 = pcj.pmadd(A01, ptmp1, T0);
00202 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
00203 T0 = pcj.pmadd(A02, ptmp2, T0);
00204 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
00205 T0 = pcj.pmadd(A03, ptmp3, T0);
00206 pstore(&res[j],T0);
00207 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
00208 T1 = pcj.pmadd(A11, ptmp1, T1);
00209 T1 = pcj.pmadd(A12, ptmp2, T1);
00210 T1 = pcj.pmadd(A13, ptmp3, T1);
00211 pstore(&res[j+ResPacketSize],T1);
00212 }
00213 }
00214 for (; j<alignedSize; j+=ResPacketSize)
00215 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00216 break;
00217 }
00218 default:
00219 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00220 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00221 break;
00222 }
00223 }
00224 }
00225
00226
00227 for (Index j=alignedSize; j<size; ++j)
00228 {
00229 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
00230 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
00231 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
00232 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
00233 }
00234 }
00235
00236
00237 Index end = cols;
00238 Index start = columnBound;
00239 do
00240 {
00241 for (Index k=start; k<end; ++k)
00242 {
00243 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
00244 const LhsScalar* lhs0 = lhs + k*lhsStride;
00245
00246 if (Vectorizable)
00247 {
00248
00249
00250 for (Index j=0; j<alignedStart; ++j)
00251 res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
00252
00253 if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
00254 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
00255 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
00256 else
00257 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
00258 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
00259 }
00260
00261
00262 for (Index i=alignedSize; i<size; ++i)
00263 res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
00264 }
00265 if (skipColumns)
00266 {
00267 start = 0;
00268 end = skipColumns;
00269 skipColumns = 0;
00270 }
00271 else
00272 break;
00273 } while(Vectorizable);
00274 #undef _EIGEN_ACCUMULATE_PACKETS
00275 }
00276 };
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00289 struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
00290 {
00291 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00292
00293 enum {
00294 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
00295 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
00296 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00297 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00298 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
00299 };
00300
00301 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
00302 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
00303 typedef typename packet_traits<ResScalar>::type _ResPacket;
00304
00305 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00306 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00307 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00308
00309 EIGEN_DONT_INLINE static void run(
00310 Index rows, Index cols,
00311 const LhsScalar* lhs, Index lhsStride,
00312 const RhsScalar* rhs, Index rhsIncr,
00313 ResScalar* res, Index resIncr,
00314 ResScalar alpha)
00315 {
00316 EIGEN_UNUSED_VARIABLE(rhsIncr);
00317 eigen_internal_assert(rhsIncr==1);
00318 #ifdef _EIGEN_ACCUMULATE_PACKETS
00319 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00320 #endif
00321
00322 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
00323 RhsPacket b = pload<RhsPacket>(&rhs[j]); \
00324 ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
00325 ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
00326 ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
00327 ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
00328
00329 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00330 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00331
00332 enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
00333 const Index rowsAtOnce = 4;
00334 const Index peels = 2;
00335 const Index RhsPacketAlignedMask = RhsPacketSize-1;
00336 const Index LhsPacketAlignedMask = LhsPacketSize-1;
00337 const Index depth = cols;
00338
00339
00340
00341
00342 Index alignedStart = internal::first_aligned(rhs, depth);
00343 Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
00344 const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
00345
00346 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
00347 Index alignmentPattern = alignmentStep==0 ? AllAligned
00348 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
00349 : FirstAligned;
00350
00351
00352 const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
00353
00354
00355 Index skipRows = 0;
00356
00357 if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
00358 {
00359 alignedSize = 0;
00360 alignedStart = 0;
00361 }
00362 else if (LhsPacketSize>1)
00363 {
00364 eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
00365
00366 while (skipRows<LhsPacketSize &&
00367 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
00368 ++skipRows;
00369 if (skipRows==LhsPacketSize)
00370 {
00371
00372 alignmentPattern = NoneAligned;
00373 skipRows = 0;
00374 }
00375 else
00376 {
00377 skipRows = (std::min)(skipRows,Index(rows));
00378
00379 }
00380 eigen_internal_assert( alignmentPattern==NoneAligned
00381 || LhsPacketSize==1
00382 || (skipRows + rowsAtOnce >= rows)
00383 || LhsPacketSize > depth
00384 || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
00385 }
00386 else if(Vectorizable)
00387 {
00388 alignedStart = 0;
00389 alignedSize = depth;
00390 alignmentPattern = AllAligned;
00391 }
00392
00393 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
00394 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
00395
00396 Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
00397 for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
00398 {
00399 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
00400 ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
00401
00402
00403 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00404 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00405
00406 if (Vectorizable)
00407 {
00408
00409 ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
00410 ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
00411
00412
00413
00414 for (Index j=0; j<alignedStart; ++j)
00415 {
00416 RhsScalar b = rhs[j];
00417 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
00418 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
00419 }
00420
00421 if (alignedSize>alignedStart)
00422 {
00423 switch(alignmentPattern)
00424 {
00425 case AllAligned:
00426 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00427 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00428 break;
00429 case EvenAligned:
00430 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00431 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00432 break;
00433 case FirstAligned:
00434 {
00435 Index j = alignedStart;
00436 if (peels>1)
00437 {
00438
00439
00440
00441
00442
00443
00444 LhsPacket A01, A02, A03, A11, A12, A13;
00445 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
00446 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
00447 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
00448
00449 for (; j<peeledSize; j+=peels*RhsPacketSize)
00450 {
00451 RhsPacket b = pload<RhsPacket>(&rhs[j]);
00452 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
00453 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
00454 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
00455
00456 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
00457 ptmp1 = pcj.pmadd(A01, b, ptmp1);
00458 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
00459 ptmp2 = pcj.pmadd(A02, b, ptmp2);
00460 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
00461 ptmp3 = pcj.pmadd(A03, b, ptmp3);
00462 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
00463
00464 b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
00465 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
00466 ptmp1 = pcj.pmadd(A11, b, ptmp1);
00467 ptmp2 = pcj.pmadd(A12, b, ptmp2);
00468 ptmp3 = pcj.pmadd(A13, b, ptmp3);
00469 }
00470 }
00471 for (; j<alignedSize; j+=RhsPacketSize)
00472 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00473 break;
00474 }
00475 default:
00476 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00477 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00478 break;
00479 }
00480 tmp0 += predux(ptmp0);
00481 tmp1 += predux(ptmp1);
00482 tmp2 += predux(ptmp2);
00483 tmp3 += predux(ptmp3);
00484 }
00485 }
00486
00487
00488
00489 for (Index j=alignedSize; j<depth; ++j)
00490 {
00491 RhsScalar b = rhs[j];
00492 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
00493 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
00494 }
00495 res[i*resIncr] += alpha*tmp0;
00496 res[(i+offset1)*resIncr] += alpha*tmp1;
00497 res[(i+2)*resIncr] += alpha*tmp2;
00498 res[(i+offset3)*resIncr] += alpha*tmp3;
00499 }
00500
00501
00502 Index end = rows;
00503 Index start = rowBound;
00504 do
00505 {
00506 for (Index i=start; i<end; ++i)
00507 {
00508 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
00509 ResPacket ptmp0 = pset1<ResPacket>(tmp0);
00510 const LhsScalar* lhs0 = lhs + i*lhsStride;
00511
00512
00513 for (Index j=0; j<alignedStart; ++j)
00514 tmp0 += cj.pmul(lhs0[j], rhs[j]);
00515
00516 if (alignedSize>alignedStart)
00517 {
00518
00519 if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
00520 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
00521 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
00522 else
00523 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
00524 ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
00525 tmp0 += predux(ptmp0);
00526 }
00527
00528
00529
00530 for (Index j=alignedSize; j<depth; ++j)
00531 tmp0 += cj.pmul(lhs0[j], rhs[j]);
00532 res[i*resIncr] += alpha*tmp0;
00533 }
00534 if (skipRows)
00535 {
00536 start = 0;
00537 end = skipRows;
00538 skipRows = 0;
00539 }
00540 else
00541 break;
00542 } while(Vectorizable);
00543
00544 #undef _EIGEN_ACCUMULATE_PACKETS
00545 }
00546 };
00547
00548 }
00549
00550 }
00551
00552 #endif // EIGEN_GENERAL_MATRIX_VECTOR_H