00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
00011 #define EIGEN_GENERAL_MATRIX_VECTOR_H
00012
00013 namespace Eigen {
00014
00015 namespace internal {
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00031 struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
00032 {
00033 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00034
00035 enum {
00036 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
00037 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
00038 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00039 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00040 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
00041 };
00042
00043 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
00044 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
00045 typedef typename packet_traits<ResScalar>::type _ResPacket;
00046
00047 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00048 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00049 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00050
00051 EIGEN_DONT_INLINE static void run(
00052 Index rows, Index cols,
00053 const LhsScalar* lhs, Index lhsStride,
00054 const RhsScalar* rhs, Index rhsIncr,
00055 ResScalar* res, Index resIncr, RhsScalar alpha);
00056 };
00057
00058 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00059 EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
00060 Index rows, Index cols,
00061 const LhsScalar* lhs, Index lhsStride,
00062 const RhsScalar* rhs, Index rhsIncr,
00063 ResScalar* res, Index resIncr, RhsScalar alpha)
00064 {
00065 EIGEN_UNUSED_VARIABLE(resIncr)
00066 eigen_internal_assert(resIncr==1);
00067 #ifdef _EIGEN_ACCUMULATE_PACKETS
00068 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00069 #endif
00070 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
00071 pstore(&res[j], \
00072 padd(pload<ResPacket>(&res[j]), \
00073 padd( \
00074 padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
00075 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
00076 padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
00077 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
00078
00079 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00080 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00081 if(ConjugateRhs)
00082 alpha = numext::conj(alpha);
00083
00084 enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
00085 const Index columnsAtOnce = 4;
00086 const Index peels = 2;
00087 const Index LhsPacketAlignedMask = LhsPacketSize-1;
00088 const Index ResPacketAlignedMask = ResPacketSize-1;
00089
00090 const Index size = rows;
00091
00092
00093
00094 Index alignedStart = internal::first_aligned(res,size);
00095 Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
00096 const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
00097
00098 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
00099 Index alignmentPattern = alignmentStep==0 ? AllAligned
00100 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
00101 : FirstAligned;
00102
00103
00104 const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
00105
00106
00107 Index skipColumns = 0;
00108
00109 if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
00110 {
00111 alignedSize = 0;
00112 alignedStart = 0;
00113 }
00114 else if (LhsPacketSize>1)
00115 {
00116 eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
00117
00118 while (skipColumns<LhsPacketSize &&
00119 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
00120 ++skipColumns;
00121 if (skipColumns==LhsPacketSize)
00122 {
00123
00124 alignmentPattern = NoneAligned;
00125 skipColumns = 0;
00126 }
00127 else
00128 {
00129 skipColumns = (std::min)(skipColumns,cols);
00130
00131 }
00132
00133 eigen_internal_assert( (alignmentPattern==NoneAligned)
00134 || (skipColumns + columnsAtOnce >= cols)
00135 || LhsPacketSize > size
00136 || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
00137 }
00138 else if(Vectorizable)
00139 {
00140 alignedStart = 0;
00141 alignedSize = size;
00142 alignmentPattern = AllAligned;
00143 }
00144
00145 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
00146 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
00147
00148 Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
00149 for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
00150 {
00151 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
00152 ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
00153 ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
00154 ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
00155
00156
00157 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00158 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00159
00160 if (Vectorizable)
00161 {
00162
00163
00164 for (Index j=0; j<alignedStart; ++j)
00165 {
00166 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
00167 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
00168 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
00169 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
00170 }
00171
00172 if (alignedSize>alignedStart)
00173 {
00174 switch(alignmentPattern)
00175 {
00176 case AllAligned:
00177 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00178 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00179 break;
00180 case EvenAligned:
00181 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00182 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00183 break;
00184 case FirstAligned:
00185 {
00186 Index j = alignedStart;
00187 if(peels>1)
00188 {
00189 LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
00190 ResPacket T0, T1;
00191
00192 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
00193 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
00194 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
00195
00196 for (; j<peeledSize; j+=peels*ResPacketSize)
00197 {
00198 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
00199 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
00200 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
00201
00202 A00 = pload<LhsPacket>(&lhs0[j]);
00203 A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
00204 T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
00205 T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
00206
00207 T0 = pcj.pmadd(A01, ptmp1, T0);
00208 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
00209 T0 = pcj.pmadd(A02, ptmp2, T0);
00210 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
00211 T0 = pcj.pmadd(A03, ptmp3, T0);
00212 pstore(&res[j],T0);
00213 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
00214 T1 = pcj.pmadd(A11, ptmp1, T1);
00215 T1 = pcj.pmadd(A12, ptmp2, T1);
00216 T1 = pcj.pmadd(A13, ptmp3, T1);
00217 pstore(&res[j+ResPacketSize],T1);
00218 }
00219 }
00220 for (; j<alignedSize; j+=ResPacketSize)
00221 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00222 break;
00223 }
00224 default:
00225 for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
00226 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00227 break;
00228 }
00229 }
00230 }
00231
00232
00233 for (Index j=alignedSize; j<size; ++j)
00234 {
00235 res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
00236 res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
00237 res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
00238 res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
00239 }
00240 }
00241
00242
00243 Index end = cols;
00244 Index start = columnBound;
00245 do
00246 {
00247 for (Index k=start; k<end; ++k)
00248 {
00249 RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
00250 const LhsScalar* lhs0 = lhs + k*lhsStride;
00251
00252 if (Vectorizable)
00253 {
00254
00255
00256 for (Index j=0; j<alignedStart; ++j)
00257 res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
00258
00259 if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
00260 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
00261 pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
00262 else
00263 for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
00264 pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
00265 }
00266
00267
00268 for (Index i=alignedSize; i<size; ++i)
00269 res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
00270 }
00271 if (skipColumns)
00272 {
00273 start = 0;
00274 end = skipColumns;
00275 skipColumns = 0;
00276 }
00277 else
00278 break;
00279 } while(Vectorizable);
00280 #undef _EIGEN_ACCUMULATE_PACKETS
00281 }
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00294 struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
00295 {
00296 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
00297
00298 enum {
00299 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
00300 && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
00301 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
00302 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
00303 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
00304 };
00305
00306 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
00307 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
00308 typedef typename packet_traits<ResScalar>::type _ResPacket;
00309
00310 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
00311 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
00312 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
00313
00314 EIGEN_DONT_INLINE static void run(
00315 Index rows, Index cols,
00316 const LhsScalar* lhs, Index lhsStride,
00317 const RhsScalar* rhs, Index rhsIncr,
00318 ResScalar* res, Index resIncr,
00319 ResScalar alpha);
00320 };
00321
00322 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
00323 EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
00324 Index rows, Index cols,
00325 const LhsScalar* lhs, Index lhsStride,
00326 const RhsScalar* rhs, Index rhsIncr,
00327 ResScalar* res, Index resIncr,
00328 ResScalar alpha)
00329 {
00330 EIGEN_UNUSED_VARIABLE(rhsIncr);
00331 eigen_internal_assert(rhsIncr==1);
00332 #ifdef _EIGEN_ACCUMULATE_PACKETS
00333 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00334 #endif
00335
00336 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
00337 RhsPacket b = pload<RhsPacket>(&rhs[j]); \
00338 ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
00339 ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
00340 ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
00341 ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
00342
00343 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
00344 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
00345
00346 enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
00347 const Index rowsAtOnce = 4;
00348 const Index peels = 2;
00349 const Index RhsPacketAlignedMask = RhsPacketSize-1;
00350 const Index LhsPacketAlignedMask = LhsPacketSize-1;
00351
00352 const Index depth = cols;
00353
00354
00355
00356
00357 Index alignedStart = internal::first_aligned(rhs, depth);
00358 Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
00359 const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
00360
00361 const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
00362 Index alignmentPattern = alignmentStep==0 ? AllAligned
00363 : alignmentStep==(LhsPacketSize/2) ? EvenAligned
00364 : FirstAligned;
00365
00366
00367 const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
00368
00369
00370 Index skipRows = 0;
00371
00372 if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
00373 {
00374 alignedSize = 0;
00375 alignedStart = 0;
00376 }
00377 else if (LhsPacketSize>1)
00378 {
00379 eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
00380
00381 while (skipRows<LhsPacketSize &&
00382 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
00383 ++skipRows;
00384 if (skipRows==LhsPacketSize)
00385 {
00386
00387 alignmentPattern = NoneAligned;
00388 skipRows = 0;
00389 }
00390 else
00391 {
00392 skipRows = (std::min)(skipRows,Index(rows));
00393
00394 }
00395 eigen_internal_assert( alignmentPattern==NoneAligned
00396 || LhsPacketSize==1
00397 || (skipRows + rowsAtOnce >= rows)
00398 || LhsPacketSize > depth
00399 || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
00400 }
00401 else if(Vectorizable)
00402 {
00403 alignedStart = 0;
00404 alignedSize = depth;
00405 alignmentPattern = AllAligned;
00406 }
00407
00408 Index offset1 = (FirstAligned && alignmentStep==1?3:1);
00409 Index offset3 = (FirstAligned && alignmentStep==1?1:3);
00410
00411 Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
00412 for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
00413 {
00414 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
00415 ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
00416
00417
00418 const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00419 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00420
00421 if (Vectorizable)
00422 {
00423
00424 ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
00425 ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
00426
00427
00428
00429 for (Index j=0; j<alignedStart; ++j)
00430 {
00431 RhsScalar b = rhs[j];
00432 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
00433 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
00434 }
00435
00436 if (alignedSize>alignedStart)
00437 {
00438 switch(alignmentPattern)
00439 {
00440 case AllAligned:
00441 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00442 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00443 break;
00444 case EvenAligned:
00445 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00446 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00447 break;
00448 case FirstAligned:
00449 {
00450 Index j = alignedStart;
00451 if (peels>1)
00452 {
00453
00454
00455
00456
00457
00458
00459 LhsPacket A01, A02, A03, A11, A12, A13;
00460 A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
00461 A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
00462 A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
00463
00464 for (; j<peeledSize; j+=peels*RhsPacketSize)
00465 {
00466 RhsPacket b = pload<RhsPacket>(&rhs[j]);
00467 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
00468 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
00469 A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
00470
00471 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
00472 ptmp1 = pcj.pmadd(A01, b, ptmp1);
00473 A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
00474 ptmp2 = pcj.pmadd(A02, b, ptmp2);
00475 A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
00476 ptmp3 = pcj.pmadd(A03, b, ptmp3);
00477 A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
00478
00479 b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
00480 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
00481 ptmp1 = pcj.pmadd(A11, b, ptmp1);
00482 ptmp2 = pcj.pmadd(A12, b, ptmp2);
00483 ptmp3 = pcj.pmadd(A13, b, ptmp3);
00484 }
00485 }
00486 for (; j<alignedSize; j+=RhsPacketSize)
00487 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00488 break;
00489 }
00490 default:
00491 for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
00492 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00493 break;
00494 }
00495 tmp0 += predux(ptmp0);
00496 tmp1 += predux(ptmp1);
00497 tmp2 += predux(ptmp2);
00498 tmp3 += predux(ptmp3);
00499 }
00500 }
00501
00502
00503
00504 for (Index j=alignedSize; j<depth; ++j)
00505 {
00506 RhsScalar b = rhs[j];
00507 tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
00508 tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
00509 }
00510 res[i*resIncr] += alpha*tmp0;
00511 res[(i+offset1)*resIncr] += alpha*tmp1;
00512 res[(i+2)*resIncr] += alpha*tmp2;
00513 res[(i+offset3)*resIncr] += alpha*tmp3;
00514 }
00515
00516
00517 Index end = rows;
00518 Index start = rowBound;
00519 do
00520 {
00521 for (Index i=start; i<end; ++i)
00522 {
00523 EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
00524 ResPacket ptmp0 = pset1<ResPacket>(tmp0);
00525 const LhsScalar* lhs0 = lhs + i*lhsStride;
00526
00527
00528 for (Index j=0; j<alignedStart; ++j)
00529 tmp0 += cj.pmul(lhs0[j], rhs[j]);
00530
00531 if (alignedSize>alignedStart)
00532 {
00533
00534 if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
00535 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
00536 ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
00537 else
00538 for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
00539 ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
00540 tmp0 += predux(ptmp0);
00541 }
00542
00543
00544
00545 for (Index j=alignedSize; j<depth; ++j)
00546 tmp0 += cj.pmul(lhs0[j], rhs[j]);
00547 res[i*resIncr] += alpha*tmp0;
00548 }
00549 if (skipRows)
00550 {
00551 start = 0;
00552 end = skipRows;
00553 skipRows = 0;
00554 }
00555 else
00556 break;
00557 } while(Vectorizable);
00558
00559 #undef _EIGEN_ACCUMULATE_PACKETS
00560 }
00561
00562 }
00563
00564 }
00565
00566 #endif // EIGEN_GENERAL_MATRIX_VECTOR_H