GeneralMatrixVector.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which both reduces
 * the number of loads/stores of the result by a factor of 4 and reduces
 * instruction dependencies. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
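/* For the "invalid" cplx*real*cplx row above, a caller-side sketch of the
 * required two-step evaluation (illustrative only, not part of this file):
 *
 *   MatrixXcf A;               // complex lhs
 *   VectorXf  b;               // real rhs
 *   VectorXcf c;
 *   std::complex<float> alpha;
 *   VectorXcf tmp = A * b;     // tmp = A * B
 *   c += alpha * tmp;          // C += alpha * tmp
 */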
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};
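// e.g., with 128-bit SSE packets (an illustrative assumption): float/float gives
// 4==4 lanes, hence Vectorizable; double/float gives 2!=4, so the kernel falls
// back to scalar code.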

typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
typedef typename packet_traits<ResScalar>::type  _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index
  #ifdef EIGEN_INTERNAL_DEBUGGING
    resIncr
  #endif
  , RhsScalar alpha)
{
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),  ptmp0), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),  ptmp2), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
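  // The A0/A13/A2 tokens are pasted onto "ploa" by EIGEN_CAT, so "d" selects an
  // aligned pload and "du" an unaligned ploadu for the corresponding lhs column
  // (lhs1 and lhs3 share the A13 token because they share the same misalignment).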

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
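  // Illustrative numbers (assuming 4-lane float packets): if the first 16-byte
  // aligned element of res is res[3] and size==10, then alignedStart==3 and
  // alignedSize==3+((10-3)&~3)==7, so res[3..6] is processed as one aligned
  // packet and res[7..9] falls to the scalar epilogue.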

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                       : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                       : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                      || (skipColumns + columnsAtOnce >= cols)
                      || LhsPacketSize > size
                      || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps a lot in generating better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
  #undef _EIGEN_ACCUMULATE_PACKETS
}
};
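
/* Usage sketch (an assumed call site, not part of this file): a col-major
 * GEMV such as
 *
 *   y.noalias() += alpha * A * x;   // MatrixXf A; VectorXf x, y; float alpha;
 *
 * is eventually routed by Eigen's product dispatch to a call of roughly this
 * shape:
 *
 *   general_matrix_vector_product<Index,float,ColMajor,false,float,false>
 *     ::run(A.rows(), A.cols(), A.data(), A.outerStride(),
 *           x.data(), 1, y.data(), 1, alpha);
 */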

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which both reduces
 * the number of loads/stores of the result by a factor of 4 and reduces
 * instruction dependencies. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
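/* In scalar terms, the kernel below computes, for each row i:
 *
 *   res[i*resIncr] += alpha * sum_j lhs[i*lhsStride+j] * rhs[j]
 *
 * (with optional conjugation of either factor), i.e. one dot product per row,
 * four rows accumulated per pass.
 */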
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
typedef typename packet_traits<ResScalar>::type  _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
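  // As in the col-major kernel, the A0/A13/A2 tokens are pasted onto "ploa" to
  // select pload (aligned, "d") or ploadu (unaligned, "du") per lhs row; here
  // the products are accumulated into four running dot-product packets instead
  // of being stored back to res.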

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index depth = cols;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                      || LhsPacketSize==1
                      || (skipRows + rowsAtOnce >= rows)
                      || LhsPacketSize > depth
                      || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generate good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads are
               * handled using special shift/move operations between the two
               * aligned packets overlapping the desired unaligned packet. This
               * is *much* more efficient than basic unaligned loads.
               */
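              // e.g., with 4-lane packets, palign<2>(A,B) overwrites A with
              // {A[2], A[3], B[0], B[1]} (a sketch of the generic semantics):
              // the unaligned packet is rebuilt from two overlapping aligned loads.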
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr]            += alpha*tmp0;
    res[(i+offset1)*resIncr]  += alpha*tmp1;
    res[(i+2)*resIncr]        += alpha*tmp2;
    res[(i+offset3)*resIncr]  += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned rhs coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}
};

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H

