hand_vec_interface.hh
Go to the documentation of this file.
00001 //=====================================================
00002 // File   :  hand_vec_interface.hh
00003 // Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
00004 //=====================================================
00005 //
00006 // This program is free software; you can redistribute it and/or
00007 // modify it under the terms of the GNU General Public License
00008 // as published by the Free Software Foundation; either version 2
00009 // of the License, or (at your option) any later version.
00010 //
00011 // This program is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 // You should have received a copy of the GNU General Public License
00016 // along with this program; if not, write to the Free Software
00017 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00018 //
00019 #ifndef HAND_VEC_INTERFACE_HH
00020 #define HAND_VEC_INTERFACE_HH
00021 
00022 #include <Eigen/Core>
00023 #include "f77_interface.hh"
00024 
00025 using namespace Eigen;
00026 
00027 template<class real>
00028 class hand_vec_interface : public f77_interface_base<real> {
00029 
00030 public :
00031 
00032   typedef typename internal::packet_traits<real>::type Packet;
00033   static const int PacketSize = internal::packet_traits<real>::size;
00034 
00035   typedef typename f77_interface_base<real>::stl_matrix stl_matrix;
00036   typedef typename f77_interface_base<real>::stl_vector stl_vector;
00037   typedef typename f77_interface_base<real>::gene_matrix gene_matrix;
00038   typedef typename f77_interface_base<real>::gene_vector gene_vector;
00039 
00040   static void free_matrix(gene_matrix & A, int N){
00041     internal::aligned_free(A);
00042   }
00043 
00044   static void free_vector(gene_vector & B){
00045     internal::aligned_free(B);
00046   }
00047 
00048   static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
00049     int N = A_stl.size();
00050     A = (real*)internal::aligned_malloc(N*N*sizeof(real));
00051     for (int j=0;j<N;j++)
00052       for (int i=0;i<N;i++)
00053         A[i+N*j] = A_stl[j][i];
00054   }
00055 
00056   static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){
00057     int N = B_stl.size();
00058     B = (real*)internal::aligned_malloc(N*sizeof(real));
00059     for (int i=0;i<N;i++)
00060       B[i] = B_stl[i];
00061   }
00062 
00063   static inline std::string name() {
00064     #ifdef PEELING
00065     return "hand_vectorized_peeling";
00066     #else
00067     return "hand_vectorized";
00068     #endif
00069   }
00070 
00071   static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
00072   {
00073     asm("#begin matrix_vector_product");
00074     int AN = (N/PacketSize)*PacketSize;
00075     int ANP = (AN/(2*PacketSize))*2*PacketSize;
00076     int bound = (N/4)*4;
00077     for (int i=0;i<N;i++)
00078       X[i] = 0;
00079 
00080     for (int i=0;i<bound;i+=4)
00081     {
00082       register real* __restrict__ A0 = A + i*N;
00083       register real* __restrict__ A1 = A + (i+1)*N;
00084       register real* __restrict__ A2 = A + (i+2)*N;
00085       register real* __restrict__ A3 = A + (i+3)*N;
00086 
00087       Packet ptmp0 = internal::pset1(B[i]);
00088       Packet ptmp1 = internal::pset1(B[i+1]);
00089       Packet ptmp2 = internal::pset1(B[i+2]);
00090       Packet ptmp3 = internal::pset1(B[i+3]);
00091 //       register Packet ptmp0, ptmp1, ptmp2, ptmp3;
00092 //       asm(
00093 //
00094 //           "movss     (%[B],%[j],4), %[ptmp0]  \n\t"
00095 //           "shufps   $0,%[ptmp0],%[ptmp0] \n\t"
00096 //           "movss    4(%[B],%[j],4), %[ptmp1]  \n\t"
00097 //           "shufps   $0,%[ptmp1],%[ptmp1] \n\t"
00098 //           "movss    8(%[B],%[j],4), %[ptmp2]  \n\t"
00099 //           "shufps   $0,%[ptmp2],%[ptmp2] \n\t"
00100 //           "movss   12(%[B],%[j],4), %[ptmp3]  \n\t"
00101 //           "shufps   $0,%[ptmp3],%[ptmp3] \n\t"
00102 //           : [ptmp0] "=x" (ptmp0),
00103 //             [ptmp1] "=x" (ptmp1),
00104 //             [ptmp2] "=x" (ptmp2),
00105 //             [ptmp3] "=x" (ptmp3)
00106 //           : [B] "r" (B),
00107 //             [j] "r" (size_t(i))
00108 //           : );
00109 
00110       if (AN>0)
00111       {
00112 //         for (size_t j = 0;j<ANP;j+=8)
00113 //         {
00114 //           asm(
00115 //
00116 //           "movaps     (%[A0],%[j],4), %%xmm8  \n\t"
00117 //           "movaps   16(%[A0],%[j],4), %%xmm12 \n\t"
00118 //           "movups     (%[A3],%[j],4), %%xmm11 \n\t"
00119 //           "movups   16(%[A3],%[j],4), %%xmm15 \n\t"
00120 //           "movups     (%[A2],%[j],4), %%xmm10 \n\t"
00121 //           "movups   16(%[A2],%[j],4), %%xmm14 \n\t"
00122 //           "movups     (%[A1],%[j],4), %%xmm9  \n\t"
00123 //           "movups   16(%[A1],%[j],4), %%xmm13 \n\t"
00124 //
00125 //           "mulps %[ptmp0], %%xmm8  \n\t"
00126 //           "addps (%[res0],%[j],4), %%xmm8  \n\t"
00127 //           "mulps %[ptmp3], %%xmm11 \n\t"
00128 //           "addps %%xmm11, %%xmm8  \n\t"
00129 //           "mulps %[ptmp2], %%xmm10 \n\t"
00130 //           "addps %%xmm10, %%xmm8  \n\t"
00131 //           "mulps %[ptmp1], %%xmm9  \n\t"
00132 //           "addps %%xmm9, %%xmm8   \n\t"
00133 //           "movaps %%xmm8, (%[res0],%[j],4)  \n\t"
00134 //
00135 //           "mulps %[ptmp0], %%xmm12 \n\t"
00136 //           "addps 16(%[res0],%[j],4), %%xmm12  \n\t"
00137 //           "mulps %[ptmp3], %%xmm15 \n\t"
00138 //           "addps %%xmm15, %%xmm12  \n\t"
00139 //           "mulps %[ptmp2], %%xmm14 \n\t"
00140 //           "addps %%xmm14, %%xmm12  \n\t"
00141 //           "mulps %[ptmp1], %%xmm13 \n\t"
00142 //           "addps %%xmm13, %%xmm12  \n\t"
00143 //           "movaps %%xmm12, 16(%[res0],%[j],4) \n\t"
00144 //           :
00145 //           : [res0] "r" (X), [j] "r" (j),[A0] "r" (A0),
00146 //             [A1] "r" (A1),
00147 //             [A2] "r" (A2),
00148 //             [A3] "r" (A3),
00149 //             [ptmp0] "x" (ptmp0),
00150 //             [ptmp1] "x" (ptmp1),
00151 //             [ptmp2] "x" (ptmp2),
00152 //             [ptmp3] "x" (ptmp3)
00153 //           : "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "%r14");
00154 //         }
00155           register Packet A00;
00156           register Packet A01;
00157           register Packet A02;
00158           register Packet A03;
00159           register Packet A10;
00160           register Packet A11;
00161           register Packet A12;
00162           register Packet A13;
00163           for (int j = 0;j<ANP;j+=2*PacketSize)
00164           {
00165 //             A00 = internal::pload(&A0[j]);
00166 //             A01 = internal::ploadu(&A1[j]);
00167 //             A02 = internal::ploadu(&A2[j]);
00168 //             A03 = internal::ploadu(&A3[j]);
00169 //             A10 = internal::pload(&A0[j+PacketSize]);
00170 //             A11 = internal::ploadu(&A1[j+PacketSize]);
00171 //             A12 = internal::ploadu(&A2[j+PacketSize]);
00172 //             A13 = internal::ploadu(&A3[j+PacketSize]);
00173 //
00174 //             A00 = internal::pmul(ptmp0, A00);
00175 //             A01 = internal::pmul(ptmp1, A01);
00176 //             A02 = internal::pmul(ptmp2, A02);
00177 //             A03 = internal::pmul(ptmp3, A03);
00178 //             A10 = internal::pmul(ptmp0, A10);
00179 //             A11 = internal::pmul(ptmp1, A11);
00180 //             A12 = internal::pmul(ptmp2, A12);
00181 //             A13 = internal::pmul(ptmp3, A13);
00182 //
00183 //             A00 = internal::padd(A00,A01);
00184 //             A02 = internal::padd(A02,A03);
00185 //             A00 = internal::padd(A00,internal::pload(&X[j]));
00186 //             A00 = internal::padd(A00,A02);
00187 //             internal::pstore(&X[j],A00);
00188 //
00189 //             A10 = internal::padd(A10,A11);
00190 //             A12 = internal::padd(A12,A13);
00191 //             A10 = internal::padd(A10,internal::pload(&X[j+PacketSize]));
00192 //             A10 = internal::padd(A10,A12);
00193 //             internal::pstore(&X[j+PacketSize],A10);
00194 
00195             internal::pstore(&X[j],
00196               internal::padd(internal::pload(&X[j]),
00197                 internal::padd(
00198                   internal::padd(internal::pmul(ptmp0,internal::pload(&A0[j])),internal::pmul(ptmp1,internal::ploadu(&A1[j]))),
00199                   internal::padd(internal::pmul(ptmp2,internal::ploadu(&A2[j])),internal::pmul(ptmp3,internal::ploadu(&A3[j]))) )));
00200 
00201             internal::pstore(&X[j+PacketSize],
00202               internal::padd(internal::pload(&X[j+PacketSize]),
00203                 internal::padd(
00204                   internal::padd(internal::pmul(ptmp0,internal::pload(&A0[j+PacketSize])),internal::pmul(ptmp1,internal::ploadu(&A1[j+PacketSize]))),
00205                   internal::padd(internal::pmul(ptmp2,internal::ploadu(&A2[j+PacketSize])),internal::pmul(ptmp3,internal::ploadu(&A3[j+PacketSize]))) )));
00206           }
00207           for (int j = ANP;j<AN;j+=PacketSize)
00208             internal::pstore(&X[j],
00209               internal::padd(internal::pload(&X[j]),
00210                 internal::padd(
00211                   internal::padd(internal::pmul(ptmp0,internal::pload(&A0[j])),internal::pmul(ptmp1,internal::ploadu(&A1[j]))),
00212                   internal::padd(internal::pmul(ptmp2,internal::ploadu(&A2[j])),internal::pmul(ptmp3,internal::ploadu(&A3[j]))) )));
00213       }
00214       // process remaining scalars
00215       for (int j=AN;j<N;j++)
00216         X[j] += internal::pfirst(ptmp0) * A0[j] + internal::pfirst(ptmp1) * A1[j] + internal::pfirst(ptmp2) * A2[j] + internal::pfirst(ptmp3) * A3[j];
00217     }
00218     for (int i=bound;i<N;i++)
00219     {
00220       real tmp0 = B[i];
00221       Packet ptmp0 = internal::pset1(tmp0);
00222       int iN0 = i*N;
00223       if (AN>0)
00224       {
00225         bool aligned0 = (iN0 % PacketSize) == 0;
00226         if (aligned0)
00227           for (int j = 0;j<AN;j+=PacketSize)
00228             internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pload(&X[j])));
00229         else
00230           for (int j = 0;j<AN;j+=PacketSize)
00231             internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pload(&X[j])));
00232       }
00233       // process remaining scalars
00234       for (int j=AN;j<N;j++)
00235         X[j] += tmp0 * A[j+iN0];
00236     }
00237     asm("#end matrix_vector_product");
00238   }
00239   
00240   static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
00241   {
00242     
00243 //     int AN = (N/PacketSize)*PacketSize;
00244 //     int ANP = (AN/(2*PacketSize))*2*PacketSize;
00245 //     int bound = (N/4)*4;
00246     for (int i=0;i<N;i++)
00247       X[i] = 0;
00248     
00249     int bound = std::max(0,N-8) & 0xfffffffE;
00250 
00251     for (int j=0;j<bound;j+=2)
00252     {
00253       register real* __restrict__ A0 = A + j*N;
00254       register real* __restrict__ A1 = A + (j+1)*N;
00255       
00256       real t0 = B[j];
00257       Packet ptmp0 = internal::pset1(t0);
00258       real t1 = B[j+1];
00259       Packet ptmp1 = internal::pset1(t1);
00260       
00261       real t2 = 0;
00262       Packet ptmp2 = internal::pset1(t2);
00263       real t3 = 0;
00264       Packet ptmp3 = internal::pset1(t3);
00265       
00266       int starti = j+2;
00267       int alignedEnd = starti;
00268       int alignedStart = (starti) + internal::first_aligned(&X[starti], N-starti);
00269       alignedEnd = alignedStart + ((N-alignedStart)/(PacketSize))*(PacketSize);
00270 
00271       X[j]   += t0 * A0[j];
00272       X[j+1] += t1 * A1[j];
00273       
00274       X[j+1] += t0 * A0[j+1];
00275       t2 += A0[j+1] * B[j+1];
00276       
00277 //       alignedStart = alignedEnd;
00278       for (int i=starti; i<alignedStart; ++i) {
00279         X[i] += t0 * A0[i] + t1 * A1[i];
00280         t2 += A0[i] * B[i];
00281         t3 += A1[i] * B[i];
00282       }
00283       asm("#begin symv");
00284       for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) {
00285         Packet A0i = internal::ploadu(&A0[i]);
00286         Packet A1i = internal::ploadu(&A1[i]);
00287 //         Packet A0i1 = internal::ploadu(&A0[i+PacketSize]);
00288         Packet Xi = internal::pload(&X[i]);
00289         Packet Bi = internal::pload/*u*/(&B[i]);
00290 //         Packet Xi1 = internal::pload(&X[i+PacketSize]);
00291 //         Packet Bi1 = internal::pload/*u*/(&B[i+PacketSize]);
00292         Xi = internal::padd(internal::padd(Xi, internal::pmul(ptmp0, A0i)), internal::pmul(ptmp1, A1i));
00293         ptmp2 = internal::padd(ptmp2, internal::pmul(A0i, Bi));
00294         ptmp3 = internal::padd(ptmp3, internal::pmul(A1i, Bi));
00295 //         Xi1 = internal::padd(Xi1, internal::pmul(ptmp1, A0i1));
00296 //         ptmp2 = internal::padd(ptmp2, internal::pmul(A0i1, Bi1));
00297 //         
00298         internal::pstore(&X[i],Xi);
00299 //         internal::pstore(&X[i+PacketSize],Xi1);
00300 //         asm(
00301 //           "prefetchnta   64(%[A0],%[i],4)   \n\t"
00302 //           //"movups     (%[A0],%[i],4), %%xmm8  \n\t"
00303 //           "movsd       (%[A0],%[i],4), %%xmm8  \n\t"
00304 //           "movhps     8(%[A0],%[i],4), %%xmm8  \n\t"
00305 // //           "movups   16(%[A0],%[i],4), %%xmm9  \n\t"
00306 // //           "movups   64(%[A0],%[i],4), %%xmm15  \n\t"
00307 //           "movaps     (%[B], %[i],4), %%xmm12 \n\t"
00308 // //           "movaps   16(%[B], %[i],4), %%xmm13 \n\t"
00309 //           "movaps     (%[X], %[i],4), %%xmm10 \n\t"
00310 // //           "movaps   16(%[X], %[i],4), %%xmm11 \n\t"
00311 //           
00312 //           "mulps %%xmm8, %%xmm12  \n\t"
00313 // //           "mulps %%xmm9, %%xmm13  \n\t"
00314 //           
00315 //           "mulps %[ptmp1], %%xmm8  \n\t"
00316 //           "addps %%xmm12, %[ptmp2]  \n\t"
00317 //           "addps %%xmm8, %%xmm10  \n\t"
00318 //           
00319 //           
00320 //           
00321 //           
00322 // //           "mulps %[ptmp1], %%xmm9  \n\t"
00323 //           
00324 // //           "addps %%xmm9, %%xmm11  \n\t"
00325 // //           "addps %%xmm13, %[ptmp2]  \n\t"
00326 //           
00327 //           "movaps %%xmm10,   (%[X],%[i],4) \n\t"
00328 // //           "movaps %%xmm11, 16(%[X],%[i],4) \n\t"
00329 //           : 
00330 //           : [X] "r" (X), [i] "r" (i), [A0] "r" (A0),
00331 //             [B] "r" (B),
00332 //             [ptmp1] "x" (ptmp1),
00333 //             [ptmp2] "x" (ptmp2)
00334 //           : "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm15");
00335       }
00336       asm("#end symv");
00337       for (int i=alignedEnd; i<N; i++) {
00338         X[i] += t0 * A0[i] + t1 * A1[i];
00339         t2 += A0[i] * B[i];
00340         t3 += A1[i] * B[i];
00341       }
00342       
00343       
00344       X[j]   += t2 + internal::predux(ptmp2);
00345       X[j+1] += t3 + internal::predux(ptmp3);
00346     }
00347     for (int j=bound;j<N;j++)
00348     {
00349       register real* __restrict__ A0 = A + j*N;
00350       
00351       real t1 = B[j];
00352       real t2 = 0;
00353       X[j] += t1 * A0[j];
00354       for (int i=j+1; i<N; i+=PacketSize) {
00355         X[i] += t1 * A0[i];
00356         t2 += A0[i] * B[i];
00357       }
00358       X[j] += t2;
00359     }
00360     
00361   }
00362 
00363 //   static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
00364 //   {
00365 //     asm("#begin matrix_vector_product");
00366 //     int AN = (N/PacketSize)*PacketSize;
00367 //     int ANP = (AN/(2*PacketSize))*2*PacketSize;
00368 //     int bound = (N/4)*4;
00369 //     for (int i=0;i<N;i++)
00370 //       X[i] = 0;
00371 //
00372 //     for (int i=0;i<bound;i+=4)
00373 //     {
00374 //       real tmp0 = B[i];
00375 //       Packet ptmp0 = internal::pset1(tmp0);
00376 //       real tmp1 = B[i+1];
00377 //       Packet ptmp1 = internal::pset1(tmp1);
00378 //       real tmp2 = B[i+2];
00379 //       Packet ptmp2 = internal::pset1(tmp2);
00380 //       real tmp3 = B[i+3];
00381 //       Packet ptmp3 = internal::pset1(tmp3);
00382 //       int iN0 = i*N;
00383 //       int iN1 = (i+1)*N;
00384 //       int iN2 = (i+2)*N;
00385 //       int iN3 = (i+3)*N;
00386 //       if (AN>0)
00387 //       {
00388 // //         int aligned0 = (iN0 % PacketSize);
00389 //         int aligned1 = (iN1 % PacketSize);
00390 //
00391 //         if (aligned1==0)
00392 //         {
00393 //           for (int j = 0;j<AN;j+=PacketSize)
00394 //           {
00395 //             internal::pstore(&X[j],
00396 //               internal::padd(internal::pload(&X[j]),
00397 //                 internal::padd(
00398 //                   internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pmul(ptmp1,internal::pload(&A[j+iN1]))),
00399 //                   internal::padd(internal::pmul(ptmp2,internal::pload(&A[j+iN2])),internal::pmul(ptmp3,internal::pload(&A[j+iN3]))) )));
00400 //           }
00401 //         }
00402 //         else if (aligned1==2)
00403 //         {
00404 //           for (int j = 0;j<AN;j+=PacketSize)
00405 //           {
00406 //             internal::pstore(&X[j],
00407 //               internal::padd(internal::pload(&X[j]),
00408 //                 internal::padd(
00409 //                   internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+iN1]))),
00410 //                   internal::padd(internal::pmul(ptmp2,internal::pload(&A[j+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+iN3]))) )));
00411 //           }
00412 //         }
00413 //         else
00414 //         {
00415 //           for (int j = 0;j<ANP;j+=2*PacketSize)
00416 //           {
00417 //             internal::pstore(&X[j],
00418 //               internal::padd(internal::pload(&X[j]),
00419 //                 internal::padd(
00420 //                   internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+iN1]))),
00421 //                   internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+iN3]))) )));
00422 //
00423 //             internal::pstore(&X[j+PacketSize],
00424 //               internal::padd(internal::pload(&X[j+PacketSize]),
00425 //                 internal::padd(
00426 //                   internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+PacketSize+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+PacketSize+iN1]))),
00427 //                   internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+PacketSize+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+PacketSize+iN3]))) )));
00428 //
00429 // //             internal::pstore(&X[j+2*PacketSize],
00430 // //               internal::padd(internal::pload(&X[j+2*PacketSize]),
00431 // //                 internal::padd(
00432 // //                   internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+2*PacketSize+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+2*PacketSize+iN1]))),
00433 // //                   internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+2*PacketSize+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+2*PacketSize+iN3]))) )));
00434 // //
00435 // //             internal::pstore(&X[j+3*PacketSize],
00436 // //               internal::padd(internal::pload(&X[j+3*PacketSize]),
00437 // //                 internal::padd(
00438 // //                   internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+3*PacketSize+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+3*PacketSize+iN1]))),
00439 // //                   internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+3*PacketSize+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+3*PacketSize+iN3]))) )));
00440 //
00441 //           }
00442 //           for (int j = ANP;j<AN;j+=PacketSize)
00443 //             internal::pstore(&X[j],
00444 //               internal::padd(internal::pload(&X[j]),
00445 //                 internal::padd(
00446 //                   internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+iN1]))),
00447 //                   internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+iN3]))) )));
00448 //         }
00449 //       }
00450 //       // process remaining scalars
00451 //       for (int j=AN;j<N;j++)
00452 //         X[j] += tmp0 * A[j+iN0] + tmp1 * A[j+iN1] + tmp2 * A[j+iN2] + tmp3 * A[j+iN3];
00453 //     }
00454 //     for (int i=bound;i<N;i++)
00455 //     {
00456 //       real tmp0 = B[i];
00457 //       Packet ptmp0 = internal::pset1(tmp0);
00458 //       int iN0 = i*N;
00459 //       if (AN>0)
00460 //       {
00461 //         bool aligned0 = (iN0 % PacketSize) == 0;
00462 //         if (aligned0)
00463 //           for (int j = 0;j<AN;j+=PacketSize)
00464 //             internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pload(&X[j])));
00465 //         else
00466 //           for (int j = 0;j<AN;j+=PacketSize)
00467 //             internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pload(&X[j])));
00468 //       }
00469 //       // process remaining scalars
00470 //       for (int j=AN;j<N;j++)
00471 //         X[j] += tmp0 * A[j+iN0];
00472 //     }
00473 //     asm("#end matrix_vector_product");
00474 //   }
00475 
00476 //   static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
00477 //   {
00478 //     asm("#begin matrix_vector_product");
00479 //     int AN = (N/PacketSize)*PacketSize;
00480 //     for (int i=0;i<N;i++)
00481 //       X[i] = 0;
00482 //
00483 //     for (int i=0;i<N;i+=2)
00484 //     {
00485 //       real tmp0 = B[i];
00486 //       Packet ptmp0 = internal::pset1(tmp0);
00487 //       real tmp1 = B[i+1];
00488 //       Packet ptmp1 = internal::pset1(tmp1);
00489 //       int iN0 = i*N;
00490 //       int iN1 = (i+1)*N;
00491 //       if (AN>0)
00492 //       {
00493 //         bool aligned0 = (iN0 % PacketSize) == 0;
00494 //         bool aligned1 = (iN1 % PacketSize) == 0;
00495 //
00496 //         if (aligned0 && aligned1)
00497 //         {
00498 //           for (int j = 0;j<AN;j+=PacketSize)
00499 //           {
00500 //             internal::pstore(&X[j],
00501 //               internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),
00502 //               internal::padd(internal::pmul(ptmp1,internal::pload(&A[j+iN1])),internal::pload(&X[j]))));
00503 //           }
00504 //         }
00505 //         else if (aligned0)
00506 //         {
00507 //           for (int j = 0;j<AN;j+=PacketSize)
00508 //           {
00509 //             internal::pstore(&X[j],
00510 //               internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),
00511 //               internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+iN1])),internal::pload(&X[j]))));
00512 //           }
00513 //         }
00514 //         else if (aligned1)
00515 //         {
00516 //           for (int j = 0;j<AN;j+=PacketSize)
00517 //           {
00518 //             internal::pstore(&X[j],
00519 //               internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),
00520 //               internal::padd(internal::pmul(ptmp1,internal::pload(&A[j+iN1])),internal::pload(&X[j]))));
00521 //           }
00522 //         }
00523 //         else
00524 //         {
00525 //           int ANP = (AN/(4*PacketSize))*4*PacketSize;
00526 //           for (int j = 0;j<ANP;j+=4*PacketSize)
00527 //           {
00528 //             internal::pstore(&X[j],
00529 //               internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),
00530 //               internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+iN1])),internal::pload(&X[j]))));
00531 //
00532 //             internal::pstore(&X[j+PacketSize],
00533 //               internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+PacketSize+iN0])),
00534 //               internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+PacketSize+iN1])),internal::pload(&X[j+PacketSize]))));
00535 //
00536 //             internal::pstore(&X[j+2*PacketSize],
00537 //               internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+2*PacketSize+iN0])),
00538 //               internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+2*PacketSize+iN1])),internal::pload(&X[j+2*PacketSize]))));
00539 //
00540 //             internal::pstore(&X[j+3*PacketSize],
00541 //               internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+3*PacketSize+iN0])),
00542 //               internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+3*PacketSize+iN1])),internal::pload(&X[j+3*PacketSize]))));
00543 //           }
00544 //           for (int j = ANP;j<AN;j+=PacketSize)
00545 //             internal::pstore(&X[j],
00546 //               internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),
00547 //               internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+iN1])),internal::pload(&X[j]))));
00548 //         }
00549 //       }
00550 //       // process remaining scalars
00551 //       for (int j=AN;j<N;j++)
00552 //         X[j] += tmp0 * A[j+iN0] + tmp1 * A[j+iN1];
00553 //     }
00554 //     int remaining = (N/2)*2;
00555 //     for (int i=remaining;i<N;i++)
00556 //     {
00557 //       real tmp0 = B[i];
00558 //       Packet ptmp0 = internal::pset1(tmp0);
00559 //       int iN0 = i*N;
00560 //       if (AN>0)
00561 //       {
00562 //         bool aligned0 = (iN0 % PacketSize) == 0;
00563 //         if (aligned0)
00564 //           for (int j = 0;j<AN;j+=PacketSize)
00565 //             internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pload(&X[j])));
00566 //         else
00567 //           for (int j = 0;j<AN;j+=PacketSize)
00568 //             internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pload(&X[j])));
00569 //       }
00570 //       // process remaining scalars
00571 //       for (int j=AN;j<N;j++)
00572 //         X[j] += tmp0 * A[j+iN0];
00573 //     }
00574 //     asm("#end matrix_vector_product");
00575 //   }
00576 
00577 //   static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
00578 //   {
00579 //     asm("#begin matrix_vector_product");
00580 //     int AN = (N/PacketSize)*PacketSize;
00581 //     for (int i=0;i<N;i++)
00582 //       X[i] = 0;
00583 //     for (int i=0;i<N;i++)
00584 //     {
00585 //       real tmp = B[i];
00586 //       Packet ptmp = internal::pset1(tmp);
00587 //       int iN = i*N;
00588 //       if (AN>0)
00589 //       {
00590 //         bool aligned = (iN % PacketSize) == 0;
00591 //         if (aligned)
00592 //         {
00593 //           #ifdef PEELING
00594 //           Packet A0, A1, A2, X0, X1, X2;
00595 //           int ANP = (AN/(8*PacketSize))*8*PacketSize;
00596 //           for (int j = 0;j<ANP;j+=PacketSize*8)
00597 //           {
00598 //             A0 = internal::pload(&A[j+iN]);
00599 //             X0 = internal::pload(&X[j]);
00600 //             A1 = internal::pload(&A[j+PacketSize+iN]);
00601 //             X1 = internal::pload(&X[j+PacketSize]);
00602 //             A2 = internal::pload(&A[j+2*PacketSize+iN]);
00603 //             X2 = internal::pload(&X[j+2*PacketSize]);
00604 //             internal::pstore(&X[j], internal::padd(X0, internal::pmul(ptmp,A0)));
00605 //             A0 = internal::pload(&A[j+3*PacketSize+iN]);
00606 //             X0 = internal::pload(&X[j+3*PacketSize]);
00607 //             internal::pstore(&X[j+PacketSize], internal::padd(internal::pload(&X1), internal::pmul(ptmp,A1)));
00608 //             A1 = internal::pload(&A[j+4*PacketSize+iN]);
00609 //             X1 = internal::pload(&X[j+4*PacketSize]);
00610 //             internal::pstore(&X[j+2*PacketSize], internal::padd(internal::pload(&X2), internal::pmul(ptmp,A2)));
00611 //             A2 = internal::pload(&A[j+5*PacketSize+iN]);
00612 //             X2 = internal::pload(&X[j+5*PacketSize]);
00613 //             internal::pstore(&X[j+3*PacketSize], internal::padd(internal::pload(&X0), internal::pmul(ptmp,A0)));
00614 //             A0 = internal::pload(&A[j+6*PacketSize+iN]);
00615 //             X0 = internal::pload(&X[j+6*PacketSize]);
00616 //             internal::pstore(&X[j+4*PacketSize], internal::padd(internal::pload(&X1), internal::pmul(ptmp,A1)));
00617 //             A1 = internal::pload(&A[j+7*PacketSize+iN]);
00618 //             X1 = internal::pload(&X[j+7*PacketSize]);
00619 //             internal::pstore(&X[j+5*PacketSize], internal::padd(internal::pload(&X2), internal::pmul(ptmp,A2)));
00620 //             internal::pstore(&X[j+6*PacketSize], internal::padd(internal::pload(&X0), internal::pmul(ptmp,A0)));
00621 //             internal::pstore(&X[j+7*PacketSize], internal::padd(internal::pload(&X1), internal::pmul(ptmp,A1)));
00622 // //
00623 // //             internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::pload(&A[j+iN]))));
00624 // //             internal::pstore(&X[j+PacketSize], internal::padd(internal::pload(&X[j+PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+PacketSize+iN]))));
00625 // //             internal::pstore(&X[j+2*PacketSize], internal::padd(internal::pload(&X[j+2*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+2*PacketSize+iN]))));
00626 // //             internal::pstore(&X[j+3*PacketSize], internal::padd(internal::pload(&X[j+3*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+3*PacketSize+iN]))));
00627 // //             internal::pstore(&X[j+4*PacketSize], internal::padd(internal::pload(&X[j+4*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+4*PacketSize+iN]))));
00628 // //             internal::pstore(&X[j+5*PacketSize], internal::padd(internal::pload(&X[j+5*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+5*PacketSize+iN]))));
00629 // //             internal::pstore(&X[j+6*PacketSize], internal::padd(internal::pload(&X[j+6*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+6*PacketSize+iN]))));
00630 // //             internal::pstore(&X[j+7*PacketSize], internal::padd(internal::pload(&X[j+7*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+7*PacketSize+iN]))));
00631 //           }
00632 //           for (int j = ANP;j<AN;j+=PacketSize)
00633 //             internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::pload(&A[j+iN]))));
00634 //           #else
00635 //           for (int j = 0;j<AN;j+=PacketSize)
00636 //             internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::pload(&A[j+iN]))));
00637 //           #endif
00638 //         }
00639 //         else
00640 //         {
00641 //           #ifdef PEELING
00642 //           int ANP = (AN/(8*PacketSize))*8*PacketSize;
00643 //           for (int j = 0;j<ANP;j+=PacketSize*8)
00644 //           {
00645 //             internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::ploadu(&A[j+iN]))));
00646 //             internal::pstore(&X[j+PacketSize], internal::padd(internal::pload(&X[j+PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+PacketSize+iN]))));
00647 //             internal::pstore(&X[j+2*PacketSize], internal::padd(internal::pload(&X[j+2*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+2*PacketSize+iN]))));
00648 //             internal::pstore(&X[j+3*PacketSize], internal::padd(internal::pload(&X[j+3*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+3*PacketSize+iN]))));
00649 //             internal::pstore(&X[j+4*PacketSize], internal::padd(internal::pload(&X[j+4*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+4*PacketSize+iN]))));
00650 //             internal::pstore(&X[j+5*PacketSize], internal::padd(internal::pload(&X[j+5*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+5*PacketSize+iN]))));
00651 //             internal::pstore(&X[j+6*PacketSize], internal::padd(internal::pload(&X[j+6*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+6*PacketSize+iN]))));
00652 //             internal::pstore(&X[j+7*PacketSize], internal::padd(internal::pload(&X[j+7*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+7*PacketSize+iN]))));
00653 //           }
00654 //           for (int j = ANP;j<AN;j+=PacketSize)
00655 //             internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::ploadu(&A[j+iN]))));
00656 //           #else
00657 //           for (int j = 0;j<AN;j+=PacketSize)
00658 //             internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::ploadu(&A[j+iN]))));
00659 //           #endif
00660 //         }
00661 //       }
00662 //       // process remaining scalars
00663 //       for (int j=AN;j<N;j++)
00664 //         X[j] += tmp * A[j+iN];
00665 //     }
00666 //     asm("#end matrix_vector_product");
00667 //   }
00668 
00669     static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
00670   {
00671     int AN = (N/PacketSize)*PacketSize;
00672     int bound = (N/4)*4;
00673     for (int i=0;i<bound;i+=4)
00674     {
00675       real tmp0 = 0;
00676       Packet ptmp0 = internal::pset1(real(0));
00677       real tmp1 = 0;
00678       Packet ptmp1 = internal::pset1(real(0));
00679       real tmp2 = 0;
00680       Packet ptmp2 = internal::pset1(real(0));
00681       real tmp3 = 0;
00682       Packet ptmp3 = internal::pset1(real(0));
00683       int iN0 = i*N;
00684       int iN1 = (i+1)*N;
00685       int iN2 = (i+2)*N;
00686       int iN3 = (i+3)*N;
00687       if (AN>0)
00688       {
00689         int align1 = (iN1 % PacketSize);
00690         if (align1==0)
00691         {
00692           for (int j = 0;j<AN;j+=PacketSize)
00693           {
00694             Packet b = internal::pload(&B[j]);
00695             ptmp0 = internal::padd(ptmp0, internal::pmul(b, internal::pload(&A[j+iN0])));
00696             ptmp1 = internal::padd(ptmp1, internal::pmul(b, internal::pload(&A[j+iN1])));
00697             ptmp2 = internal::padd(ptmp2, internal::pmul(b, internal::pload(&A[j+iN2])));
00698             ptmp3 = internal::padd(ptmp3, internal::pmul(b, internal::pload(&A[j+iN3])));
00699           }
00700         }
00701         else if (align1==2)
00702         {
00703           for (int j = 0;j<AN;j+=PacketSize)
00704           {
00705             Packet b = internal::pload(&B[j]);
00706             ptmp0 = internal::padd(ptmp0, internal::pmul(b, internal::pload(&A[j+iN0])));
00707             ptmp1 = internal::padd(ptmp1, internal::pmul(b, internal::ploadu(&A[j+iN1])));
00708             ptmp2 = internal::padd(ptmp2, internal::pmul(b, internal::pload(&A[j+iN2])));
00709             ptmp3 = internal::padd(ptmp3, internal::pmul(b, internal::ploadu(&A[j+iN3])));
00710           }
00711         }
00712         else
00713         {
00714           for (int j = 0;j<AN;j+=PacketSize)
00715           {
00716             Packet b = internal::pload(&B[j]);
00717             ptmp0 = internal::padd(ptmp0, internal::pmul(b, internal::pload(&A[j+iN0])));
00718             ptmp1 = internal::padd(ptmp1, internal::pmul(b, internal::ploadu(&A[j+iN1])));
00719             ptmp2 = internal::padd(ptmp2, internal::pmul(b, internal::ploadu(&A[j+iN2])));
00720             ptmp3 = internal::padd(ptmp3, internal::pmul(b, internal::ploadu(&A[j+iN3])));
00721           }
00722         }
00723         tmp0 = internal::predux(ptmp0);
00724         tmp1 = internal::predux(ptmp1);
00725         tmp2 = internal::predux(ptmp2);
00726         tmp3 = internal::predux(ptmp3);
00727       }
00728       // process remaining scalars
00729       for (int j=AN;j<N;j++)
00730       {
00731         tmp0 += B[j] * A[j+iN0];
00732         tmp1 += B[j] * A[j+iN1];
00733         tmp2 += B[j] * A[j+iN2];
00734         tmp3 += B[j] * A[j+iN3];
00735       }
00736       X[i+0] = tmp0;
00737       X[i+1] = tmp1;
00738       X[i+2] = tmp2;
00739       X[i+3] = tmp3;
00740     }
00741 
00742     for (int i=bound;i<N;i++)
00743     {
00744       real tmp0 = 0;
00745       Packet ptmp0 = internal::pset1(real(0));
00746       int iN0 = i*N;
00747       if (AN>0)
00748       {
00749         if (iN0 % PacketSize==0)
00750           for (int j = 0;j<AN;j+=PacketSize)
00751             ptmp0 = internal::padd(ptmp0, internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN0])));
00752         else
00753           for (int j = 0;j<AN;j+=PacketSize)
00754             ptmp0 = internal::padd(ptmp0, internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN0])));
00755         tmp0 = internal::predux(ptmp0);
00756       }
00757       // process remaining scalars
00758       for (int j=AN;j<N;j++)
00759         tmp0 += B[j] * A[j+iN0];
00760       X[i+0] = tmp0;
00761     }
00762   }
00763 
00764 //   static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
00765 //   {
00766 //     int AN = (N/PacketSize)*PacketSize;
00767 //     for (int i=0;i<N;i++)
00768 //       X[i] = 0;
00769 //     for (int i=0;i<N;i++)
00770 //     {
00771 //       real tmp = 0;
00772 //       Packet ptmp = internal::pset1(real(0));
00773 //       int iN = i*N;
00774 //       if (AN>0)
00775 //       {
00776 //         bool aligned = (iN % PacketSize) == 0;
00777 //         if (aligned)
00778 //         {
00779 //           #ifdef PEELING
00780 //           int ANP = (AN/(8*PacketSize))*8*PacketSize;
00781 //           for (int j = 0;j<ANP;j+=PacketSize*8)
00782 //           {
00783 //             ptmp =
00784 //               internal::padd(internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN])),
00785 //               internal::padd(internal::pmul(internal::pload(&B[j+PacketSize]), internal::pload(&A[j+PacketSize+iN])),
00786 //               internal::padd(internal::pmul(internal::pload(&B[j+2*PacketSize]), internal::pload(&A[j+2*PacketSize+iN])),
00787 //               internal::padd(internal::pmul(internal::pload(&B[j+3*PacketSize]), internal::pload(&A[j+3*PacketSize+iN])),
00788 //               internal::padd(internal::pmul(internal::pload(&B[j+4*PacketSize]), internal::pload(&A[j+4*PacketSize+iN])),
00789 //               internal::padd(internal::pmul(internal::pload(&B[j+5*PacketSize]), internal::pload(&A[j+5*PacketSize+iN])),
00790 //               internal::padd(internal::pmul(internal::pload(&B[j+6*PacketSize]), internal::pload(&A[j+6*PacketSize+iN])),
00791 //               internal::padd(internal::pmul(internal::pload(&B[j+7*PacketSize]), internal::pload(&A[j+7*PacketSize+iN])),
00792 //               ptmp))))))));
00793 //           }
00794 //           for (int j = ANP;j<AN;j+=PacketSize)
00795 //             ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN])));
00796 //           #else
00797 //           for (int j = 0;j<AN;j+=PacketSize)
00798 //             ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN])));
00799 //           #endif
00800 //         }
00801 //         else
00802 //         {
00803 //           #ifdef PEELING
00804 //           int ANP = (AN/(8*PacketSize))*8*PacketSize;
00805 //           for (int j = 0;j<ANP;j+=PacketSize*8)
00806 //           {
00807 //             ptmp =
00808 //               internal::padd(internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN])),
00809 //               internal::padd(internal::pmul(internal::pload(&B[j+PacketSize]), internal::ploadu(&A[j+PacketSize+iN])),
00810 //               internal::padd(internal::pmul(internal::pload(&B[j+2*PacketSize]), internal::ploadu(&A[j+2*PacketSize+iN])),
00811 //               internal::padd(internal::pmul(internal::pload(&B[j+3*PacketSize]), internal::ploadu(&A[j+3*PacketSize+iN])),
00812 //               internal::padd(internal::pmul(internal::pload(&B[j+4*PacketSize]), internal::ploadu(&A[j+4*PacketSize+iN])),
00813 //               internal::padd(internal::pmul(internal::pload(&B[j+5*PacketSize]), internal::ploadu(&A[j+5*PacketSize+iN])),
00814 //               internal::padd(internal::pmul(internal::pload(&B[j+6*PacketSize]), internal::ploadu(&A[j+6*PacketSize+iN])),
00815 //               internal::padd(internal::pmul(internal::pload(&B[j+7*PacketSize]), internal::ploadu(&A[j+7*PacketSize+iN])),
00816 //               ptmp))))))));
00817 //           }
00818 //           for (int j = ANP;j<AN;j+=PacketSize)
00819 //             ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN])));
00820 //           #else
00821 //           for (int j = 0;j<AN;j+=PacketSize)
00822 //             ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN])));
00823 //           #endif
00824 //         }
00825 //         tmp = internal::predux(ptmp);
00826 //       }
00827 //       // process remaining scalars
00828 //       for (int j=AN;j<N;j++)
00829 //         tmp += B[j] * A[j+iN];
00830 //       X[i] = tmp;
00831 //     }
00832 //   }
00833 
00834   static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){
00835     int AN = (N/PacketSize)*PacketSize;
00836     if (AN>0)
00837     {
00838       Packet pcoef = internal::pset1(coef);
00839       #ifdef PEELING
00840       const int peelSize = 3;
00841       int ANP = (AN/(peelSize*PacketSize))*peelSize*PacketSize;
00842       float* X1 = X + PacketSize;
00843       float* Y1 = Y + PacketSize;
00844       float* X2 = X + 2*PacketSize;
00845       float* Y2 = Y + 2*PacketSize;
00846       Packet x0,x1,x2,y0,y1,y2;
00847       for (int j = 0;j<ANP;j+=PacketSize*peelSize)
00848       {
00849         x0 = internal::pload(X+j);
00850         x1 = internal::pload(X1+j);
00851         x2 = internal::pload(X2+j);
00852 
00853         y0 = internal::pload(Y+j);
00854         y1 = internal::pload(Y1+j);
00855         y2 = internal::pload(Y2+j);
00856 
00857         y0 = internal::pmadd(pcoef, x0, y0);
00858         y1 = internal::pmadd(pcoef, x1, y1);
00859         y2 = internal::pmadd(pcoef, x2, y2);
00860 
00861         internal::pstore(Y+j,  y0);
00862         internal::pstore(Y1+j, y1);
00863         internal::pstore(Y2+j, y2);
00864 //         internal::pstore(&Y[j+2*PacketSize], internal::padd(internal::pload(&Y[j+2*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+2*PacketSize]))));
00865 //         internal::pstore(&Y[j+3*PacketSize], internal::padd(internal::pload(&Y[j+3*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+3*PacketSize]))));
00866 //         internal::pstore(&Y[j+4*PacketSize], internal::padd(internal::pload(&Y[j+4*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+4*PacketSize]))));
00867 //         internal::pstore(&Y[j+5*PacketSize], internal::padd(internal::pload(&Y[j+5*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+5*PacketSize]))));
00868 //         internal::pstore(&Y[j+6*PacketSize], internal::padd(internal::pload(&Y[j+6*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+6*PacketSize]))));
00869 //         internal::pstore(&Y[j+7*PacketSize], internal::padd(internal::pload(&Y[j+7*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+7*PacketSize]))));
00870       }
00871       for (int j = ANP;j<AN;j+=PacketSize)
00872         internal::pstore(&Y[j], internal::padd(internal::pload(&Y[j]), internal::pmul(pcoef,internal::pload(&X[j]))));
00873       #else
00874       for (int j = 0;j<AN;j+=PacketSize)
00875         internal::pstore(&Y[j], internal::padd(internal::pload(&Y[j]), internal::pmul(pcoef,internal::pload(&X[j]))));
00876       #endif
00877     }
00878     // process remaining scalars
00879     for (int i=AN;i<N;i++)
00880       Y[i] += coef * X[i];
00881   }
00882 
00883 
00884 };
00885 
00886 #endif


libicr
Author(s): Robert Krug
autogenerated on Mon Jan 6 2014 11:32:45