00001 //===================================================== 00002 // File : hand_vec_interface.hh 00003 // Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr> 00004 //===================================================== 00005 // 00006 // This program is free software; you can redistribute it and/or 00007 // modify it under the terms of the GNU General Public License 00008 // as published by the Free Software Foundation; either version 2 00009 // of the License, or (at your option) any later version. 00010 // 00011 // This program is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 // You should have received a copy of the GNU General Public License 00016 // along with this program; if not, write to the Free Software 00017 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 00018 // 00019 #ifndef HAND_VEC_INTERFACE_HH 00020 #define HAND_VEC_INTERFACE_HH 00021 00022 #include <Eigen/Core> 00023 #include "f77_interface.hh" 00024 00025 using namespace Eigen; 00026 00027 template<class real> 00028 class hand_vec_interface : public f77_interface_base<real> { 00029 00030 public : 00031 00032 typedef typename internal::packet_traits<real>::type Packet; 00033 static const int PacketSize = internal::packet_traits<real>::size; 00034 00035 typedef typename f77_interface_base<real>::stl_matrix stl_matrix; 00036 typedef typename f77_interface_base<real>::stl_vector stl_vector; 00037 typedef typename f77_interface_base<real>::gene_matrix gene_matrix; 00038 typedef typename f77_interface_base<real>::gene_vector gene_vector; 00039 00040 static void free_matrix(gene_matrix & A, int N){ 00041 internal::aligned_free(A); 00042 } 00043 00044 static void free_vector(gene_vector & B){ 00045 internal::aligned_free(B); 00046 } 00047 00048 static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ 00049 int N = A_stl.size(); 00050 A = (real*)internal::aligned_malloc(N*N*sizeof(real)); 00051 for (int j=0;j<N;j++) 00052 for (int i=0;i<N;i++) 00053 A[i+N*j] = A_stl[j][i]; 00054 } 00055 00056 static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){ 00057 int N = B_stl.size(); 00058 B = (real*)internal::aligned_malloc(N*sizeof(real)); 00059 for (int i=0;i<N;i++) 00060 B[i] = B_stl[i]; 00061 } 00062 00063 static inline std::string name() { 00064 #ifdef PEELING 00065 return "hand_vectorized_peeling"; 00066 #else 00067 return "hand_vectorized"; 00068 #endif 00069 } 00070 00071 static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) 00072 { 00073 asm("#begin matrix_vector_product"); 00074 int AN = (N/PacketSize)*PacketSize; 00075 int ANP = (AN/(2*PacketSize))*2*PacketSize; 00076 int bound = (N/4)*4; 00077 for (int i=0;i<N;i++) 00078 X[i] = 0; 00079 00080 for (int i=0;i<bound;i+=4) 00081 { 00082 register real* __restrict__ A0 = A + i*N; 00083 register real* __restrict__ A1 = A + (i+1)*N; 00084 register real* __restrict__ A2 = A + (i+2)*N; 00085 register real* __restrict__ A3 = A + (i+3)*N; 00086 00087 Packet ptmp0 = internal::pset1(B[i]); 00088 Packet ptmp1 = internal::pset1(B[i+1]); 00089 Packet ptmp2 = internal::pset1(B[i+2]); 00090 Packet ptmp3 = internal::pset1(B[i+3]); 00091 // register Packet ptmp0, ptmp1, ptmp2, ptmp3; 00092 // asm( 00093 // 00094 // "movss (%[B],%[j],4), %[ptmp0] \n\t" 00095 // "shufps $0,%[ptmp0],%[ptmp0] \n\t" 00096 // "movss 4(%[B],%[j],4), %[ptmp1] \n\t" 00097 // "shufps $0,%[ptmp1],%[ptmp1] \n\t" 00098 // "movss 8(%[B],%[j],4), %[ptmp2] \n\t" 00099 // "shufps $0,%[ptmp2],%[ptmp2] \n\t" 00100 // "movss 12(%[B],%[j],4), %[ptmp3] \n\t" 00101 // "shufps $0,%[ptmp3],%[ptmp3] \n\t" 00102 // : [ptmp0] "=x" (ptmp0), 00103 // [ptmp1] "=x" (ptmp1), 00104 // [ptmp2] "=x" (ptmp2), 00105 // [ptmp3] "=x" (ptmp3) 00106 // : [B] "r" (B), 00107 // [j] "r" (size_t(i)) 00108 // : ); 00109 00110 if (AN>0) 00111 { 00112 // for (size_t j = 0;j<ANP;j+=8) 00113 // { 00114 // asm( 00115 // 00116 // "movaps (%[A0],%[j],4), %%xmm8 \n\t" 00117 // "movaps 16(%[A0],%[j],4), %%xmm12 \n\t" 00118 // "movups (%[A3],%[j],4), %%xmm11 \n\t" 00119 // "movups 16(%[A3],%[j],4), %%xmm15 \n\t" 00120 // "movups (%[A2],%[j],4), %%xmm10 \n\t" 00121 // "movups 16(%[A2],%[j],4), %%xmm14 \n\t" 00122 // "movups (%[A1],%[j],4), %%xmm9 \n\t" 00123 // "movups 16(%[A1],%[j],4), %%xmm13 \n\t" 00124 // 00125 // "mulps %[ptmp0], %%xmm8 \n\t" 00126 // "addps (%[res0],%[j],4), %%xmm8 \n\t" 00127 // "mulps %[ptmp3], %%xmm11 \n\t" 00128 // "addps %%xmm11, %%xmm8 \n\t" 00129 // "mulps %[ptmp2], %%xmm10 \n\t" 00130 // "addps %%xmm10, %%xmm8 \n\t" 00131 // "mulps %[ptmp1], %%xmm9 \n\t" 00132 // "addps %%xmm9, %%xmm8 \n\t" 00133 // "movaps %%xmm8, (%[res0],%[j],4) \n\t" 00134 // 00135 // "mulps %[ptmp0], %%xmm12 \n\t" 00136 // "addps 16(%[res0],%[j],4), %%xmm12 \n\t" 00137 // "mulps %[ptmp3], %%xmm15 \n\t" 00138 // "addps %%xmm15, %%xmm12 \n\t" 00139 // "mulps %[ptmp2], %%xmm14 \n\t" 00140 // "addps %%xmm14, %%xmm12 \n\t" 00141 // "mulps %[ptmp1], %%xmm13 \n\t" 00142 // "addps %%xmm13, %%xmm12 \n\t" 00143 // "movaps %%xmm12, 16(%[res0],%[j],4) \n\t" 00144 // : 00145 // : [res0] "r" (X), [j] "r" (j),[A0] "r" (A0), 00146 // [A1] "r" (A1), 00147 // [A2] "r" (A2), 00148 // [A3] "r" (A3), 00149 // [ptmp0] "x" (ptmp0), 00150 // [ptmp1] "x" (ptmp1), 00151 // [ptmp2] "x" (ptmp2), 00152 // [ptmp3] "x" (ptmp3) 00153 // : "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "%r14"); 00154 // } 00155 register Packet A00; 00156 register Packet A01; 00157 register Packet A02; 00158 register Packet A03; 00159 register Packet A10; 00160 register Packet A11; 00161 register Packet A12; 00162 register Packet A13; 00163 for (int j = 0;j<ANP;j+=2*PacketSize) 00164 { 00165 // A00 = internal::pload(&A0[j]); 00166 // A01 = internal::ploadu(&A1[j]); 00167 // A02 = internal::ploadu(&A2[j]); 00168 // A03 = internal::ploadu(&A3[j]); 00169 // A10 = internal::pload(&A0[j+PacketSize]); 00170 // A11 = internal::ploadu(&A1[j+PacketSize]); 00171 // A12 = internal::ploadu(&A2[j+PacketSize]); 00172 // A13 = internal::ploadu(&A3[j+PacketSize]); 00173 // 00174 // A00 = internal::pmul(ptmp0, A00); 00175 // A01 = internal::pmul(ptmp1, A01); 00176 // A02 = internal::pmul(ptmp2, A02); 00177 // A03 = internal::pmul(ptmp3, A03); 00178 // A10 = internal::pmul(ptmp0, A10); 00179 // A11 = internal::pmul(ptmp1, A11); 00180 // A12 = internal::pmul(ptmp2, A12); 00181 // A13 = internal::pmul(ptmp3, A13); 00182 // 00183 // A00 = internal::padd(A00,A01); 00184 // A02 = internal::padd(A02,A03); 00185 // A00 = internal::padd(A00,internal::pload(&X[j])); 00186 // A00 = internal::padd(A00,A02); 00187 // internal::pstore(&X[j],A00); 00188 // 00189 // A10 = internal::padd(A10,A11); 00190 // A12 = internal::padd(A12,A13); 00191 // A10 = internal::padd(A10,internal::pload(&X[j+PacketSize])); 00192 // A10 = internal::padd(A10,A12); 00193 // internal::pstore(&X[j+PacketSize],A10); 00194 00195 internal::pstore(&X[j], 00196 internal::padd(internal::pload(&X[j]), 00197 internal::padd( 00198 internal::padd(internal::pmul(ptmp0,internal::pload(&A0[j])),internal::pmul(ptmp1,internal::ploadu(&A1[j]))), 00199 internal::padd(internal::pmul(ptmp2,internal::ploadu(&A2[j])),internal::pmul(ptmp3,internal::ploadu(&A3[j]))) ))); 00200 00201 internal::pstore(&X[j+PacketSize], 00202 internal::padd(internal::pload(&X[j+PacketSize]), 00203 internal::padd( 00204 internal::padd(internal::pmul(ptmp0,internal::pload(&A0[j+PacketSize])),internal::pmul(ptmp1,internal::ploadu(&A1[j+PacketSize]))), 00205 internal::padd(internal::pmul(ptmp2,internal::ploadu(&A2[j+PacketSize])),internal::pmul(ptmp3,internal::ploadu(&A3[j+PacketSize]))) ))); 00206 } 00207 for (int j = ANP;j<AN;j+=PacketSize) 00208 internal::pstore(&X[j], 00209 internal::padd(internal::pload(&X[j]), 00210 internal::padd( 00211 internal::padd(internal::pmul(ptmp0,internal::pload(&A0[j])),internal::pmul(ptmp1,internal::ploadu(&A1[j]))), 00212 internal::padd(internal::pmul(ptmp2,internal::ploadu(&A2[j])),internal::pmul(ptmp3,internal::ploadu(&A3[j]))) ))); 00213 } 00214 // process remaining scalars 00215 for (int j=AN;j<N;j++) 00216 X[j] += internal::pfirst(ptmp0) * A0[j] + internal::pfirst(ptmp1) * A1[j] + internal::pfirst(ptmp2) * A2[j] + internal::pfirst(ptmp3) * A3[j]; 00217 } 00218 for (int i=bound;i<N;i++) 00219 { 00220 real tmp0 = B[i]; 00221 Packet ptmp0 = internal::pset1(tmp0); 00222 int iN0 = i*N; 00223 if (AN>0) 00224 { 00225 bool aligned0 = (iN0 % PacketSize) == 0; 00226 if (aligned0) 00227 for (int j = 0;j<AN;j+=PacketSize) 00228 internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pload(&X[j]))); 00229 else 00230 for (int j = 0;j<AN;j+=PacketSize) 00231 internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pload(&X[j]))); 00232 } 00233 // process remaining scalars 00234 for (int j=AN;j<N;j++) 00235 X[j] += tmp0 * A[j+iN0]; 00236 } 00237 asm("#end matrix_vector_product"); 00238 } 00239 00240 static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) 00241 { 00242 00243 // int AN = (N/PacketSize)*PacketSize; 00244 // int ANP = (AN/(2*PacketSize))*2*PacketSize; 00245 // int bound = (N/4)*4; 00246 for (int i=0;i<N;i++) 00247 X[i] = 0; 00248 00249 int bound = std::max(0,N-8) & 0xfffffffE; 00250 00251 for (int j=0;j<bound;j+=2) 00252 { 00253 register real* __restrict__ A0 = A + j*N; 00254 register real* __restrict__ A1 = A + (j+1)*N; 00255 00256 real t0 = B[j]; 00257 Packet ptmp0 = internal::pset1(t0); 00258 real t1 = B[j+1]; 00259 Packet ptmp1 = internal::pset1(t1); 00260 00261 real t2 = 0; 00262 Packet ptmp2 = internal::pset1(t2); 00263 real t3 = 0; 00264 Packet ptmp3 = internal::pset1(t3); 00265 00266 int starti = j+2; 00267 int alignedEnd = starti; 00268 int alignedStart = (starti) + internal::first_aligned(&X[starti], N-starti); 00269 alignedEnd = alignedStart + ((N-alignedStart)/(PacketSize))*(PacketSize); 00270 00271 X[j] += t0 * A0[j]; 00272 X[j+1] += t1 * A1[j]; 00273 00274 X[j+1] += t0 * A0[j+1]; 00275 t2 += A0[j+1] * B[j+1]; 00276 00277 // alignedStart = alignedEnd; 00278 for (int i=starti; i<alignedStart; ++i) { 00279 X[i] += t0 * A0[i] + t1 * A1[i]; 00280 t2 += A0[i] * B[i]; 00281 t3 += A1[i] * B[i]; 00282 } 00283 asm("#begin symv"); 00284 for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) { 00285 Packet A0i = internal::ploadu(&A0[i]); 00286 Packet A1i = internal::ploadu(&A1[i]); 00287 // Packet A0i1 = internal::ploadu(&A0[i+PacketSize]); 00288 Packet Xi = internal::pload(&X[i]); 00289 Packet Bi = internal::pload/*u*/(&B[i]); 00290 // Packet Xi1 = internal::pload(&X[i+PacketSize]); 00291 // Packet Bi1 = internal::pload/*u*/(&B[i+PacketSize]); 00292 Xi = internal::padd(internal::padd(Xi, internal::pmul(ptmp0, A0i)), internal::pmul(ptmp1, A1i)); 00293 ptmp2 = internal::padd(ptmp2, internal::pmul(A0i, Bi)); 00294 ptmp3 = internal::padd(ptmp3, internal::pmul(A1i, Bi)); 00295 // Xi1 = internal::padd(Xi1, internal::pmul(ptmp1, A0i1)); 00296 // ptmp2 = internal::padd(ptmp2, internal::pmul(A0i1, Bi1)); 00297 // 00298 internal::pstore(&X[i],Xi); 00299 // internal::pstore(&X[i+PacketSize],Xi1); 00300 // asm( 00301 // "prefetchnta 64(%[A0],%[i],4) \n\t" 00302 // //"movups (%[A0],%[i],4), %%xmm8 \n\t" 00303 // "movsd (%[A0],%[i],4), %%xmm8 \n\t" 00304 // "movhps 8(%[A0],%[i],4), %%xmm8 \n\t" 00305 // // "movups 16(%[A0],%[i],4), %%xmm9 \n\t" 00306 // // "movups 64(%[A0],%[i],4), %%xmm15 \n\t" 00307 // "movaps (%[B], %[i],4), %%xmm12 \n\t" 00308 // // "movaps 16(%[B], %[i],4), %%xmm13 \n\t" 00309 // "movaps (%[X], %[i],4), %%xmm10 \n\t" 00310 // // "movaps 16(%[X], %[i],4), %%xmm11 \n\t" 00311 // 00312 // "mulps %%xmm8, %%xmm12 \n\t" 00313 // // "mulps %%xmm9, %%xmm13 \n\t" 00314 // 00315 // "mulps %[ptmp1], %%xmm8 \n\t" 00316 // "addps %%xmm12, %[ptmp2] \n\t" 00317 // "addps %%xmm8, %%xmm10 \n\t" 00318 // 00319 // 00320 // 00321 // 00322 // // "mulps %[ptmp1], %%xmm9 \n\t" 00323 // 00324 // // "addps %%xmm9, %%xmm11 \n\t" 00325 // // "addps %%xmm13, %[ptmp2] \n\t" 00326 // 00327 // "movaps %%xmm10, (%[X],%[i],4) \n\t" 00328 // // "movaps %%xmm11, 16(%[X],%[i],4) \n\t" 00329 // : 00330 // : [X] "r" (X), [i] "r" (i), [A0] "r" (A0), 00331 // [B] "r" (B), 00332 // [ptmp1] "x" (ptmp1), 00333 // [ptmp2] "x" (ptmp2) 00334 // : "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm15"); 00335 } 00336 asm("#end symv"); 00337 for (int i=alignedEnd; i<N; i++) { 00338 X[i] += t0 * A0[i] + t1 * A1[i]; 00339 t2 += A0[i] * B[i]; 00340 t3 += A1[i] * B[i]; 00341 } 00342 00343 00344 X[j] += t2 + internal::predux(ptmp2); 00345 X[j+1] += t3 + internal::predux(ptmp3); 00346 } 00347 for (int j=bound;j<N;j++) 00348 { 00349 register real* __restrict__ A0 = A + j*N; 00350 00351 real t1 = B[j]; 00352 real t2 = 0; 00353 X[j] += t1 * A0[j]; 00354 for (int i=j+1; i<N; i+=PacketSize) { 00355 X[i] += t1 * A0[i]; 00356 t2 += A0[i] * B[i]; 00357 } 00358 X[j] += t2; 00359 } 00360 00361 } 00362 00363 // static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) 00364 // { 00365 // asm("#begin matrix_vector_product"); 00366 // int AN = (N/PacketSize)*PacketSize; 00367 // int ANP = (AN/(2*PacketSize))*2*PacketSize; 00368 // int bound = (N/4)*4; 00369 // for (int i=0;i<N;i++) 00370 // X[i] = 0; 00371 // 00372 // for (int i=0;i<bound;i+=4) 00373 // { 00374 // real tmp0 = B[i]; 00375 // Packet ptmp0 = internal::pset1(tmp0); 00376 // real tmp1 = B[i+1]; 00377 // Packet ptmp1 = internal::pset1(tmp1); 00378 // real tmp2 = B[i+2]; 00379 // Packet ptmp2 = internal::pset1(tmp2); 00380 // real tmp3 = B[i+3]; 00381 // Packet ptmp3 = internal::pset1(tmp3); 00382 // int iN0 = i*N; 00383 // int iN1 = (i+1)*N; 00384 // int iN2 = (i+2)*N; 00385 // int iN3 = (i+3)*N; 00386 // if (AN>0) 00387 // { 00388 // // int aligned0 = (iN0 % PacketSize); 00389 // int aligned1 = (iN1 % PacketSize); 00390 // 00391 // if (aligned1==0) 00392 // { 00393 // for (int j = 0;j<AN;j+=PacketSize) 00394 // { 00395 // internal::pstore(&X[j], 00396 // internal::padd(internal::pload(&X[j]), 00397 // internal::padd( 00398 // internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pmul(ptmp1,internal::pload(&A[j+iN1]))), 00399 // internal::padd(internal::pmul(ptmp2,internal::pload(&A[j+iN2])),internal::pmul(ptmp3,internal::pload(&A[j+iN3]))) ))); 00400 // } 00401 // } 00402 // else if (aligned1==2) 00403 // { 00404 // for (int j = 0;j<AN;j+=PacketSize) 00405 // { 00406 // internal::pstore(&X[j], 00407 // internal::padd(internal::pload(&X[j]), 00408 // internal::padd( 00409 // internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+iN1]))), 00410 // internal::padd(internal::pmul(ptmp2,internal::pload(&A[j+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+iN3]))) ))); 00411 // } 00412 // } 00413 // else 00414 // { 00415 // for (int j = 0;j<ANP;j+=2*PacketSize) 00416 // { 00417 // internal::pstore(&X[j], 00418 // internal::padd(internal::pload(&X[j]), 00419 // internal::padd( 00420 // internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+iN1]))), 00421 // internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+iN3]))) ))); 00422 // 00423 // internal::pstore(&X[j+PacketSize], 00424 // internal::padd(internal::pload(&X[j+PacketSize]), 00425 // internal::padd( 00426 // internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+PacketSize+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+PacketSize+iN1]))), 00427 // internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+PacketSize+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+PacketSize+iN3]))) ))); 00428 // 00429 // // internal::pstore(&X[j+2*PacketSize], 00430 // // internal::padd(internal::pload(&X[j+2*PacketSize]), 00431 // // internal::padd( 00432 // // internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+2*PacketSize+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+2*PacketSize+iN1]))), 00433 // // internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+2*PacketSize+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+2*PacketSize+iN3]))) ))); 00434 // // 00435 // // internal::pstore(&X[j+3*PacketSize], 00436 // // internal::padd(internal::pload(&X[j+3*PacketSize]), 00437 // // internal::padd( 00438 // // internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+3*PacketSize+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+3*PacketSize+iN1]))), 00439 // // internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+3*PacketSize+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+3*PacketSize+iN3]))) ))); 00440 // 00441 // } 00442 // for (int j = ANP;j<AN;j+=PacketSize) 00443 // internal::pstore(&X[j], 00444 // internal::padd(internal::pload(&X[j]), 00445 // internal::padd( 00446 // internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+iN1]))), 00447 // internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+iN3]))) ))); 00448 // } 00449 // } 00450 // // process remaining scalars 00451 // for (int j=AN;j<N;j++) 00452 // X[j] += tmp0 * A[j+iN0] + tmp1 * A[j+iN1] + tmp2 * A[j+iN2] + tmp3 * A[j+iN3]; 00453 // } 00454 // for (int i=bound;i<N;i++) 00455 // { 00456 // real tmp0 = B[i]; 00457 // Packet ptmp0 = internal::pset1(tmp0); 00458 // int iN0 = i*N; 00459 // if (AN>0) 00460 // { 00461 // bool aligned0 = (iN0 % PacketSize) == 0; 00462 // if (aligned0) 00463 // for (int j = 0;j<AN;j+=PacketSize) 00464 // internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pload(&X[j]))); 00465 // else 00466 // for (int j = 0;j<AN;j+=PacketSize) 00467 // internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pload(&X[j]))); 00468 // } 00469 // // process remaining scalars 00470 // for (int j=AN;j<N;j++) 00471 // X[j] += tmp0 * A[j+iN0]; 00472 // } 00473 // asm("#end matrix_vector_product"); 00474 // } 00475 00476 // static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) 00477 // { 00478 // asm("#begin matrix_vector_product"); 00479 // int AN = (N/PacketSize)*PacketSize; 00480 // for (int i=0;i<N;i++) 00481 // X[i] = 0; 00482 // 00483 // for (int i=0;i<N;i+=2) 00484 // { 00485 // real tmp0 = B[i]; 00486 // Packet ptmp0 = internal::pset1(tmp0); 00487 // real tmp1 = B[i+1]; 00488 // Packet ptmp1 = internal::pset1(tmp1); 00489 // int iN0 = i*N; 00490 // int iN1 = (i+1)*N; 00491 // if (AN>0) 00492 // { 00493 // bool aligned0 = (iN0 % PacketSize) == 0; 00494 // bool aligned1 = (iN1 % PacketSize) == 0; 00495 // 00496 // if (aligned0 && aligned1) 00497 // { 00498 // for (int j = 0;j<AN;j+=PacketSize) 00499 // { 00500 // internal::pstore(&X[j], 00501 // internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])), 00502 // internal::padd(internal::pmul(ptmp1,internal::pload(&A[j+iN1])),internal::pload(&X[j])))); 00503 // } 00504 // } 00505 // else if (aligned0) 00506 // { 00507 // for (int j = 0;j<AN;j+=PacketSize) 00508 // { 00509 // internal::pstore(&X[j], 00510 // internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])), 00511 // internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+iN1])),internal::pload(&X[j])))); 00512 // } 00513 // } 00514 // else if (aligned1) 00515 // { 00516 // for (int j = 0;j<AN;j+=PacketSize) 00517 // { 00518 // internal::pstore(&X[j], 00519 // internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])), 00520 // internal::padd(internal::pmul(ptmp1,internal::pload(&A[j+iN1])),internal::pload(&X[j])))); 00521 // } 00522 // } 00523 // else 00524 // { 00525 // int ANP = (AN/(4*PacketSize))*4*PacketSize; 00526 // for (int j = 0;j<ANP;j+=4*PacketSize) 00527 // { 00528 // internal::pstore(&X[j], 00529 // internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])), 00530 // internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+iN1])),internal::pload(&X[j])))); 00531 // 00532 // internal::pstore(&X[j+PacketSize], 00533 // internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+PacketSize+iN0])), 00534 // internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+PacketSize+iN1])),internal::pload(&X[j+PacketSize])))); 00535 // 00536 // internal::pstore(&X[j+2*PacketSize], 00537 // internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+2*PacketSize+iN0])), 00538 // internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+2*PacketSize+iN1])),internal::pload(&X[j+2*PacketSize])))); 00539 // 00540 // internal::pstore(&X[j+3*PacketSize], 00541 // internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+3*PacketSize+iN0])), 00542 // internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+3*PacketSize+iN1])),internal::pload(&X[j+3*PacketSize])))); 00543 // } 00544 // for (int j = ANP;j<AN;j+=PacketSize) 00545 // internal::pstore(&X[j], 00546 // internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])), 00547 // internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+iN1])),internal::pload(&X[j])))); 00548 // } 00549 // } 00550 // // process remaining scalars 00551 // for (int j=AN;j<N;j++) 00552 // X[j] += tmp0 * A[j+iN0] + tmp1 * A[j+iN1]; 00553 // } 00554 // int remaining = (N/2)*2; 00555 // for (int i=remaining;i<N;i++) 00556 // { 00557 // real tmp0 = B[i]; 00558 // Packet ptmp0 = internal::pset1(tmp0); 00559 // int iN0 = i*N; 00560 // if (AN>0) 00561 // { 00562 // bool aligned0 = (iN0 % PacketSize) == 0; 00563 // if (aligned0) 00564 // for (int j = 0;j<AN;j+=PacketSize) 00565 // internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pload(&X[j]))); 00566 // else 00567 // for (int j = 0;j<AN;j+=PacketSize) 00568 // internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pload(&X[j]))); 00569 // } 00570 // // process remaining scalars 00571 // for (int j=AN;j<N;j++) 00572 // X[j] += tmp0 * A[j+iN0]; 00573 // } 00574 // asm("#end matrix_vector_product"); 00575 // } 00576 00577 // static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) 00578 // { 00579 // asm("#begin matrix_vector_product"); 00580 // int AN = (N/PacketSize)*PacketSize; 00581 // for (int i=0;i<N;i++) 00582 // X[i] = 0; 00583 // for (int i=0;i<N;i++) 00584 // { 00585 // real tmp = B[i]; 00586 // Packet ptmp = internal::pset1(tmp); 00587 // int iN = i*N; 00588 // if (AN>0) 00589 // { 00590 // bool aligned = (iN % PacketSize) == 0; 00591 // if (aligned) 00592 // { 00593 // #ifdef PEELING 00594 // Packet A0, A1, A2, X0, X1, X2; 00595 // int ANP = (AN/(8*PacketSize))*8*PacketSize; 00596 // for (int j = 0;j<ANP;j+=PacketSize*8) 00597 // { 00598 // A0 = internal::pload(&A[j+iN]); 00599 // X0 = internal::pload(&X[j]); 00600 // A1 = internal::pload(&A[j+PacketSize+iN]); 00601 // X1 = internal::pload(&X[j+PacketSize]); 00602 // A2 = internal::pload(&A[j+2*PacketSize+iN]); 00603 // X2 = internal::pload(&X[j+2*PacketSize]); 00604 // internal::pstore(&X[j], internal::padd(X0, internal::pmul(ptmp,A0))); 00605 // A0 = internal::pload(&A[j+3*PacketSize+iN]); 00606 // X0 = internal::pload(&X[j+3*PacketSize]); 00607 // internal::pstore(&X[j+PacketSize], internal::padd(internal::pload(&X1), internal::pmul(ptmp,A1))); 00608 // A1 = internal::pload(&A[j+4*PacketSize+iN]); 00609 // X1 = internal::pload(&X[j+4*PacketSize]); 00610 // internal::pstore(&X[j+2*PacketSize], internal::padd(internal::pload(&X2), internal::pmul(ptmp,A2))); 00611 // A2 = internal::pload(&A[j+5*PacketSize+iN]); 00612 // X2 = internal::pload(&X[j+5*PacketSize]); 00613 // internal::pstore(&X[j+3*PacketSize], internal::padd(internal::pload(&X0), internal::pmul(ptmp,A0))); 00614 // A0 = internal::pload(&A[j+6*PacketSize+iN]); 00615 // X0 = internal::pload(&X[j+6*PacketSize]); 00616 // internal::pstore(&X[j+4*PacketSize], internal::padd(internal::pload(&X1), internal::pmul(ptmp,A1))); 00617 // A1 = internal::pload(&A[j+7*PacketSize+iN]); 00618 // X1 = internal::pload(&X[j+7*PacketSize]); 00619 // internal::pstore(&X[j+5*PacketSize], internal::padd(internal::pload(&X2), internal::pmul(ptmp,A2))); 00620 // internal::pstore(&X[j+6*PacketSize], internal::padd(internal::pload(&X0), internal::pmul(ptmp,A0))); 00621 // internal::pstore(&X[j+7*PacketSize], internal::padd(internal::pload(&X1), internal::pmul(ptmp,A1))); 00622 // // 00623 // // internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::pload(&A[j+iN])))); 00624 // // internal::pstore(&X[j+PacketSize], internal::padd(internal::pload(&X[j+PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+PacketSize+iN])))); 00625 // // internal::pstore(&X[j+2*PacketSize], internal::padd(internal::pload(&X[j+2*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+2*PacketSize+iN])))); 00626 // // internal::pstore(&X[j+3*PacketSize], internal::padd(internal::pload(&X[j+3*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+3*PacketSize+iN])))); 00627 // // internal::pstore(&X[j+4*PacketSize], internal::padd(internal::pload(&X[j+4*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+4*PacketSize+iN])))); 00628 // // internal::pstore(&X[j+5*PacketSize], internal::padd(internal::pload(&X[j+5*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+5*PacketSize+iN])))); 00629 // // internal::pstore(&X[j+6*PacketSize], internal::padd(internal::pload(&X[j+6*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+6*PacketSize+iN])))); 00630 // // internal::pstore(&X[j+7*PacketSize], internal::padd(internal::pload(&X[j+7*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+7*PacketSize+iN])))); 00631 // } 00632 // for (int j = ANP;j<AN;j+=PacketSize) 00633 // internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::pload(&A[j+iN])))); 00634 // #else 00635 // for (int j = 0;j<AN;j+=PacketSize) 00636 // internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::pload(&A[j+iN])))); 00637 // #endif 00638 // } 00639 // else 00640 // { 00641 // #ifdef PEELING 00642 // int ANP = (AN/(8*PacketSize))*8*PacketSize; 00643 // for (int j = 0;j<ANP;j+=PacketSize*8) 00644 // { 00645 // internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::ploadu(&A[j+iN])))); 00646 // internal::pstore(&X[j+PacketSize], internal::padd(internal::pload(&X[j+PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+PacketSize+iN])))); 00647 // internal::pstore(&X[j+2*PacketSize], internal::padd(internal::pload(&X[j+2*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+2*PacketSize+iN])))); 00648 // internal::pstore(&X[j+3*PacketSize], internal::padd(internal::pload(&X[j+3*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+3*PacketSize+iN])))); 00649 // internal::pstore(&X[j+4*PacketSize], internal::padd(internal::pload(&X[j+4*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+4*PacketSize+iN])))); 00650 // internal::pstore(&X[j+5*PacketSize], internal::padd(internal::pload(&X[j+5*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+5*PacketSize+iN])))); 00651 // internal::pstore(&X[j+6*PacketSize], internal::padd(internal::pload(&X[j+6*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+6*PacketSize+iN])))); 00652 // internal::pstore(&X[j+7*PacketSize], internal::padd(internal::pload(&X[j+7*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+7*PacketSize+iN])))); 00653 // } 00654 // for (int j = ANP;j<AN;j+=PacketSize) 00655 // internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::ploadu(&A[j+iN])))); 00656 // #else 00657 // for (int j = 0;j<AN;j+=PacketSize) 00658 // internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::ploadu(&A[j+iN])))); 00659 // #endif 00660 // } 00661 // } 00662 // // process remaining scalars 00663 // for (int j=AN;j<N;j++) 00664 // X[j] += tmp * A[j+iN]; 00665 // } 00666 // asm("#end matrix_vector_product"); 00667 // } 00668 00669 static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) 00670 { 00671 int AN = (N/PacketSize)*PacketSize; 00672 int bound = (N/4)*4; 00673 for (int i=0;i<bound;i+=4) 00674 { 00675 real tmp0 = 0; 00676 Packet ptmp0 = internal::pset1(real(0)); 00677 real tmp1 = 0; 00678 Packet ptmp1 = internal::pset1(real(0)); 00679 real tmp2 = 0; 00680 Packet ptmp2 = internal::pset1(real(0)); 00681 real tmp3 = 0; 00682 Packet ptmp3 = internal::pset1(real(0)); 00683 int iN0 = i*N; 00684 int iN1 = (i+1)*N; 00685 int iN2 = (i+2)*N; 00686 int iN3 = (i+3)*N; 00687 if (AN>0) 00688 { 00689 int align1 = (iN1 % PacketSize); 00690 if (align1==0) 00691 { 00692 for (int j = 0;j<AN;j+=PacketSize) 00693 { 00694 Packet b = internal::pload(&B[j]); 00695 ptmp0 = internal::padd(ptmp0, internal::pmul(b, internal::pload(&A[j+iN0]))); 00696 ptmp1 = internal::padd(ptmp1, internal::pmul(b, internal::pload(&A[j+iN1]))); 00697 ptmp2 = internal::padd(ptmp2, internal::pmul(b, internal::pload(&A[j+iN2]))); 00698 ptmp3 = internal::padd(ptmp3, internal::pmul(b, internal::pload(&A[j+iN3]))); 00699 } 00700 } 00701 else if (align1==2) 00702 { 00703 for (int j = 0;j<AN;j+=PacketSize) 00704 { 00705 Packet b = internal::pload(&B[j]); 00706 ptmp0 = internal::padd(ptmp0, internal::pmul(b, internal::pload(&A[j+iN0]))); 00707 ptmp1 = internal::padd(ptmp1, internal::pmul(b, internal::ploadu(&A[j+iN1]))); 00708 ptmp2 = internal::padd(ptmp2, internal::pmul(b, internal::pload(&A[j+iN2]))); 00709 ptmp3 = internal::padd(ptmp3, internal::pmul(b, internal::ploadu(&A[j+iN3]))); 00710 } 00711 } 00712 else 00713 { 00714 for (int j = 0;j<AN;j+=PacketSize) 00715 { 00716 Packet b = internal::pload(&B[j]); 00717 ptmp0 = internal::padd(ptmp0, internal::pmul(b, internal::pload(&A[j+iN0]))); 00718 ptmp1 = internal::padd(ptmp1, internal::pmul(b, internal::ploadu(&A[j+iN1]))); 00719 ptmp2 = internal::padd(ptmp2, internal::pmul(b, internal::ploadu(&A[j+iN2]))); 00720 ptmp3 = internal::padd(ptmp3, internal::pmul(b, internal::ploadu(&A[j+iN3]))); 00721 } 00722 } 00723 tmp0 = internal::predux(ptmp0); 00724 tmp1 = internal::predux(ptmp1); 00725 tmp2 = internal::predux(ptmp2); 00726 tmp3 = internal::predux(ptmp3); 00727 } 00728 // process remaining scalars 00729 for (int j=AN;j<N;j++) 00730 { 00731 tmp0 += B[j] * A[j+iN0]; 00732 tmp1 += B[j] * A[j+iN1]; 00733 tmp2 += B[j] * A[j+iN2]; 00734 tmp3 += B[j] * A[j+iN3]; 00735 } 00736 X[i+0] = tmp0; 00737 X[i+1] = tmp1; 00738 X[i+2] = tmp2; 00739 X[i+3] = tmp3; 00740 } 00741 00742 for (int i=bound;i<N;i++) 00743 { 00744 real tmp0 = 0; 00745 Packet ptmp0 = internal::pset1(real(0)); 00746 int iN0 = i*N; 00747 if (AN>0) 00748 { 00749 if (iN0 % PacketSize==0) 00750 for (int j = 0;j<AN;j+=PacketSize) 00751 ptmp0 = internal::padd(ptmp0, internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN0]))); 00752 else 00753 for (int j = 0;j<AN;j+=PacketSize) 00754 ptmp0 = internal::padd(ptmp0, internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN0]))); 00755 tmp0 = internal::predux(ptmp0); 00756 } 00757 // process remaining scalars 00758 for (int j=AN;j<N;j++) 00759 tmp0 += B[j] * A[j+iN0]; 00760 X[i+0] = tmp0; 00761 } 00762 } 00763 00764 // static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) 00765 // { 00766 // int AN = (N/PacketSize)*PacketSize; 00767 // for (int i=0;i<N;i++) 00768 // X[i] = 0; 00769 // for (int i=0;i<N;i++) 00770 // { 00771 // real tmp = 0; 00772 // Packet ptmp = internal::pset1(real(0)); 00773 // int iN = i*N; 00774 // if (AN>0) 00775 // { 00776 // bool aligned = (iN % PacketSize) == 0; 00777 // if (aligned) 00778 // { 00779 // #ifdef PEELING 00780 // int ANP = (AN/(8*PacketSize))*8*PacketSize; 00781 // for (int j = 0;j<ANP;j+=PacketSize*8) 00782 // { 00783 // ptmp = 00784 // internal::padd(internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN])), 00785 // internal::padd(internal::pmul(internal::pload(&B[j+PacketSize]), internal::pload(&A[j+PacketSize+iN])), 00786 // internal::padd(internal::pmul(internal::pload(&B[j+2*PacketSize]), internal::pload(&A[j+2*PacketSize+iN])), 00787 // internal::padd(internal::pmul(internal::pload(&B[j+3*PacketSize]), internal::pload(&A[j+3*PacketSize+iN])), 00788 // internal::padd(internal::pmul(internal::pload(&B[j+4*PacketSize]), internal::pload(&A[j+4*PacketSize+iN])), 00789 // internal::padd(internal::pmul(internal::pload(&B[j+5*PacketSize]), internal::pload(&A[j+5*PacketSize+iN])), 00790 // internal::padd(internal::pmul(internal::pload(&B[j+6*PacketSize]), internal::pload(&A[j+6*PacketSize+iN])), 00791 // internal::padd(internal::pmul(internal::pload(&B[j+7*PacketSize]), internal::pload(&A[j+7*PacketSize+iN])), 00792 // ptmp)))))))); 00793 // } 00794 // for (int j = ANP;j<AN;j+=PacketSize) 00795 // ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN]))); 00796 // #else 00797 // for (int j = 0;j<AN;j+=PacketSize) 00798 // ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN]))); 00799 // #endif 00800 // } 00801 // else 00802 // { 00803 // #ifdef PEELING 00804 // int ANP = (AN/(8*PacketSize))*8*PacketSize; 00805 // for (int j = 0;j<ANP;j+=PacketSize*8) 00806 // { 00807 // ptmp = 00808 // internal::padd(internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN])), 00809 // internal::padd(internal::pmul(internal::pload(&B[j+PacketSize]), internal::ploadu(&A[j+PacketSize+iN])), 00810 // internal::padd(internal::pmul(internal::pload(&B[j+2*PacketSize]), internal::ploadu(&A[j+2*PacketSize+iN])), 00811 // internal::padd(internal::pmul(internal::pload(&B[j+3*PacketSize]), internal::ploadu(&A[j+3*PacketSize+iN])), 00812 // internal::padd(internal::pmul(internal::pload(&B[j+4*PacketSize]), internal::ploadu(&A[j+4*PacketSize+iN])), 00813 // internal::padd(internal::pmul(internal::pload(&B[j+5*PacketSize]), internal::ploadu(&A[j+5*PacketSize+iN])), 00814 // internal::padd(internal::pmul(internal::pload(&B[j+6*PacketSize]), internal::ploadu(&A[j+6*PacketSize+iN])), 00815 // internal::padd(internal::pmul(internal::pload(&B[j+7*PacketSize]), internal::ploadu(&A[j+7*PacketSize+iN])), 00816 // ptmp)))))))); 00817 // } 00818 // for (int j = ANP;j<AN;j+=PacketSize) 00819 // ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN]))); 00820 // #else 00821 // for (int j = 0;j<AN;j+=PacketSize) 00822 // ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN]))); 00823 // #endif 00824 // } 00825 // tmp = internal::predux(ptmp); 00826 // } 00827 // // process remaining scalars 00828 // for (int j=AN;j<N;j++) 00829 // tmp += B[j] * A[j+iN]; 00830 // X[i] = tmp; 00831 // } 00832 // } 00833 00834 static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){ 00835 int AN = (N/PacketSize)*PacketSize; 00836 if (AN>0) 00837 { 00838 Packet pcoef = internal::pset1(coef); 00839 #ifdef PEELING 00840 const int peelSize = 3; 00841 int ANP = (AN/(peelSize*PacketSize))*peelSize*PacketSize; 00842 float* X1 = X + PacketSize; 00843 float* Y1 = Y + PacketSize; 00844 float* X2 = X + 2*PacketSize; 00845 float* Y2 = Y + 2*PacketSize; 00846 Packet x0,x1,x2,y0,y1,y2; 00847 for (int j = 0;j<ANP;j+=PacketSize*peelSize) 00848 { 00849 x0 = internal::pload(X+j); 00850 x1 = internal::pload(X1+j); 00851 x2 = internal::pload(X2+j); 00852 00853 y0 = internal::pload(Y+j); 00854 y1 = internal::pload(Y1+j); 00855 y2 = internal::pload(Y2+j); 00856 00857 y0 = internal::pmadd(pcoef, x0, y0); 00858 y1 = internal::pmadd(pcoef, x1, y1); 00859 y2 = internal::pmadd(pcoef, x2, y2); 00860 00861 internal::pstore(Y+j, y0); 00862 internal::pstore(Y1+j, y1); 00863 internal::pstore(Y2+j, y2); 00864 // internal::pstore(&Y[j+2*PacketSize], internal::padd(internal::pload(&Y[j+2*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+2*PacketSize])))); 00865 // internal::pstore(&Y[j+3*PacketSize], internal::padd(internal::pload(&Y[j+3*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+3*PacketSize])))); 00866 // internal::pstore(&Y[j+4*PacketSize], internal::padd(internal::pload(&Y[j+4*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+4*PacketSize])))); 00867 // internal::pstore(&Y[j+5*PacketSize], internal::padd(internal::pload(&Y[j+5*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+5*PacketSize])))); 00868 // internal::pstore(&Y[j+6*PacketSize], internal::padd(internal::pload(&Y[j+6*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+6*PacketSize])))); 00869 // internal::pstore(&Y[j+7*PacketSize], internal::padd(internal::pload(&Y[j+7*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+7*PacketSize])))); 00870 } 00871 for (int j = ANP;j<AN;j+=PacketSize) 00872 internal::pstore(&Y[j], internal::padd(internal::pload(&Y[j]), internal::pmul(pcoef,internal::pload(&X[j])))); 00873 #else 00874 for (int j = 0;j<AN;j+=PacketSize) 00875 internal::pstore(&Y[j], internal::padd(internal::pload(&Y[j]), internal::pmul(pcoef,internal::pload(&X[j])))); 00876 #endif 00877 } 00878 // process remaining scalars 00879 for (int i=AN;i<N;i++) 00880 Y[i] += coef * X[i]; 00881 } 00882 00883 00884 }; 00885 00886 #endif