00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
00026 #define EIGEN_CACHE_FRIENDLY_PRODUCT_H
00027
00028 template <int L2MemorySize,typename Scalar>
00029 struct ei_L2_block_traits {
00030 enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
00031 };
00032
00033 #ifndef EIGEN_EXTERN_INSTANTIATIONS
00034
00035 template<typename Scalar>
00036 void ei_cache_friendly_product(
00037 int _rows, int _cols, int depth,
00038 bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
00039 bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
00040 bool resRowMajor, Scalar* res, int resStride)
00041 {
00042 const Scalar* EIGEN_RESTRICT lhs;
00043 const Scalar* EIGEN_RESTRICT rhs;
00044 int lhsStride, rhsStride, rows, cols;
00045 bool lhsRowMajor;
00046
00047 if (resRowMajor)
00048 {
00049 lhs = _rhs;
00050 rhs = _lhs;
00051 lhsStride = _rhsStride;
00052 rhsStride = _lhsStride;
00053 cols = _rows;
00054 rows = _cols;
00055 lhsRowMajor = !_rhsRowMajor;
00056 ei_assert(_lhsRowMajor);
00057 }
00058 else
00059 {
00060 lhs = _lhs;
00061 rhs = _rhs;
00062 lhsStride = _lhsStride;
00063 rhsStride = _rhsStride;
00064 rows = _rows;
00065 cols = _cols;
00066 lhsRowMajor = _lhsRowMajor;
00067 ei_assert(!_rhsRowMajor);
00068 }
00069
00070 typedef typename ei_packet_traits<Scalar>::type PacketType;
00071
00072 enum {
00073 PacketSize = sizeof(PacketType)/sizeof(Scalar),
00074 #if (defined __i386__)
00075
00076
00077 MaxBlockRows = 4,
00078 MaxBlockRows_ClampingMask = 0xFFFFFC,
00079 #else
00080 MaxBlockRows = 8,
00081 MaxBlockRows_ClampingMask = 0xFFFFF8,
00082 #endif
00083
00084 MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
00085 };
00086
00087 const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (std::size_t(res)%16==0));
00088
00089 const int remainingSize = depth % PacketSize;
00090 const int size = depth - remainingSize;
00091 const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
00092 const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
00093 const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
00094 const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
00095 const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (std::size_t(rhs)%16!=0));
00096 Scalar* EIGEN_RESTRICT block = 0;
00097 const int allocBlockSize = l2BlockRows*size;
00098 block = ei_aligned_stack_new(Scalar, allocBlockSize);
00099 Scalar* EIGEN_RESTRICT rhsCopy
00100 = ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned);
00101
00102
00103 for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
00104 {
00105 const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
00106 const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask;
00107 const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW;
00108
00109
00110
00111 int count = 0;
00112
00113
00114 for(int l2k=0; l2k<size; l2k+=l2BlockSize)
00115 {
00116 const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
00117
00118 for (int i = l2i; i<l2blockRowEndBW; i+=MaxBlockRows)
00119 {
00120
00121
00122
00123 for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
00124 {
00125
00126
00127 if (lhsRowMajor)
00128 {
00129 for (int w=0; w<MaxBlockRows; ++w)
00130 for (int s=0; s<PacketSize; ++s)
00131 block[count++] = lhs[(i+w)*lhsStride + (k+s)];
00132 }
00133 else
00134 {
00135 for (int w=0; w<MaxBlockRows; ++w)
00136 for (int s=0; s<PacketSize; ++s)
00137 block[count++] = lhs[(i+w) + (k+s)*lhsStride];
00138 }
00139 }
00140 }
00141 if (l2blockRemainingRows>0)
00142 {
00143 for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
00144 {
00145 if (lhsRowMajor)
00146 {
00147 for (int w=0; w<l2blockRemainingRows; ++w)
00148 for (int s=0; s<PacketSize; ++s)
00149 block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)];
00150 }
00151 else
00152 {
00153 for (int w=0; w<l2blockRemainingRows; ++w)
00154 for (int s=0; s<PacketSize; ++s)
00155 block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride];
00156 }
00157 }
00158 }
00159 }
00160
00161 for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
00162 {
00163 int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
00164
00165 for(int l2k=0; l2k<size; l2k+=l2BlockSize)
00166 {
00167
00168 int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
00169
00170
00171 if (needRhsCopy)
00172 for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
00173 {
00174 ei_internal_assert(l2BlockSizeAligned*(l1j-l2j)+(l2blockSizeEnd-l2k) < l2BlockSizeAligned*l2BlockSizeAligned);
00175 std::memcpy(rhsCopy+l2BlockSizeAligned*(l1j-l2j),&(rhs[l1j*rhsStride+l2k]),(l2blockSizeEnd-l2k)*sizeof(Scalar));
00176 }
00177
00178
00179 for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
00180 {
00181 int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
00182 const Scalar* EIGEN_RESTRICT localB = &block[offsetblock];
00183
00184 for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
00185 {
00186 const Scalar* EIGEN_RESTRICT rhsColumn;
00187 if (needRhsCopy)
00188 rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
00189 else
00190 rhsColumn = &(rhs[l1j*rhsStride]);
00191
00192 PacketType dst[MaxBlockRows];
00193 dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
00194 if (MaxBlockRows==8)
00195 dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
00196
00197 PacketType tmp;
00198
00199 for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
00200 {
00201 tmp = ei_ploadu(&rhsColumn[k]);
00202 PacketType A0, A1, A2, A3, A4, A5;
00203 A0 = ei_pload(localB + k*MaxBlockRows);
00204 A1 = ei_pload(localB + k*MaxBlockRows+1*PacketSize);
00205 A2 = ei_pload(localB + k*MaxBlockRows+2*PacketSize);
00206 A3 = ei_pload(localB + k*MaxBlockRows+3*PacketSize);
00207 if (MaxBlockRows==8) A4 = ei_pload(localB + k*MaxBlockRows+4*PacketSize);
00208 if (MaxBlockRows==8) A5 = ei_pload(localB + k*MaxBlockRows+5*PacketSize);
00209 dst[0] = ei_pmadd(tmp, A0, dst[0]);
00210 if (MaxBlockRows==8) A0 = ei_pload(localB + k*MaxBlockRows+6*PacketSize);
00211 dst[1] = ei_pmadd(tmp, A1, dst[1]);
00212 if (MaxBlockRows==8) A1 = ei_pload(localB + k*MaxBlockRows+7*PacketSize);
00213 dst[2] = ei_pmadd(tmp, A2, dst[2]);
00214 dst[3] = ei_pmadd(tmp, A3, dst[3]);
00215 if (MaxBlockRows==8)
00216 {
00217 dst[4] = ei_pmadd(tmp, A4, dst[4]);
00218 dst[5] = ei_pmadd(tmp, A5, dst[5]);
00219 dst[6] = ei_pmadd(tmp, A0, dst[6]);
00220 dst[7] = ei_pmadd(tmp, A1, dst[7]);
00221 }
00222 }
00223
00224 Scalar* EIGEN_RESTRICT localRes = &(res[l1i + l1j*resStride]);
00225
00226 if (PacketSize>1 && resIsAligned)
00227 {
00228
00229 ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0])));
00230 if (PacketSize==2)
00231 ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
00232 if (MaxBlockRows==8)
00233 {
00234 ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
00235 if (PacketSize==2)
00236 ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
00237 }
00238 }
00239 else
00240 {
00241
00242 localRes[0] += ei_predux(dst[0]);
00243 localRes[1] += ei_predux(dst[1]);
00244 localRes[2] += ei_predux(dst[2]);
00245 localRes[3] += ei_predux(dst[3]);
00246 if (MaxBlockRows==8)
00247 {
00248 localRes[4] += ei_predux(dst[4]);
00249 localRes[5] += ei_predux(dst[5]);
00250 localRes[6] += ei_predux(dst[6]);
00251 localRes[7] += ei_predux(dst[7]);
00252 }
00253 }
00254 }
00255 }
00256 if (l2blockRemainingRows>0)
00257 {
00258 int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
00259 const Scalar* localB = &block[offsetblock];
00260
00261 for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
00262 {
00263 const Scalar* EIGEN_RESTRICT rhsColumn;
00264 if (needRhsCopy)
00265 rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
00266 else
00267 rhsColumn = &(rhs[l1j*rhsStride]);
00268
00269 PacketType dst[MaxBlockRows];
00270 dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
00271 if (MaxBlockRows==8)
00272 dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
00273
00274
00275 PacketType tmp;
00276
00277 for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
00278 {
00279 tmp = ei_pload(&rhsColumn[k]);
00280
00281 dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows ])), dst[0]);
00282 if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]);
00283 if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
00284 if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
00285 if (MaxBlockRows==8)
00286 {
00287 if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
00288 if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
00289 if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
00290 if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
00291 }
00292 }
00293
00294 Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]);
00295
00296
00297 localRes[0] += ei_predux(dst[0]);
00298 if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
00299 if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
00300 if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
00301 if (MaxBlockRows==8)
00302 {
00303 if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
00304 if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
00305 if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
00306 if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
00307 }
00308
00309 }
00310 }
00311 }
00312 }
00313 }
00314 if (PacketSize>1 && remainingSize)
00315 {
00316 if (lhsRowMajor)
00317 {
00318 for (int j=0; j<cols; ++j)
00319 for (int i=0; i<rows; ++i)
00320 {
00321 Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
00322
00323 for (int k=1; k<remainingSize; ++k)
00324 tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
00325 res[i+j*resStride] += tmp;
00326 }
00327 }
00328 else
00329 {
00330 for (int j=0; j<cols; ++j)
00331 for (int i=0; i<rows; ++i)
00332 {
00333 Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size];
00334 for (int k=1; k<remainingSize; ++k)
00335 tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k];
00336 res[i+j*resStride] += tmp;
00337 }
00338 }
00339 }
00340
00341 ei_aligned_stack_delete(Scalar, block, allocBlockSize);
00342 ei_aligned_stack_delete(Scalar, rhsCopy, l2BlockSizeAligned*l2BlockSizeAligned);
00343 }
00344
00345 #endif // EIGEN_EXTERN_INSTANTIATIONS
00346
00347
00348
00349
00350
00351
00352
00353
00354 template<typename Scalar, typename RhsType>
00355 EIGEN_DONT_INLINE void ei_cache_friendly_product_colmajor_times_vector(
00356 int size,
00357 const Scalar* lhs, int lhsStride,
00358 const RhsType& rhs,
00359 Scalar* res)
00360 {
00361 #ifdef _EIGEN_ACCUMULATE_PACKETS
00362 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00363 #endif
00364 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
00365 ei_pstore(&res[j], \
00366 ei_padd(ei_pload(&res[j]), \
00367 ei_padd( \
00368 ei_padd(ei_pmul(ptmp0,EIGEN_CAT(ei_ploa , A0)(&lhs0[j])), \
00369 ei_pmul(ptmp1,EIGEN_CAT(ei_ploa , A13)(&lhs1[j]))), \
00370 ei_padd(ei_pmul(ptmp2,EIGEN_CAT(ei_ploa , A2)(&lhs2[j])), \
00371 ei_pmul(ptmp3,EIGEN_CAT(ei_ploa , A13)(&lhs3[j]))) )))
00372
00373 typedef typename ei_packet_traits<Scalar>::type Packet;
00374 const int PacketSize = sizeof(Packet)/sizeof(Scalar);
00375
00376 enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
00377 const int columnsAtOnce = 4;
00378 const int peels = 2;
00379 const int PacketAlignedMask = PacketSize-1;
00380 const int PeelAlignedMask = PacketSize*peels-1;
00381
00382
00383
00384 const int alignedStart = ei_alignmentOffset(res,size);
00385 const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
00386 const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
00387
00388 const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
00389 int alignmentPattern = alignmentStep==0 ? AllAligned
00390 : alignmentStep==(PacketSize/2) ? EvenAligned
00391 : FirstAligned;
00392
00393
00394 const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
00395
00396
00397 int skipColumns = 0;
00398 if (PacketSize>1)
00399 {
00400 ei_internal_assert(std::size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);
00401
00402 while (skipColumns<PacketSize &&
00403 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%PacketSize))
00404 ++skipColumns;
00405 if (skipColumns==PacketSize)
00406 {
00407
00408 alignmentPattern = NoneAligned;
00409 skipColumns = 0;
00410 }
00411 else
00412 {
00413 skipColumns = std::min(skipColumns,rhs.size());
00414
00415 }
00416
00417 ei_internal_assert((alignmentPattern==NoneAligned) || (std::size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
00418 }
00419
00420 int offset1 = (FirstAligned && alignmentStep==1?3:1);
00421 int offset3 = (FirstAligned && alignmentStep==1?1:3);
00422
00423 int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
00424 for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
00425 {
00426 Packet ptmp0 = ei_pset1(rhs[i]), ptmp1 = ei_pset1(rhs[i+offset1]),
00427 ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+offset3]);
00428
00429
00430 const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00431 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00432
00433 if (PacketSize>1)
00434 {
00435
00436
00437 for (int j=0; j<alignedStart; ++j)
00438 res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
00439
00440 if (alignedSize>alignedStart)
00441 {
00442 switch(alignmentPattern)
00443 {
00444 case AllAligned:
00445 for (int j = alignedStart; j<alignedSize; j+=PacketSize)
00446 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00447 break;
00448 case EvenAligned:
00449 for (int j = alignedStart; j<alignedSize; j+=PacketSize)
00450 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00451 break;
00452 case FirstAligned:
00453 if(peels>1)
00454 {
00455 Packet A00, A01, A02, A03, A10, A11, A12, A13;
00456
00457 A01 = ei_pload(&lhs1[alignedStart-1]);
00458 A02 = ei_pload(&lhs2[alignedStart-2]);
00459 A03 = ei_pload(&lhs3[alignedStart-3]);
00460
00461 for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
00462 {
00463 A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11);
00464 A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
00465 A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);
00466
00467 A00 = ei_pload (&lhs0[j]);
00468 A10 = ei_pload (&lhs0[j+PacketSize]);
00469 A00 = ei_pmadd(ptmp0, A00, ei_pload(&res[j]));
00470 A10 = ei_pmadd(ptmp0, A10, ei_pload(&res[j+PacketSize]));
00471
00472 A00 = ei_pmadd(ptmp1, A01, A00);
00473 A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
00474 A00 = ei_pmadd(ptmp2, A02, A00);
00475 A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
00476 A00 = ei_pmadd(ptmp3, A03, A00);
00477 ei_pstore(&res[j],A00);
00478 A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);
00479 A10 = ei_pmadd(ptmp1, A11, A10);
00480 A10 = ei_pmadd(ptmp2, A12, A10);
00481 A10 = ei_pmadd(ptmp3, A13, A10);
00482 ei_pstore(&res[j+PacketSize],A10);
00483 }
00484 }
00485 for (int j = peeledSize; j<alignedSize; j+=PacketSize)
00486 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00487 break;
00488 default:
00489 for (int j = alignedStart; j<alignedSize; j+=PacketSize)
00490 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00491 break;
00492 }
00493 }
00494 }
00495
00496
00497 for (int j=alignedSize; j<size; ++j)
00498 res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
00499 }
00500
00501
00502 int end = rhs.size();
00503 int start = columnBound;
00504 do
00505 {
00506 for (int i=start; i<end; ++i)
00507 {
00508 Packet ptmp0 = ei_pset1(rhs[i]);
00509 const Scalar* lhs0 = lhs + i*lhsStride;
00510
00511 if (PacketSize>1)
00512 {
00513
00514
00515 for (int j=0; j<alignedStart; ++j)
00516 res[j] += ei_pfirst(ptmp0) * lhs0[j];
00517
00518
00519 if ((std::size_t(lhs0+alignedStart)%sizeof(Packet))==0)
00520 for (int j = alignedStart;j<alignedSize;j+=PacketSize)
00521 ei_pstore(&res[j], ei_pmadd(ptmp0,ei_pload(&lhs0[j]),ei_pload(&res[j])));
00522 else
00523 for (int j = alignedStart;j<alignedSize;j+=PacketSize)
00524 ei_pstore(&res[j], ei_pmadd(ptmp0,ei_ploadu(&lhs0[j]),ei_pload(&res[j])));
00525 }
00526
00527
00528 for (int j=alignedSize; j<size; ++j)
00529 res[j] += ei_pfirst(ptmp0) * lhs0[j];
00530 }
00531 if (skipColumns)
00532 {
00533 start = 0;
00534 end = skipColumns;
00535 skipColumns = 0;
00536 }
00537 else
00538 break;
00539 } while(PacketSize>1);
00540 #undef _EIGEN_ACCUMULATE_PACKETS
00541 }
00542
00543
00544 template<typename Scalar, typename ResType>
00545 EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
00546 const Scalar* lhs, int lhsStride,
00547 const Scalar* rhs, int rhsSize,
00548 ResType& res)
00549 {
00550 #ifdef _EIGEN_ACCUMULATE_PACKETS
00551 #error _EIGEN_ACCUMULATE_PACKETS has already been defined
00552 #endif
00553
00554 #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
00555 Packet b = ei_pload(&rhs[j]); \
00556 ptmp0 = ei_pmadd(b, EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), ptmp0); \
00557 ptmp1 = ei_pmadd(b, EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), ptmp1); \
00558 ptmp2 = ei_pmadd(b, EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), ptmp2); \
00559 ptmp3 = ei_pmadd(b, EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), ptmp3); }
00560
00561 typedef typename ei_packet_traits<Scalar>::type Packet;
00562 const int PacketSize = sizeof(Packet)/sizeof(Scalar);
00563
00564 enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
00565 const int rowsAtOnce = 4;
00566 const int peels = 2;
00567 const int PacketAlignedMask = PacketSize-1;
00568 const int PeelAlignedMask = PacketSize*peels-1;
00569 const int size = rhsSize;
00570
00571
00572
00573 const int alignedStart = ei_alignmentOffset(rhs, size);
00574 const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
00575 const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
00576
00577 const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
00578 int alignmentPattern = alignmentStep==0 ? AllAligned
00579 : alignmentStep==(PacketSize/2) ? EvenAligned
00580 : FirstAligned;
00581
00582
00583 const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
00584
00585
00586 int skipRows = 0;
00587 if (PacketSize>1)
00588 {
00589 ei_internal_assert(std::size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);
00590
00591 while (skipRows<PacketSize &&
00592 alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize))
00593 ++skipRows;
00594 if (skipRows==PacketSize)
00595 {
00596
00597 alignmentPattern = NoneAligned;
00598 skipRows = 0;
00599 }
00600 else
00601 {
00602 skipRows = std::min(skipRows,res.size());
00603
00604 }
00605 ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1
00606 || (std::size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
00607 }
00608
00609 int offset1 = (FirstAligned && alignmentStep==1?3:1);
00610 int offset3 = (FirstAligned && alignmentStep==1?1:3);
00611
00612 int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
00613 for (int i=skipRows; i<rowBound; i+=rowsAtOnce)
00614 {
00615 Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
00616
00617
00618 const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
00619 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
00620
00621 if (PacketSize>1)
00622 {
00623
00624 Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));
00625
00626
00627
00628 for (int j=0; j<alignedStart; ++j)
00629 {
00630 Scalar b = rhs[j];
00631 tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j];
00632 }
00633
00634 if (alignedSize>alignedStart)
00635 {
00636 switch(alignmentPattern)
00637 {
00638 case AllAligned:
00639 for (int j = alignedStart; j<alignedSize; j+=PacketSize)
00640 _EIGEN_ACCUMULATE_PACKETS(d,d,d);
00641 break;
00642 case EvenAligned:
00643 for (int j = alignedStart; j<alignedSize; j+=PacketSize)
00644 _EIGEN_ACCUMULATE_PACKETS(d,du,d);
00645 break;
00646 case FirstAligned:
00647 if (peels>1)
00648 {
00649
00650
00651
00652
00653
00654
00655 Packet A01, A02, A03, b, A11, A12, A13;
00656 A01 = ei_pload(&lhs1[alignedStart-1]);
00657 A02 = ei_pload(&lhs2[alignedStart-2]);
00658 A03 = ei_pload(&lhs3[alignedStart-3]);
00659
00660 for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
00661 {
00662 b = ei_pload(&rhs[j]);
00663 A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11);
00664 A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
00665 A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);
00666
00667 ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j]), ptmp0);
00668 ptmp1 = ei_pmadd(b, A01, ptmp1);
00669 A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
00670 ptmp2 = ei_pmadd(b, A02, ptmp2);
00671 A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
00672 ptmp3 = ei_pmadd(b, A03, ptmp3);
00673 A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);
00674
00675 b = ei_pload(&rhs[j+PacketSize]);
00676 ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j+PacketSize]), ptmp0);
00677 ptmp1 = ei_pmadd(b, A11, ptmp1);
00678 ptmp2 = ei_pmadd(b, A12, ptmp2);
00679 ptmp3 = ei_pmadd(b, A13, ptmp3);
00680 }
00681 }
00682 for (int j = peeledSize; j<alignedSize; j+=PacketSize)
00683 _EIGEN_ACCUMULATE_PACKETS(d,du,du);
00684 break;
00685 default:
00686 for (int j = alignedStart; j<alignedSize; j+=PacketSize)
00687 _EIGEN_ACCUMULATE_PACKETS(du,du,du);
00688 break;
00689 }
00690 tmp0 += ei_predux(ptmp0);
00691 tmp1 += ei_predux(ptmp1);
00692 tmp2 += ei_predux(ptmp2);
00693 tmp3 += ei_predux(ptmp3);
00694 }
00695 }
00696
00697
00698
00699 for (int j=alignedSize; j<size; ++j)
00700 {
00701 Scalar b = rhs[j];
00702 tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j];
00703 }
00704 res[i] += tmp0; res[i+offset1] += tmp1; res[i+2] += tmp2; res[i+offset3] += tmp3;
00705 }
00706
00707
00708 int end = res.size();
00709 int start = rowBound;
00710 do
00711 {
00712 for (int i=start; i<end; ++i)
00713 {
00714 Scalar tmp0 = Scalar(0);
00715 Packet ptmp0 = ei_pset1(tmp0);
00716 const Scalar* lhs0 = lhs + i*lhsStride;
00717
00718
00719 for (int j=0; j<alignedStart; ++j)
00720 tmp0 += rhs[j] * lhs0[j];
00721
00722 if (alignedSize>alignedStart)
00723 {
00724
00725 if ((std::size_t(lhs0+alignedStart)%sizeof(Packet))==0)
00726 for (int j = alignedStart;j<alignedSize;j+=PacketSize)
00727 ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_pload(&lhs0[j]), ptmp0);
00728 else
00729 for (int j = alignedStart;j<alignedSize;j+=PacketSize)
00730 ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_ploadu(&lhs0[j]), ptmp0);
00731 tmp0 += ei_predux(ptmp0);
00732 }
00733
00734
00735
00736 for (int j=alignedSize; j<size; ++j)
00737 tmp0 += rhs[j] * lhs0[j];
00738 res[i] += tmp0;
00739 }
00740 if (skipRows)
00741 {
00742 start = 0;
00743 end = skipRows;
00744 skipRows = 0;
00745 }
00746 else
00747 break;
00748 } while(PacketSize>1);
00749
00750 #undef _EIGEN_ACCUMULATE_PACKETS
00751 }
00752
00753 #endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H