GeneralMatrixVector.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which allows us to both reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic: C += alpha * A * B
 * |  A  |  B  |alpha| comments
 * |real |cplx |cplx | no vectorization
 * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 * |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
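/* Note (added for clarity): in every case the kernel computes res += alpha * lhs * rhs,
 * where lhs is a column-major rows x cols matrix whose columns are lhsStride scalars
 * apart, rhs is accessed with increment rhsIncr, and res is assumed to be contiguous
 * (resIncr==1, enforced by the assert at the top of run()).
 */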
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
typedef typename packet_traits<ResScalar>::type  _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index
  #ifdef EIGEN_INTERNAL_DEBUGGING
    resIncr
  #endif
  , RhsScalar alpha);
};
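
// --- Illustrative usage sketch, not part of the original Eigen source: how a plain
// column-major GEMV  res += alpha * A * x  would map onto the kernel declared above.
// The helper name example_colmajor_gemv and its parameter names are hypothetical and
// for exposition only; real callers reach this kernel through Eigen's product
// evaluators, and the trailing 0 passed for Version is likewise an assumption.
template<typename Scalar, typename Index>
inline void example_colmajor_gemv(Index rows, Index cols,
                                  const Scalar* A, Index lhsStride, // column-major storage, lhsStride >= rows
                                  const Scalar* x,                  // rhs vector, unit increment
                                  Scalar* y,                        // result vector, assumed contiguous
                                  Scalar alpha)
{
  general_matrix_vector_product<Index,Scalar,ColMajor,false,Scalar,false,0>
    ::run(rows, cols, A, lhsStride, x, /*rhsIncr=*/1, y, /*resIncr=*/1, alpha);
}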

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index
  #ifdef EIGEN_INTERNAL_DEBUGGING
    resIncr
  #endif
  , RhsScalar alpha)
{
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
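  // Note on the helper macro below (added for clarity): _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2)
  // updates one packet of the result, res[j..j+ResPacketSize-1], with the contributions of
  // the four current columns lhs0..lhs3. The arguments select the load flavour per column
  // via token pasting: 'd' expands to pload (aligned load) and 'du' to ploadu (unaligned);
  // A0 applies to lhs0, A2 to lhs2, and A13 to both lhs1 and lhs3, which always use the
  // same flavour.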
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),    ptmp0), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]),   ptmp1)), \
          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),    ptmp2), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]),   ptmp3)) )))

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
//  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;
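  // Note (added for clarity): alignmentStep is the amount, in scalars and modulo the packet
  // size, by which the first aligned position inside a column shifts from one column to the
  // next. For example, with 4-wide float packets: a lhsStride that is a multiple of 4 gives
  // alignmentStep==0 (AllAligned, every column starts on the same aligned grid); lhsStride==6
  // gives alignmentStep==2 (EvenAligned, columns alternate between aligned and half-packet
  // offsets); and lhsStride==5 gives alignmentStep==3 (FirstAligned, only the first of the
  // four columns processed together can use aligned loads).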

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find out how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                          || (skipColumns + columnsAtOnce >= cols)
                          || LhsPacketSize > size
                          || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps the compiler a lot in generating better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,         *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride,     *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
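              // Note (added for clarity): as in the row-major kernel below, this peeled loop
              // processes two result packets per iteration and emulates the unaligned loads
              // of lhs1/lhs2/lhs3 with palign<1>/<2>/<3> applied to pairs of overlapping
              // aligned loads, which is much cheaper than plain unaligned loads on most
              // SIMD targets.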
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
  #undef _EIGEN_ACCUMULATE_PACKETS
}

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which allows us to both reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
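/* Note (added for clarity): this kernel computes res[i*resIncr] += alpha * dot(row_i(lhs), rhs)
 * for each row i. Four rows are handled together: their partial dot products are accumulated
 * in packet registers (ptmp0..ptmp3) and reduced to scalars with predux() before being scaled
 * by alpha and added to the result.
 */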
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
typedef typename packet_traits<ResScalar>::type  _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha);
};
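
// --- Illustrative usage sketch, not part of the original Eigen source: how a plain
// row-major GEMV  res += alpha * A * x  would map onto the kernel declared above.
// The helper name example_rowmajor_gemv and its parameter names are hypothetical and
// for exposition only; the trailing 0 passed for Version is likewise an assumption.
template<typename Scalar, typename Index>
inline void example_rowmajor_gemv(Index rows, Index cols,
                                  const Scalar* A, Index lhsStride, // row-major storage, lhsStride >= cols
                                  const Scalar* x,                  // rhs vector, must be contiguous (rhsIncr==1)
                                  Scalar* y, Index resIncr,         // result vector with arbitrary increment
                                  Scalar alpha)
{
  general_matrix_vector_product<Index,Scalar,RowMajor,false,Scalar,false,0>
    ::run(rows, cols, A, lhsStride, x, /*rhsIncr=*/1, y, resIncr, alpha);
}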

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

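  // Note on the helper macro below (added for clarity): _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2)
  // loads one packet of the rhs and folds it into the four per-row packet accumulators
  // ptmp0..ptmp3. As in the column-major kernel, the arguments pick the load flavour for
  // each lhs row via token pasting: 'd' expands to pload (aligned) and 'du' to ploadu
  // (unaligned); A0 applies to lhs0, A2 to lhs2, and A13 to both lhs1 and lhs3.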
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
//  const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;

  // How many coeffs of the rhs do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find out how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                          || LhsPacketSize==1
                          || (skipRows + rowsAtOnce >= rows)
                          || LhsPacketSize > depth
                          || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generate good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,         *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride,     *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlapping the desired unaligned packet. This is *much* more efficient
               * than basic unaligned loads.
               */
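              // Concrete illustration (added for clarity), assuming 4-wide float packets and
              // alignmentStep==1: lhs1 starts one scalar past an aligned boundary, so the
              // unaligned packet lhs1[j..j+3] is rebuilt by palign<1>() from the two aligned
              // packets loaded at lhs1[j-1] and lhs1[j-1+LhsPacketSize]; lhs2 and lhs3 use
              // shifts of 2 and 3 in the same way.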
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
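        // Fold the packet accumulators into the scalar accumulators: predux() performs a
        // horizontal sum of each packet's lanes (note added for clarity).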
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr]           += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr]       += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
autogenerated on Mon Jun 10 2019 15:40:50