GeneralMatrixVector.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {
enum GEMVPacketSizeType {
  GEMVPacketFull = 0,
  GEMVPacketHalf,
  GEMVPacketQuarter
};

template <int N, typename T1, typename T2, typename T3>
struct gemv_packet_cond { typedef T3 type; };

template <typename T1, typename T2, typename T3>
struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };

template <typename T1, typename T2, typename T3>
struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
class gemv_traits
{
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
  typedef typename gemv_packet_cond<packet_size, \
                                    typename packet_traits<name ## Scalar>::type, \
                                    typename packet_traits<name ## Scalar>::half, \
                                    typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  prefix ## name ## Packet

  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
#undef PACKET_DECL_COND_PREFIX

public:
  enum {
    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
                   unpacket_traits<_RhsPacket>::vectorizable &&
                   int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
  };

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
};
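
/* Example (added, illustrative; not part of the original file): the three
 * granularities resolve to a full, half, and quarter SIMD packet for the
 * selected ISA; e.g. on an AVX512 build one would expect Packet16f / Packet8f /
 * Packet4f for float. A minimal compile-time sketch of this invariant:
 * \code
 * #include <Eigen/Dense>
 * typedef Eigen::internal::gemv_traits<float,float> FullT;
 * typedef Eigen::internal::gemv_traits<float,float,Eigen::internal::GEMVPacketHalf> HalfT;
 * typedef Eigen::internal::gemv_traits<float,float,Eigen::internal::GEMVPacketQuarter> QuarterT;
 * // Packet sizes can only shrink (or stay equal) across full/half/quarter.
 * static_assert(int(QuarterT::ResPacketSize) <= int(HalfT::ResPacketSize) &&
 *               int(HalfT::ResPacketSize)    <= int(FullT::ResPacketSize),
 *               "packet sizes must be non-increasing across full/half/quarter");
 * \endcode
 */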

/* Optimized col-major matrix * vector product:
 * This algorithm processes the matrix per vertical panels,
 * which are then processed horizontally, in chunks of 8*PacketSize x 1 vertical segments.
 *
 * Mixing type logic: C += alpha * A * B
 * |  A  |  B  |alpha| comments
 * |real |cplx |cplx | no vectorization
 * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 * |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 *
 * The same reasoning applies to the transposed case.
 */
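/* Illustrative usage (added; relies only on the public Eigen API): for a large
 * column-major lhs, a plain GEMV expression is ultimately routed to the
 * specialization below.
 * \code
 * #include <Eigen/Dense>
 * Eigen::MatrixXf A(1024, 256);   // column-major by default
 * Eigen::VectorXf x(256), y(1024);
 * A.setRandom(); x.setRandom(); y.setZero();
 * float alpha = 2.0f;
 * y.noalias() += alpha * A * x;   // dispatches to the ColMajor kernel
 * \endcode
 */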
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;

  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;

  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
  typedef typename HalfTraits::ResPacket ResPacketHalf;

  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsMapper& lhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    RhsScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
    Index rows, Index cols,
    const LhsMapper& alhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr);
  eigen_internal_assert(resIncr==1);

  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
  // This helps GCC generate proper code.
  LhsMapper lhs(alhs);

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;

  const Index lhsStride = lhs.stride();
  // TODO: for padded aligned inputs, we could enable aligned reads
  enum { LhsAlignment = Unaligned,
         ResPacketSize = Traits::ResPacketSize,
         ResPacketSizeHalf = HalfTraits::ResPacketSize,
         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
         LhsPacketSize = Traits::LhsPacketSize,
         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
  };

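  // n8..n_quarter are exclusive upper bounds on the row index i for which a strip
  // of 8/4/3/2/1 full packets (or one half/quarter packet) still fits in [i, rows).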
  const Index n8 = rows-8*ResPacketSize+1;
  const Index n4 = rows-4*ResPacketSize+1;
  const Index n3 = rows-3*ResPacketSize+1;
  const Index n2 = rows-2*ResPacketSize+1;
  const Index n1 = rows-1*ResPacketSize+1;
  const Index n_half = rows-1*ResPacketSizeHalf+1;
  const Index n_quarter = rows-1*ResPacketSizeQuarter+1;

  // TODO: improve the following heuristic:
  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
  ResPacket palpha = pset1<ResPacket>(alpha);
  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);

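  // Process the matrix by vertical panels of at most block_cols columns; within a
  // panel, rows are covered by strips of 8/4/3/2/1 full packets, then a half
  // packet, a quarter packet, and finally scalars.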
  for(Index j2=0; j2<cols; j2+=block_cols)
  {
    Index jend = numext::mini(j2+block_cols,cols);
    Index i=0;
    for(; i<n8; i+=ResPacketSize*8)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0)),
                c3 = pset1<ResPacket>(ResScalar(0)),
                c4 = pset1<ResPacket>(ResScalar(0)),
                c5 = pset1<ResPacket>(ResScalar(0)),
                c6 = pset1<ResPacket>(ResScalar(0)),
                c7 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
        c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
        c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
        c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
        c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
      pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
      pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
      pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
      pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
    }
    if(i<n4)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0)),
                c3 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));

      i+=ResPacketSize*4;
    }
    if(i<n3)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));

      i+=ResPacketSize*3;
    }
    if(i<n2)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      i+=ResPacketSize*2;
    }
    if(i<n1)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      i+=ResPacketSize;
    }
    if(HasHalf && i<n_half)
    {
      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
      i+=ResPacketSizeHalf;
    }
    if(HasQuarter && i<n_quarter)
    {
      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
      i+=ResPacketSizeQuarter;
    }
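    // Scalar tail: remaining rows that do not fill even a quarter packet.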
    for(;i<rows;++i)
    {
      ResScalar c0(0);
      for(Index j=j2; j<jend; j+=1)
        c0 += cj.pmul(lhs(i,j), rhs(j,0));
      res[i] += alpha*c0;
    }
  }
}
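
/* Reference semantics (added sketch): modulo floating-point reordering and the
 * conjugation flags, the kernel above computes
 * \code
 * for (Index i = 0; i < rows; ++i) {
 *   ResScalar acc(0);
 *   for (Index j = 0; j < cols; ++j)
 *     acc += lhs(i,j) * rhs(j,0);
 *   res[i] += alpha * acc;
 * }
 * \endcode
 * The column blocking (j2/jend) only changes the accumulation order; up to
 * rounding, the result is the same.
 */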

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which both reduces the number of
 * loads/stores of the result by a factor of 4 and reduces instruction
 * dependencies. Moreover, we know that all bands have the same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
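/* Illustrative usage (added): with a row-major lhs each output coefficient is an
 * independent dot product over one contiguous row, which is what makes the
 * predux-based horizontal reduction below natural.
 * \code
 * #include <Eigen/Dense>
 * Eigen::Matrix<float,Eigen::Dynamic,Eigen::Dynamic,Eigen::RowMajor> A(512, 512);
 * Eigen::VectorXf x(512), y(512);
 * A.setRandom(); x.setRandom(); y.setZero();
 * y.noalias() += 0.5f * A * x;   // dispatches to the RowMajor kernel
 * \endcode
 */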
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;

  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;

  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
  typedef typename HalfTraits::ResPacket ResPacketHalf;

  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsMapper& lhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    ResScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
    Index rows, Index cols,
    const LhsMapper& alhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    ResScalar alpha)
{
  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
  // This helps GCC generate proper code.
  LhsMapper lhs(alhs);

  eigen_internal_assert(rhs.stride()==1);

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;

  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
  // processing 8 rows at once might be counterproductive with respect to the cache.
  const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
  const Index n4 = rows-3;
  const Index n2 = rows-1;
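  // Setting n8 to 0 when a row's footprint exceeds ~32KB disables the
  // 8-rows-at-once path entirely, falling back to the 4-row kernel.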

  // TODO: for padded aligned inputs, we could enable aligned reads
  enum { LhsAlignment = Unaligned,
         ResPacketSize = Traits::ResPacketSize,
         ResPacketSizeHalf = HalfTraits::ResPacketSize,
         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
         LhsPacketSize = Traits::LhsPacketSize,
         LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
         LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
  };

  Index i=0;
  for(; i<n8; i+=8)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0)),
              c2 = pset1<ResPacket>(ResScalar(0)),
              c3 = pset1<ResPacket>(ResScalar(0)),
              c4 = pset1<ResPacket>(ResScalar(0)),
              c5 = pset1<ResPacket>(ResScalar(0)),
              c6 = pset1<ResPacket>(ResScalar(0)),
              c7 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
      c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
      c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
      c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
      c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
    }
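    // predux horizontally reduces each packet accumulator to a single scalar;
    // leftover columns (cols % LhsPacketSize) are folded in by the scalar loop below.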
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    ResScalar cc2 = predux(c2);
    ResScalar cc3 = predux(c3);
    ResScalar cc4 = predux(c4);
    ResScalar cc5 = predux(c5);
    ResScalar cc6 = predux(c6);
    ResScalar cc7 = predux(c7);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
      cc2 += cj.pmul(lhs(i+2,j), b0);
      cc3 += cj.pmul(lhs(i+3,j), b0);
      cc4 += cj.pmul(lhs(i+4,j), b0);
      cc5 += cj.pmul(lhs(i+5,j), b0);
      cc6 += cj.pmul(lhs(i+6,j), b0);
      cc7 += cj.pmul(lhs(i+7,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
    res[(i+2)*resIncr] += alpha*cc2;
    res[(i+3)*resIncr] += alpha*cc3;
    res[(i+4)*resIncr] += alpha*cc4;
    res[(i+5)*resIncr] += alpha*cc5;
    res[(i+6)*resIncr] += alpha*cc6;
    res[(i+7)*resIncr] += alpha*cc7;
  }
  for(; i<n4; i+=4)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0)),
              c2 = pset1<ResPacket>(ResScalar(0)),
              c3 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    ResScalar cc2 = predux(c2);
    ResScalar cc3 = predux(c3);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
      cc2 += cj.pmul(lhs(i+2,j), b0);
      cc3 += cj.pmul(lhs(i+3,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
    res[(i+2)*resIncr] += alpha*cc2;
    res[(i+3)*resIncr] += alpha*cc3;
  }
  for(; i<n2; i+=2)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
  }
  for(; i<rows; ++i)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
    }
    ResScalar cc0 = predux(c0);
    if (HasHalf) {
      for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
      {
        RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
        c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
      }
      cc0 += predux(c0_h);
    }
    if (HasQuarter) {
      for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
      {
        RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
        c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
      }
      cc0 += predux(c0_q);
    }
    for(; j<cols; ++j)
    {
      cc0 += cj.pmul(lhs(i,j), rhs(j,0));
    }
    res[i*resIncr] += alpha*cc0;
  }
}
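
// Note: unlike the col-major kernel, which asserts resIncr==1 and stores whole
// packets into res, this row-major kernel writes one scalar per row
// (res[i*resIncr]) and can therefore support a strided result.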

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H