GeneralMatrixVector.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
11 #define EIGEN_GENERAL_MATRIX_VECTOR_H
12 
13 namespace Eigen {
14 
15 namespace internal {
16 
21 };
22 
23 template <int N, typename T1, typename T2, typename T3>
24 struct gemv_packet_cond { typedef T3 type; };
25 
26 template <typename T1, typename T2, typename T3>
27 struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };
28 
29 template <typename T1, typename T2, typename T3>
30 struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
31 
32 template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
34 {
36 
37 #define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
38  typedef typename gemv_packet_cond<packet_size, \
39  typename packet_traits<name ## Scalar>::type, \
40  typename packet_traits<name ## Scalar>::half, \
41  typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
42  prefix ## name ## Packet
43 
44  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
45  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
46  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
47 #undef PACKET_DECL_COND_PREFIX
48 
49 public:
50  enum {
57  };
58 
62 };
63 
64 
65 /* Optimized col-major matrix * vector product:
66  * This algorithm processes the matrix per vertical panels,
67  * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments.
68  *
69  * Mixing type logic: C += alpha * A * B
70  * | A | B |alpha| comments
71  * |real |cplx |cplx | no vectorization
72  * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
73  * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
74  * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
75  *
76  * The same reasoning apply for the transposed case.
77  */
78 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
79 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
80 {
84 
86 
87  typedef typename Traits::LhsPacket LhsPacket;
88  typedef typename Traits::RhsPacket RhsPacket;
89  typedef typename Traits::ResPacket ResPacket;
90 
94 
98 
100  Index rows, Index cols,
101  const LhsMapper& lhs,
102  const RhsMapper& rhs,
103  ResScalar* res, Index resIncr,
104  RhsScalar alpha);
105 };
106 
107 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
109  Index rows, Index cols,
110  const LhsMapper& alhs,
111  const RhsMapper& rhs,
112  ResScalar* res, Index resIncr,
113  RhsScalar alpha)
114 {
115  EIGEN_UNUSED_VARIABLE(resIncr);
116  eigen_internal_assert(resIncr==1);
117 
118  // The following copy tells the compiler that lhs's attributes are not modified outside this function
119  // This helps GCC to generate propoer code.
120  LhsMapper lhs(alhs);
121 
126 
127  const Index lhsStride = lhs.stride();
128  // TODO: for padded aligned inputs, we could enable aligned reads
129  enum { LhsAlignment = Unaligned,
130  ResPacketSize = Traits::ResPacketSize,
131  ResPacketSizeHalf = HalfTraits::ResPacketSize,
132  ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
133  LhsPacketSize = Traits::LhsPacketSize,
134  HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
135  HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
136  };
137 
138  const Index n8 = rows-8*ResPacketSize+1;
139  const Index n4 = rows-4*ResPacketSize+1;
140  const Index n3 = rows-3*ResPacketSize+1;
141  const Index n2 = rows-2*ResPacketSize+1;
142  const Index n1 = rows-1*ResPacketSize+1;
143  const Index n_half = rows-1*ResPacketSizeHalf+1;
144  const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
145 
146  // TODO: improve the following heuristic:
147  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
148  ResPacket palpha = pset1<ResPacket>(alpha);
149  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
150  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
151 
152  for(Index j2=0; j2<cols; j2+=block_cols)
153  {
154  Index jend = numext::mini(j2+block_cols,cols);
155  Index i=0;
156  for(; i<n8; i+=ResPacketSize*8)
157  {
158  ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
159  c1 = pset1<ResPacket>(ResScalar(0)),
160  c2 = pset1<ResPacket>(ResScalar(0)),
161  c3 = pset1<ResPacket>(ResScalar(0)),
162  c4 = pset1<ResPacket>(ResScalar(0)),
163  c5 = pset1<ResPacket>(ResScalar(0)),
164  c6 = pset1<ResPacket>(ResScalar(0)),
165  c7 = pset1<ResPacket>(ResScalar(0));
166 
167  for(Index j=j2; j<jend; j+=1)
168  {
169  RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
170  c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
171  c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
172  c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
173  c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
174  c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
175  c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
176  c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
177  c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
178  }
179  pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
180  pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
181  pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
182  pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
183  pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
184  pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
185  pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
186  pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
187  }
188  if(i<n4)
189  {
190  ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
191  c1 = pset1<ResPacket>(ResScalar(0)),
192  c2 = pset1<ResPacket>(ResScalar(0)),
193  c3 = pset1<ResPacket>(ResScalar(0));
194 
195  for(Index j=j2; j<jend; j+=1)
196  {
197  RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
198  c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
199  c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
200  c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
201  c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
202  }
203  pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
204  pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
205  pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
206  pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
207 
208  i+=ResPacketSize*4;
209  }
210  if(i<n3)
211  {
212  ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
213  c1 = pset1<ResPacket>(ResScalar(0)),
214  c2 = pset1<ResPacket>(ResScalar(0));
215 
216  for(Index j=j2; j<jend; j+=1)
217  {
218  RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
219  c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
220  c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
221  c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
222  }
223  pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
224  pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
225  pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
226 
227  i+=ResPacketSize*3;
228  }
229  if(i<n2)
230  {
231  ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
232  c1 = pset1<ResPacket>(ResScalar(0));
233 
234  for(Index j=j2; j<jend; j+=1)
235  {
236  RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
237  c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
238  c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
239  }
240  pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
241  pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
242  i+=ResPacketSize*2;
243  }
244  if(i<n1)
245  {
246  ResPacket c0 = pset1<ResPacket>(ResScalar(0));
247  for(Index j=j2; j<jend; j+=1)
248  {
249  RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
250  c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
251  }
252  pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
253  i+=ResPacketSize;
254  }
255  if(HasHalf && i<n_half)
256  {
257  ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
258  for(Index j=j2; j<jend; j+=1)
259  {
260  RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
261  c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
262  }
263  pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
264  i+=ResPacketSizeHalf;
265  }
266  if(HasQuarter && i<n_quarter)
267  {
268  ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
269  for(Index j=j2; j<jend; j+=1)
270  {
271  RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
272  c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
273  }
274  pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
275  i+=ResPacketSizeQuarter;
276  }
277  for(;i<rows;++i)
278  {
279  ResScalar c0(0);
280  for(Index j=j2; j<jend; j+=1)
281  c0 += cj.pmul(lhs(i,j), rhs(j,0));
282  res[i] += alpha*c0;
283  }
284  }
285 }
286 
287 /* Optimized row-major matrix * vector product:
288  * This algorithm processes 4 rows at once that allows to both reduce
289  * the number of load/stores of the result by a factor 4 and to reduce
290  * the instruction dependency. Moreover, we know that all bands have the
291  * same alignment pattern.
292  *
293  * Mixing type logic:
294  * - alpha is always a complex (or converted to a complex)
295  * - no vectorization
296  */
297 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
298 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
299 {
303 
305 
306  typedef typename Traits::LhsPacket LhsPacket;
307  typedef typename Traits::RhsPacket RhsPacket;
308  typedef typename Traits::ResPacket ResPacket;
309 
313 
317 
319  Index rows, Index cols,
320  const LhsMapper& lhs,
321  const RhsMapper& rhs,
322  ResScalar* res, Index resIncr,
323  ResScalar alpha);
324 };
325 
326 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
328  Index rows, Index cols,
329  const LhsMapper& alhs,
330  const RhsMapper& rhs,
331  ResScalar* res, Index resIncr,
333 {
334  // The following copy tells the compiler that lhs's attributes are not modified outside this function
335  // This helps GCC to generate propoer code.
336  LhsMapper lhs(alhs);
337 
338  eigen_internal_assert(rhs.stride()==1);
343 
344  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
345  // processing 8 rows at once might be counter productive wrt cache.
346  const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
347  const Index n4 = rows-3;
348  const Index n2 = rows-1;
349 
350  // TODO: for padded aligned inputs, we could enable aligned reads
351  enum { LhsAlignment = Unaligned,
352  ResPacketSize = Traits::ResPacketSize,
353  ResPacketSizeHalf = HalfTraits::ResPacketSize,
354  ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
355  LhsPacketSize = Traits::LhsPacketSize,
356  LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
357  LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
358  HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
359  HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
360  };
361 
362  Index i=0;
363  for(; i<n8; i+=8)
364  {
365  ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
366  c1 = pset1<ResPacket>(ResScalar(0)),
367  c2 = pset1<ResPacket>(ResScalar(0)),
368  c3 = pset1<ResPacket>(ResScalar(0)),
369  c4 = pset1<ResPacket>(ResScalar(0)),
370  c5 = pset1<ResPacket>(ResScalar(0)),
371  c6 = pset1<ResPacket>(ResScalar(0)),
372  c7 = pset1<ResPacket>(ResScalar(0));
373 
374  Index j=0;
375  for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
376  {
377  RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
378 
379  c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
380  c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
381  c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
382  c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
383  c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
384  c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
385  c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
386  c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
387  }
388  ResScalar cc0 = predux(c0);
389  ResScalar cc1 = predux(c1);
390  ResScalar cc2 = predux(c2);
391  ResScalar cc3 = predux(c3);
392  ResScalar cc4 = predux(c4);
393  ResScalar cc5 = predux(c5);
394  ResScalar cc6 = predux(c6);
395  ResScalar cc7 = predux(c7);
396  for(; j<cols; ++j)
397  {
398  RhsScalar b0 = rhs(j,0);
399 
400  cc0 += cj.pmul(lhs(i+0,j), b0);
401  cc1 += cj.pmul(lhs(i+1,j), b0);
402  cc2 += cj.pmul(lhs(i+2,j), b0);
403  cc3 += cj.pmul(lhs(i+3,j), b0);
404  cc4 += cj.pmul(lhs(i+4,j), b0);
405  cc5 += cj.pmul(lhs(i+5,j), b0);
406  cc6 += cj.pmul(lhs(i+6,j), b0);
407  cc7 += cj.pmul(lhs(i+7,j), b0);
408  }
409  res[(i+0)*resIncr] += alpha*cc0;
410  res[(i+1)*resIncr] += alpha*cc1;
411  res[(i+2)*resIncr] += alpha*cc2;
412  res[(i+3)*resIncr] += alpha*cc3;
413  res[(i+4)*resIncr] += alpha*cc4;
414  res[(i+5)*resIncr] += alpha*cc5;
415  res[(i+6)*resIncr] += alpha*cc6;
416  res[(i+7)*resIncr] += alpha*cc7;
417  }
418  for(; i<n4; i+=4)
419  {
420  ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
421  c1 = pset1<ResPacket>(ResScalar(0)),
422  c2 = pset1<ResPacket>(ResScalar(0)),
423  c3 = pset1<ResPacket>(ResScalar(0));
424 
425  Index j=0;
426  for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
427  {
428  RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
429 
430  c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
431  c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
432  c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
433  c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
434  }
435  ResScalar cc0 = predux(c0);
436  ResScalar cc1 = predux(c1);
437  ResScalar cc2 = predux(c2);
438  ResScalar cc3 = predux(c3);
439  for(; j<cols; ++j)
440  {
441  RhsScalar b0 = rhs(j,0);
442 
443  cc0 += cj.pmul(lhs(i+0,j), b0);
444  cc1 += cj.pmul(lhs(i+1,j), b0);
445  cc2 += cj.pmul(lhs(i+2,j), b0);
446  cc3 += cj.pmul(lhs(i+3,j), b0);
447  }
448  res[(i+0)*resIncr] += alpha*cc0;
449  res[(i+1)*resIncr] += alpha*cc1;
450  res[(i+2)*resIncr] += alpha*cc2;
451  res[(i+3)*resIncr] += alpha*cc3;
452  }
453  for(; i<n2; i+=2)
454  {
455  ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
456  c1 = pset1<ResPacket>(ResScalar(0));
457 
458  Index j=0;
459  for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
460  {
461  RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
462 
463  c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
464  c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
465  }
466  ResScalar cc0 = predux(c0);
467  ResScalar cc1 = predux(c1);
468  for(; j<cols; ++j)
469  {
470  RhsScalar b0 = rhs(j,0);
471 
472  cc0 += cj.pmul(lhs(i+0,j), b0);
473  cc1 += cj.pmul(lhs(i+1,j), b0);
474  }
475  res[(i+0)*resIncr] += alpha*cc0;
476  res[(i+1)*resIncr] += alpha*cc1;
477  }
478  for(; i<rows; ++i)
479  {
480  ResPacket c0 = pset1<ResPacket>(ResScalar(0));
481  ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
482  ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
483  Index j=0;
484  for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
485  {
486  RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
487  c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
488  }
489  ResScalar cc0 = predux(c0);
490  if (HasHalf) {
491  for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
492  {
493  RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
494  c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
495  }
496  cc0 += predux(c0_h);
497  }
498  if (HasQuarter) {
499  for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
500  {
501  RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
502  c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
503  }
504  cc0 += predux(c0_q);
505  }
506  for(; j<cols; ++j)
507  {
508  cc0 += cj.pmul(lhs(i,j), rhs(j,0));
509  }
510  res[i*resIncr] += alpha*cc0;
511  }
512 }
513 
514 } // end namespace internal
515 
516 } // end namespace Eigen
517 
518 #endif // EIGEN_GENERAL_MATRIX_VECTOR_H
gtsam.examples.DogLegOptimizerExample.int
int
Definition: DogLegOptimizerExample.py:111
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::LhsPacketQuarter
QuarterTraits::LhsPacket LhsPacketQuarter
Definition: GeneralMatrixVector.h:314
Eigen::internal::Lhs
@ Lhs
Definition: TensorContractionMapper.h:19
Eigen::internal::GEMVPacketSizeType
GEMVPacketSizeType
Definition: GeneralMatrixVector.h:17
EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
Eigen
Namespace containing all symbols from the Eigen library.
Definition: jet.h:637
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::LhsPacketQuarter
QuarterTraits::LhsPacket LhsPacketQuarter
Definition: GeneralMatrixVector.h:95
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::LhsPacketHalf
HalfTraits::LhsPacket LhsPacketHalf
Definition: GeneralMatrixVector.h:310
Eigen::internal::pstoreu
EIGEN_DEVICE_FUNC void pstoreu(Scalar *to, const Packet &from)
Definition: GenericPacketMath.h:700
Eigen::internal::gemv_traits::ResScalar
ScalarBinaryOpTraits< LhsScalar, RhsScalar >::ReturnType ResScalar
Definition: GeneralMatrixVector.h:35
benchmark.n1
n1
Definition: benchmark.py:77
alpha
RealScalar alpha
Definition: level1_cplx_impl.h:147
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::ResPacketQuarter
QuarterTraits::ResPacket ResPacketQuarter
Definition: GeneralMatrixVector.h:97
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::LhsPacket
Traits::LhsPacket LhsPacket
Definition: GeneralMatrixVector.h:306
Eigen::internal::general_matrix_vector_product
Definition: BlasUtil.h:40
Eigen::Unaligned
@ Unaligned
Definition: Constants.h:233
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::LhsPacketHalf
HalfTraits::LhsPacket LhsPacketHalf
Definition: GeneralMatrixVector.h:91
Eigen::internal::gemv_packet_cond
Definition: GeneralMatrixVector.h:24
Eigen::internal::gemv_traits::RhsPacket
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
Definition: GeneralMatrixVector.h:60
T3
static const Pose3 T3(Rot3::Rodrigues(-90, 0, 0), Point3(1, 2, 3))
Eigen::ScalarBinaryOpTraits
Determines whether the given binary operation of two numeric types is allowed and what the scalar ret...
Definition: XprHelper.h:801
Eigen::RowMajor
@ RowMajor
Definition: Constants.h:321
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::RhsPacketQuarter
QuarterTraits::RhsPacket RhsPacketQuarter
Definition: GeneralMatrixVector.h:315
Eigen::internal::conj_helper::pmul
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType &x, const RhsType &y) const
Definition: ConjHelper.h:71
res
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition: PartialRedux_count.cpp:3
rows
int rows
Definition: Tutorial_commainit_02.cpp:1
eigen_internal_assert
#define eigen_internal_assert(x)
Definition: Macros.h:1043
Eigen::internal::gemv_packet_cond< GEMVPacketHalf, T1, T2, T3 >::type
T2 type
Definition: GeneralMatrixVector.h:30
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::RhsPacketQuarter
QuarterTraits::RhsPacket RhsPacketQuarter
Definition: GeneralMatrixVector.h:96
c1
static double c1
Definition: airy.c:54
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::RhsPacket
Traits::RhsPacket RhsPacket
Definition: GeneralMatrixVector.h:307
Eigen::internal::gemv_traits::ResPacket
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
Definition: GeneralMatrixVector.h:61
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::ResPacketHalf
HalfTraits::ResPacket ResPacketHalf
Definition: GeneralMatrixVector.h:312
Eigen::internal::true_type
Definition: Meta.h:96
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::ResPacket
Traits::ResPacket ResPacket
Definition: GeneralMatrixVector.h:89
benchmark.n2
n2
Definition: benchmark.py:83
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::Traits
gemv_traits< LhsScalar, RhsScalar > Traits
Definition: GeneralMatrixVector.h:81
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::RhsPacketHalf
HalfTraits::RhsPacket RhsPacketHalf
Definition: GeneralMatrixVector.h:92
EIGEN_UNUSED_VARIABLE
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:1076
j
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::HalfTraits
gemv_traits< LhsScalar, RhsScalar, GEMVPacketHalf > HalfTraits
Definition: GeneralMatrixVector.h:82
Eigen::internal::gemv_traits
Definition: GeneralMatrixVector.h:33
Eigen::internal::gemv_traits::PACKET_DECL_COND_PREFIX
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize)
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::LhsPacket
Traits::LhsPacket LhsPacket
Definition: GeneralMatrixVector.h:87
Eigen::internal::gemv_traits::ResPacketSize
@ ResPacketSize
Definition: GeneralMatrixVector.h:56
Eigen::numext::mini
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition: Eigen/src/Core/MathFunctions.h:1085
Eigen::internal::unpacket_traits
Definition: GenericPacketMath.h:132
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::ResScalar
ScalarBinaryOpTraits< LhsScalar, RhsScalar >::ReturnType ResScalar
Definition: GeneralMatrixVector.h:304
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::RhsPacket
Traits::RhsPacket RhsPacket
Definition: GeneralMatrixVector.h:88
Eigen::internal::GEMVPacketHalf
@ GEMVPacketHalf
Definition: GeneralMatrixVector.h:19
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::ResPacketQuarter
QuarterTraits::ResPacket ResPacketQuarter
Definition: GeneralMatrixVector.h:316
gtsam.examples.DogLegOptimizerExample.run
def run(args)
Definition: DogLegOptimizerExample.py:21
T2
static const Pose3 T2(Rot3::Rodrigues(0.3, 0.2, 0.1), P2)
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::RhsPacketHalf
HalfTraits::RhsPacket RhsPacketHalf
Definition: GeneralMatrixVector.h:311
Eigen::internal::GEMVPacketQuarter
@ GEMVPacketQuarter
Definition: GeneralMatrixVector.h:20
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::HalfTraits
gemv_traits< LhsScalar, RhsScalar, GEMVPacketHalf > HalfTraits
Definition: GeneralMatrixVector.h:301
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::ResPacketHalf
HalfTraits::ResPacket ResPacketHalf
Definition: GeneralMatrixVector.h:93
Eigen::internal::gemv_traits::LhsPacketSize
@ LhsPacketSize
Definition: GeneralMatrixVector.h:54
Eigen::internal::predux
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux(const Packet &a)
Definition: GenericPacketMath.h:875
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::QuarterTraits
gemv_traits< LhsScalar, RhsScalar, GEMVPacketQuarter > QuarterTraits
Definition: GeneralMatrixVector.h:302
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::ResScalar
ScalarBinaryOpTraits< LhsScalar, RhsScalar >::ReturnType ResScalar
Definition: GeneralMatrixVector.h:85
Eigen::internal::conj_helper
Definition: ConjHelper.h:63
Eigen::internal::Rhs
@ Rhs
Definition: TensorContractionMapper.h:18
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::Traits
gemv_traits< LhsScalar, RhsScalar > Traits
Definition: GeneralMatrixVector.h:300
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::ResPacket
Traits::ResPacket ResPacket
Definition: GeneralMatrixVector.h:308
Eigen::internal::general_matrix_vector_product< Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version >::QuarterTraits
gemv_traits< LhsScalar, RhsScalar, GEMVPacketQuarter > QuarterTraits
Definition: GeneralMatrixVector.h:83
Eigen::internal::pmadd
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: AltiVec/PacketMath.h:827
Eigen::internal::gemv_packet_cond::type
T3 type
Definition: GeneralMatrixVector.h:24
Eigen::internal::conj_helper::pmadd
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsType &x, const RhsType &y, const ResultType &c) const
Definition: ConjHelper.h:67
c2
static double c2
Definition: airy.c:55
internal
Definition: BandTriangularSolver.h:13
Eigen::ColMajor
@ ColMajor
Definition: Constants.h:319
cols
int cols
Definition: Tutorial_commainit_02.cpp:1
T1
static const Similarity3 T1(R, Point3(3.5, -8.2, 4.2), 1)
palpha
int RealScalar * palpha
Definition: level1_cplx_impl.h:142
Eigen::internal::gemv_traits::LhsPacket
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
Definition: GeneralMatrixVector.h:59
_
constexpr descr< N - 1 > _(char const (&text)[N])
Definition: descr.h:109
Eigen::internal::gemv_traits::Vectorizable
@ Vectorizable
Definition: GeneralMatrixVector.h:51
Eigen::internal::gemv_packet_cond< GEMVPacketFull, T1, T2, T3 >::type
T1 type
Definition: GeneralMatrixVector.h:27
i
int i
Definition: BiCGSTAB_step_by_step.cpp:9
Eigen::Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
Eigen::internal::GEMVPacketFull
@ GEMVPacketFull
Definition: GeneralMatrixVector.h:18
EIGEN_DONT_INLINE
#define EIGEN_DONT_INLINE
Definition: Macros.h:940
Eigen::internal::gemv_traits::RhsPacketSize
@ RhsPacketSize
Definition: GeneralMatrixVector.h:55


gtsam
Author(s):
autogenerated on Thu Jun 13 2024 03:02:25