GeneralBlockPanelKernel.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11 #define EIGEN_GENERAL_BLOCK_PANEL_H
12 
13 
14 namespace Eigen {
15 
16 namespace internal {
17 
18 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
19 class gebp_traits;
20 
21 
22 /** \internal \returns b if a<=0, and returns a otherwise. */
23 inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
24 {
25  return a<=0 ? b : a;
26 }
27 
28 #if EIGEN_ARCH_i386_OR_x86_64
29 const std::ptrdiff_t defaultL1CacheSize = 32*1024;
30 const std::ptrdiff_t defaultL2CacheSize = 256*1024;
31 const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
32 #else
33 const std::ptrdiff_t defaultL1CacheSize = 16*1024;
34 const std::ptrdiff_t defaultL2CacheSize = 512*1024;
35 const std::ptrdiff_t defaultL3CacheSize = 512*1024;
36 #endif
37 
39 struct CacheSizes {
40  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
41  int l1CacheSize, l2CacheSize, l3CacheSize;
42  queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
43  m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
44  m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
45  m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
46  }
47 
48  std::ptrdiff_t m_l1;
49  std::ptrdiff_t m_l2;
50  std::ptrdiff_t m_l3;
51 };
52 
53 
55 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
56 {
57  static CacheSizes m_cacheSizes;
58 
59  if(action==SetAction)
60  {
61  // set the cpu cache sizes, from which all block sizes are derived (given in bytes)
62  eigen_internal_assert(l1!=0 && l2!=0);
63  m_cacheSizes.m_l1 = *l1;
64  m_cacheSizes.m_l2 = *l2;
65  m_cacheSizes.m_l3 = *l3;
66  }
67  else if(action==GetAction)
68  {
69  eigen_internal_assert(l1!=0 && l2!=0);
70  *l1 = m_cacheSizes.m_l1;
71  *l2 = m_cacheSizes.m_l2;
72  *l3 = m_cacheSizes.m_l3;
73  }
74  else
75  {
76  eigen_internal_assert(false);
77  }
78 }
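// A minimal usage sketch (illustrative only, values are hypothetical): reading back the
// cache sizes that manage_caching_sizes() caches on first use. The result is either what
// queryCacheSizes() reported or the conservative defaults defined above.
//
//   std::ptrdiff_t l1, l2, l3;
//   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
//   // l1, l2 and l3 now hold the per-level cache sizes in bytes,
//   // e.g. l1 == 32*1024 on x86-64 when the runtime query fails.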
79 
80 /* Helper for computeProductBlockingSizes.
81  *
82  * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
83  * this function computes the blocking size parameters along the respective dimensions
84  * for matrix products and related algorithms. The blocking sizes depend on various
85  * parameters:
86  * - the L1 and L2 cache sizes,
87  * - the register level blocking sizes defined by gebp_traits,
88  * - the number of scalars that fit into a packet (when vectorization is enabled).
89  *
90  * \sa setCpuCacheSizes */
91 
92 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
93 void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
94 {
95  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
96 
97  // Explanations:
98  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
99  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
100  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
101  // at the register level. This small horizontal panel has to stay within L1 cache.
102  std::ptrdiff_t l1, l2, l3;
103  manage_caching_sizes(GetAction, &l1, &l2, &l3);
104 
105  if (num_threads > 1) {
106  typedef typename Traits::ResScalar ResScalar;
107  enum {
108  kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
109  ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
110  kr = 8,
111  mr = Traits::mr,
112  nr = Traits::nr
113  };
114  // Increasing k gives us more time to prefetch the content of the "C"
115  // registers. However once the latency is hidden there is no point in
116  // increasing the value of k, so we'll cap it at 320 (value determined
117  // experimentally).
118  const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
119  if (k_cache < k) {
120  k = k_cache - (k_cache % kr);
121  eigen_internal_assert(k > 0);
122  }
123 
124  const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
125  const Index n_per_thread = numext::div_ceil(n, num_threads);
126  if (n_cache <= n_per_thread) {
127  // Don't exceed the capacity of the l2 cache.
128  eigen_internal_assert(n_cache >= static_cast<Index>(nr));
129  n = n_cache - (n_cache % nr);
130  eigen_internal_assert(n > 0);
131  } else {
132  n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
133  }
134 
135  if (l3 > l2) {
136  // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
137  const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
138  const Index m_per_thread = numext::div_ceil(m, num_threads);
139  if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
140  m = m_cache - (m_cache % mr);
141  eigen_internal_assert(m > 0);
142  } else {
143  m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
144  }
145  }
146  }
147  else {
148  // In unit tests we do not want to use extra large matrices,
149  // so we reduce the cache size to check the blocking strategy is not flawed
150 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
151  l1 = 9*1024;
152  l2 = 32*1024;
153  l3 = 512*1024;
154 #endif
155 
156  // Early return for small problems because the computations below are time consuming.
157  // Perhaps it would make more sense to consider k*n*m??
158  // Note that for very tiny problems, this function should be bypassed anyway
159  // because we use the coefficient-based implementation for them.
160  if((numext::maxi)(k,(numext::maxi)(m,n))<48)
161  return;
162 
163  typedef typename Traits::ResScalar ResScalar;
164  enum {
165  k_peeling = 8,
166  k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
167  k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
168  };
169 
170  // ---- 1st level of blocking on L1, yields kc ----
171 
172  // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
173  // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within the L1 cache.
174  // We also include a register-level block of the result (mr x nr).
175  // (In an ideal world only the lhs panel would stay in L1)
176  // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
177  const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
178  const Index old_k = k;
179  if(k>max_kc)
180  {
181  // We are really blocking on the third dimension:
182  // -> reduce blocking size to make sure the last block is as large as possible
183  // while keeping the same number of sweeps over the result.
184  k = (k%max_kc)==0 ? max_kc
185  : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
186 
187  eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
188  }
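 // Worked example (a sketch, assuming float scalars with AVX+FMA so that Traits::mr==24,
 // Traits::nr==4, KcFactor==1 and l1==32*1024; actual values depend on the target):
 //   k_sub  = 24*4*sizeof(float)       = 384
 //   k_div  = 1*(24+4)*sizeof(float)   = 112
 //   max_kc = ((32768-384)/112) & ~7   = 289 & ~7 = 288
 // i.e. the depth of the packed panels would be capped at 288 for such a configuration.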
189 
190  // ---- 2nd level of blocking on max(L2,L3), yields nc ----
191 
192  // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
193  // actual_l2 = max(l2, l3/nb_core_sharing_l3)
194  // The number below is quite conservative: it is better to underestimate the cache size than to overestimate it.
195  // For instance, it corresponds to 6MB of L3 shared among 4 cores.
196  #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
197  const Index actual_l2 = l3;
198  #else
199  const Index actual_l2 = 1572864; // == 1.5 MB
200  #endif
201 
202  // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
203  // The second half is implicitly reserved for accessing the result and lhs coefficients.
204  // When k<max_kc, nc can grow arbitrarily. In practice, it seems fruitful
205  // to limit this growth: we bound nc to grow by a factor of 1.5.
206  // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
207  // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
208  Index max_nc;
209  const Index lhs_bytes = m * k * sizeof(LhsScalar);
210  const Index remaining_l1 = l1- k_sub - lhs_bytes;
211  if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
212  {
213  // L1 blocking
214  max_nc = remaining_l1 / (k*sizeof(RhsScalar));
215  }
216  else
217  {
218  // L2 blocking
219  max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
220  }
221  // WARNING Below, we assume that Traits::nr is a power of two.
222  Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
223  if(n>nc)
224  {
225  // We are really blocking over the columns:
226  // -> reduce blocking size to make sure the last block is as large as possible
227  // while keeping the same number of sweeps over the packed lhs.
228  // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
229  n = (n%nc)==0 ? nc
230  : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
231  }
232  else if(old_k==k)
233  {
234  // So far, no blocking at all, i.e., kc==k, and nc==n.
235  // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
236  // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
237  Index problem_size = k*n*sizeof(LhsScalar);
238  Index actual_lm = actual_l2;
239  Index max_mc = m;
240  if(problem_size<=1024)
241  {
242  // problem is small enough to keep in L1
243  // Let's choose m such that the lhs's block fits in 1/3 of L1
244  actual_lm = l1;
245  }
246  else if(l3!=0 && problem_size<=32768)
247  {
248  // we have both L2 and L3, and problem is small enough to be kept in L2
249  // Let's choose m such that the lhs's block fits in 1/3 of L2
250  actual_lm = l2;
251  max_mc = (numext::mini<Index>)(576,max_mc);
252  }
253  Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
254  if (mc > Traits::mr) mc -= mc % Traits::mr;
255  else if (mc==0) return;
256  m = (m%mc)==0 ? mc
257  : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
258  }
259  }
260 }
261 
262 template <typename Index>
263 inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
264 {
265 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
266  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
267  k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
268  m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
269  n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
270  return true;
271  }
272 #else
273  EIGEN_UNUSED_VARIABLE(k)
274  EIGEN_UNUSED_VARIABLE(m)
275  EIGEN_UNUSED_VARIABLE(n)
276 #endif
277  return false;
278 }
279 
296 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
297 void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
298 {
299  if (!useSpecificBlockingSizes(k, m, n)) {
300  evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
301  }
302 }
303 
304 template<typename LhsScalar, typename RhsScalar, typename Index>
305 inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
306 {
307  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
308 }
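// Usage sketch (sizes are hypothetical): shrinking the blocking parameters for a
// 1000 x 2000 times 2000 x 500 float product before packing the operands.
//
//   Eigen::Index kc = 2000, mc = 1000, nc = 500;
//   internal::computeProductBlockingSizes<float,float>(kc, mc, nc, Eigen::Index(1));
//   // kc, mc and nc are reduced in place to cache-friendly values: the packed lhs
//   // panel is mc x kc and the packed rhs panel is kc x nc.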
309 
310 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
311  #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
312 #else
313 
314  // FIXME (a bit overkill maybe ?)
315 
316  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
317  EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
318  {
319  c = cj.pmadd(a,b,c);
320  }
321  };
322 
323  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
324  EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
325  {
326  t = b; t = cj.pmul(a,t); c = padd(c,t);
327  }
328  };
329 
330  template<typename CJ, typename A, typename B, typename C, typename T>
331  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
332  {
333  gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
334  }
335 
336  #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
337 // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
338 #endif
339 
340 /* Vectorization logic
341  * real*real: unpack rhs to constant packets, ...
342  *
343  * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
344  * storing each res packet into two packets (2x2),
345  * at the end combine them: swap the second and addsub them
346  * cf*cf : same but with 2x4 blocks
347  * cplx*real : unpack rhs to constant packets, ...
348  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
349  */
350 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
351 class gebp_traits
352 {
353 public:
354  typedef _LhsScalar LhsScalar;
355  typedef _RhsScalar RhsScalar;
356  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
357 
358  enum {
359  ConjLhs = _ConjLhs,
360  ConjRhs = _ConjRhs,
361  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
362  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
363  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
364  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
365 
366  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
367 
368  // register block size along the N direction must be 1 or 4
369  nr = 4,
370 
371  // register block size along the M direction (currently, this one cannot be modified)
372  default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
373 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
374  // we assume 16 registers
375  // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
376  // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
377  mr = Vectorizable ? 3*LhsPacketSize : default_mr,
378 #else
379  mr = default_mr,
380 #endif
381 
382  LhsProgress = LhsPacketSize,
383  RhsProgress = 1
384  };
385 
386  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
387  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
388  typedef typename packet_traits<ResScalar>::type _ResPacket;
389 
390  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
391  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
392  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
393 
394  typedef ResPacket AccPacket;
395 
396  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
397  {
398  p = pset1<ResPacket>(ResScalar(0));
399  }
400 
401  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
402  {
403  pbroadcast4(b, b0, b1, b2, b3);
404  }
405 
406 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
407 // {
408 // pbroadcast2(b, b0, b1);
409 // }
410 
411  template<typename RhsPacketType>
412  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
413  {
414  dest = pset1<RhsPacketType>(*b);
415  }
416 
417  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
418  {
419  dest = ploadquad<RhsPacket>(b);
420  }
421 
422  template<typename LhsPacketType>
423  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
424  {
425  dest = pload<LhsPacketType>(a);
426  }
427 
428  template<typename LhsPacketType>
429  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
430  {
431  dest = ploadu<LhsPacketType>(a);
432  }
433 
434  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
435  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
436  {
437  conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
438  // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
439  // let gcc allocate the register in which to store the result of the pmul
440  // (in the case where there is no FMA) gcc fails to figure out how to avoid
441  // spilling registers.
442 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
443  EIGEN_UNUSED_VARIABLE(tmp);
444  c = cj.pmadd(a,b,c);
445 #else
446  tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
447 #endif
448  }
449 
450  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
451  {
452  r = pmadd(c,alpha,r);
453  }
454 
455  template<typename ResPacketHalf>
456  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
457  {
458  r = pmadd(c,alpha,r);
459  }
460 
461 };
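// Sketch of how the kernel below drives these traits for a single k step of a
// one-packet x one-column micro tile (a simplification of the real micro kernels;
// blA, blB, alphav and R0 are assumed to be set up by the caller):
//
//   gebp_traits<float,float> traits;
//   gebp_traits<float,float>::LhsPacket A0;
//   gebp_traits<float,float>::RhsPacket B0;
//   gebp_traits<float,float>::AccPacket C0;
//   traits.initAcc(C0);            // C0 <- 0
//   traits.loadLhs(blA, A0);       // one aligned packet of packed lhs coefficients
//   traits.loadRhs(blB, B0);       // one rhs coefficient broadcast into a packet
//   traits.madd(A0, B0, C0, B0);   // C0 += A0 * B0
//   traits.acc(C0, alphav, R0);    // R0 += alpha * C0, R0 is then stored back to res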
462 
463 template<typename RealScalar, bool _ConjLhs>
464 class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
465 {
466 public:
467  typedef std::complex<RealScalar> LhsScalar;
468  typedef RealScalar RhsScalar;
469  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
470 
471  enum {
472  ConjLhs = _ConjLhs,
473  ConjRhs = false,
474  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
475  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
476  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
477  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
478 
479  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
480  nr = 4,
481 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
482  // we assume 16 registers
483  mr = 3*LhsPacketSize,
484 #else
485  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
486 #endif
487 
488  LhsProgress = LhsPacketSize,
489  RhsProgress = 1
490  };
491 
492  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
493  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
494  typedef typename packet_traits<ResScalar>::type _ResPacket;
495 
496  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
497  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
498  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
499 
500  typedef ResPacket AccPacket;
501 
502  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
503  {
504  p = pset1<ResPacket>(ResScalar(0));
505  }
506 
507  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
508  {
509  dest = pset1<RhsPacket>(*b);
510  }
511 
512  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
513  {
514  dest = pset1<RhsPacket>(*b);
515  }
516 
517  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
518  {
519  dest = pload<LhsPacket>(a);
520  }
521 
522  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
523  {
524  dest = ploadu<LhsPacket>(a);
525  }
526 
527  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
528  {
529  pbroadcast4(b, b0, b1, b2, b3);
530  }
531 
532 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
533 // {
534 // pbroadcast2(b, b0, b1);
535 // }
536 
537  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
538  {
539  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
540  }
541 
542  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
543  {
544 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
545  EIGEN_UNUSED_VARIABLE(tmp);
546  c.v = pmadd(a.v,b,c.v);
547 #else
548  tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
549 #endif
550  }
551 
552  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
553  {
554  c += a * b;
555  }
556 
557  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
558  {
559  r = cj.pmadd(c,alpha,r);
560  }
561 
562 protected:
563  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
564 };
565 
566 template<typename Packet>
567 struct DoublePacket
568 {
569  Packet first;
570  Packet second;
571 };
572 
573 template<typename Packet>
574 DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
575 {
576  DoublePacket<Packet> res;
577  res.first = padd(a.first, b.first);
578  res.second = padd(a.second,b.second);
579  return res;
580 }
581 
582 template<typename Packet>
583 const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
584 {
585  return a;
586 }
587 
588 template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
589 // template<typename Packet>
590 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
591 // {
592 // DoublePacket<Packet> res;
593 // res.first = padd(a.first, b.first);
594 // res.second = padd(a.second,b.second);
595 // return res;
596 // }
597 
598 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
599 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
600 {
601 public:
602  typedef std::complex<RealScalar> Scalar;
603  typedef std::complex<RealScalar> LhsScalar;
604  typedef std::complex<RealScalar> RhsScalar;
605  typedef std::complex<RealScalar> ResScalar;
606 
607  enum {
608  ConjLhs = _ConjLhs,
609  ConjRhs = _ConjRhs,
610  Vectorizable = packet_traits<RealScalar>::Vectorizable
611  && packet_traits<Scalar>::Vectorizable,
612  RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
613  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
614  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
615  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
616 
617  // FIXME: should depend on NumberOfRegisters
618  nr = 4,
619  mr = ResPacketSize,
620 
621  LhsProgress = ResPacketSize,
622  RhsProgress = 1
623  };
624 
625  typedef typename packet_traits<RealScalar>::type RealPacket;
626  typedef typename packet_traits<Scalar>::type ScalarPacket;
627  typedef DoublePacket<RealPacket> DoublePacketType;
628 
629  typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
630  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
631  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
632  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
633 
634  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
635 
636  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
637  {
638  p.first = pset1<RealPacket>(RealScalar(0));
639  p.second = pset1<RealPacket>(RealScalar(0));
640  }
641 
642  // Scalar path
643  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
644  {
645  dest = pset1<ResPacket>(*b);
646  }
647 
648  // Vectorized path
649  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
650  {
651  dest.first = pset1<RealPacket>(real(*b));
652  dest.second = pset1<RealPacket>(imag(*b));
653  }
654 
655  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
656  {
657  loadRhs(b,dest);
658  }
659  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
660  {
662  loadRhs(b,dest);
663  }
664 
665  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
666  {
667  // FIXME not sure that's the best way to implement it!
668  loadRhs(b+0, b0);
669  loadRhs(b+1, b1);
670  loadRhs(b+2, b2);
671  loadRhs(b+3, b3);
672  }
673 
674  // Vectorized path
675  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
676  {
677  // FIXME not sure that's the best way to implement it!
678  loadRhs(b+0, b0);
679  loadRhs(b+1, b1);
680  }
681 
682  // Scalar path
683  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
684  {
685  // FIXME not sure that's the best way to implement it!
686  loadRhs(b+0, b0);
687  loadRhs(b+1, b1);
688  }
689 
690  // nothing special here
691  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
692  {
693  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
694  }
695 
696  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
697  {
698  dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
699  }
700 
701  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
702  {
703  c.first = padd(pmul(a,b.first), c.first);
704  c.second = padd(pmul(a,b.second),c.second);
705  }
706 
707  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
708  {
709  c = cj.pmadd(a,b,c);
710  }
711 
712  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
713 
714  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
715  {
716  // assemble c: c.first holds (a_r*b_r, a_i*b_r) and c.second holds (a_r*b_i, a_i*b_i); combine them by flipping (and conjugating as needed) c.second and adding/subtracting it to/from c.first, depending on ConjLhs/ConjRhs.
717  ResPacket tmp;
718  if((!ConjLhs)&&(!ConjRhs))
719  {
720  tmp = pcplxflip(pconj(ResPacket(c.second)));
721  tmp = padd(ResPacket(c.first),tmp);
722  }
723  else if((!ConjLhs)&&(ConjRhs))
724  {
725  tmp = pconj(pcplxflip(ResPacket(c.second)));
726  tmp = padd(ResPacket(c.first),tmp);
727  }
728  else if((ConjLhs)&&(!ConjRhs))
729  {
730  tmp = pcplxflip(ResPacket(c.second));
731  tmp = padd(pconj(ResPacket(c.first)),tmp);
732  }
733  else if((ConjLhs)&&(ConjRhs))
734  {
735  tmp = pcplxflip(ResPacket(c.second));
736  tmp = psub(pconj(ResPacket(c.first)),tmp);
737  }
738 
739  r = pmadd(tmp,alpha,r);
740  }
741 
742 protected:
743  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
744 };
745 
746 template<typename RealScalar, bool _ConjRhs>
747 class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
748 {
749 public:
750  typedef std::complex<RealScalar> Scalar;
751  typedef RealScalar LhsScalar;
752  typedef Scalar RhsScalar;
753  typedef Scalar ResScalar;
754 
755  enum {
756  ConjLhs = false,
757  ConjRhs = _ConjRhs,
758  Vectorizable = packet_traits<RealScalar>::Vectorizable
759  && packet_traits<Scalar>::Vectorizable,
760  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
761  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
762  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
763 
764  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
765  // FIXME: should depend on NumberOfRegisters
766  nr = 4,
767  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
768 
769  LhsProgress = ResPacketSize,
770  RhsProgress = 1
771  };
772 
773  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
774  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
775  typedef typename packet_traits<ResScalar>::type _ResPacket;
776 
777  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
778  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
779  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
780 
781  typedef ResPacket AccPacket;
782 
783  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
784  {
785  p = pset1<ResPacket>(ResScalar(0));
786  }
787 
788  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
789  {
790  dest = pset1<RhsPacket>(*b);
791  }
792 
793  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
794  {
795  pbroadcast4(b, b0, b1, b2, b3);
796  }
797 
798 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
799 // {
800 // // FIXME not sure that's the best way to implement it!
801 // b0 = pload1<RhsPacket>(b+0);
802 // b1 = pload1<RhsPacket>(b+1);
803 // }
804 
805  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
806  {
807  dest = ploaddup<LhsPacket>(a);
808  }
809 
810  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
811  {
813  loadRhs(b,dest);
814  }
815 
816  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
817  {
818  dest = ploaddup<LhsPacket>(a);
819  }
820 
821  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
822  {
823  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
824  }
825 
826  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
827  {
828 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
829  EIGEN_UNUSED_VARIABLE(tmp);
830  c.v = pmadd(a,b.v,c.v);
831 #else
832  tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
833 #endif
834 
835  }
836 
837  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
838  {
839  c += a * b;
840  }
841 
842  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
843  {
844  r = cj.pmadd(alpha,c,r);
845  }
846 
847 protected:
848  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
849 };
850 
851 /* optimized GEneral packed Block * packed Panel product kernel
852  *
853  * Mixing type logic: C += A * B
854  * | A | B | comments
855  * |real |cplx | no vectorization yet, would require to pack A with duplication
856  * |cplx |real | easy vectorization
857  */
858 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
859 struct gebp_kernel
860 {
861  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
862  typedef typename Traits::ResScalar ResScalar;
863  typedef typename Traits::LhsPacket LhsPacket;
864  typedef typename Traits::RhsPacket RhsPacket;
865  typedef typename Traits::ResPacket ResPacket;
866  typedef typename Traits::AccPacket AccPacket;
867 
868  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
869  typedef typename SwappedTraits::ResScalar SResScalar;
870  typedef typename SwappedTraits::LhsPacket SLhsPacket;
871  typedef typename SwappedTraits::RhsPacket SRhsPacket;
872  typedef typename SwappedTraits::ResPacket SResPacket;
873  typedef typename SwappedTraits::AccPacket SAccPacket;
874 
875  typedef typename DataMapper::LinearMapper LinearMapper;
876 
877  enum {
878  Vectorizable = Traits::Vectorizable,
879  LhsProgress = Traits::LhsProgress,
880  RhsProgress = Traits::RhsProgress,
881  ResPacketSize = Traits::ResPacketSize
882  };
883 
884  EIGEN_DONT_INLINE
885  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
886  Index rows, Index depth, Index cols, ResScalar alpha,
887  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
888 };
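// Sketch of a typical invocation (a simplification of what the higher-level GEMM
// routines do; blockA/blockB are assumed to have been filled by the packing routines,
// e.g. gemm_pack_lhs below, and "mapper" to be a blas_data_mapper over the result):
//
//   typedef gebp_traits<float,float> Traits;
//   gebp_kernel<float,float,Index,DataMapper,Traits::mr,Traits::nr,false,false> gebp;
//   gebp(mapper, blockA, blockB, rows, depth, cols, alpha);
//   // accumulates mapper += alpha * A * B, one register-sized micro tile at a time.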
889 
890 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
891 EIGEN_DONT_INLINE
892 void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
893  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
894  Index rows, Index depth, Index cols, ResScalar alpha,
895  Index strideA, Index strideB, Index offsetA, Index offsetB)
896  {
897  Traits traits;
898  SwappedTraits straits;
899 
900  if(strideA==-1) strideA = depth;
901  if(strideB==-1) strideB = depth;
902  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
903  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
904  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
905  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
906  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
907  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
908  const Index peeled_kc = depth & ~(pk-1);
909  const Index prefetch_res_offset = 32/sizeof(ResScalar);
910 // const Index depth2 = depth & ~1;
911 
912  //---------- Process 3 * LhsProgress rows at once ----------
913  // This corresponds to 3*LhsProgress x nr register blocks.
914  // Usually, this only makes sense with FMA
915  if(mr>=3*Traits::LhsProgress)
916  {
917  // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
918  // and on each largest micro vertical panel of the rhs (depth * nr).
919  // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
920  // However, if depth is too small, we can extend the number of rows of these horizontal panels.
921  // The actual number of rows is computed as follows:
922  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
923  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
924  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
925  // or because we are testing specific blocking sizes.
926  const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
927  for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
928  {
929  const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
930  for(Index j2=0; j2<packet_cols4; j2+=nr)
931  {
932  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
933  {
934 
935  // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
936  // stored into 3 x nr registers.
937 
938  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
939  prefetch(&blA[0]);
940 
941  // gets res block as register
942  AccPacket C0, C1, C2, C3,
943  C4, C5, C6, C7,
944  C8, C9, C10, C11;
945  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
946  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
947  traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
948 
949  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
950  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
951  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
952  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
953 
954  r0.prefetch(0);
955  r1.prefetch(0);
956  r2.prefetch(0);
957  r3.prefetch(0);
958 
959  // performs "inner" products
960  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
961  prefetch(&blB[0]);
962  LhsPacket A0, A1;
963 
964  for(Index k=0; k<peeled_kc; k+=pk)
965  {
966  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
967  RhsPacket B_0, T0;
968  LhsPacket A2;
969 
970 #define EIGEN_GEBP_ONESTEP(K) \
971  do { \
972  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
973  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
974  internal::prefetch(blA+(3*K+16)*LhsProgress); \
975  if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
976  traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
977  traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
978  traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
979  traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
980  traits.madd(A0, B_0, C0, T0); \
981  traits.madd(A1, B_0, C4, T0); \
982  traits.madd(A2, B_0, C8, B_0); \
983  traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
984  traits.madd(A0, B_0, C1, T0); \
985  traits.madd(A1, B_0, C5, T0); \
986  traits.madd(A2, B_0, C9, B_0); \
987  traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
988  traits.madd(A0, B_0, C2, T0); \
989  traits.madd(A1, B_0, C6, T0); \
990  traits.madd(A2, B_0, C10, B_0); \
991  traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
992  traits.madd(A0, B_0, C3 , T0); \
993  traits.madd(A1, B_0, C7, T0); \
994  traits.madd(A2, B_0, C11, B_0); \
995  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
996  } while(false)
997 
998  internal::prefetch(blB);
999  EIGEN_GEBP_ONESTEP(0);
1000  EIGEN_GEBP_ONESTEP(1);
1001  EIGEN_GEBP_ONESTEP(2);
1002  EIGEN_GEBP_ONESTEP(3);
1003  EIGEN_GEBP_ONESTEP(4);
1004  EIGEN_GEBP_ONESTEP(5);
1005  EIGEN_GEBP_ONESTEP(6);
1006  EIGEN_GEBP_ONESTEP(7);
1007 
1008  blB += pk*4*RhsProgress;
1009  blA += pk*3*Traits::LhsProgress;
1010 
1011  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1012  }
1013  // process remaining peeled loop
1014  for(Index k=peeled_kc; k<depth; k++)
1015  {
1016  RhsPacket B_0, T0;
1017  LhsPacket A2;
1018  EIGEN_GEBP_ONESTEP(0);
1019  blB += 4*RhsProgress;
1020  blA += 3*Traits::LhsProgress;
1021  }
1022 
1023 #undef EIGEN_GEBP_ONESTEP
1024 
1025  ResPacket R0, R1, R2;
1026  ResPacket alphav = pset1<ResPacket>(alpha);
1027 
1028  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1029  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1030  R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1031  traits.acc(C0, alphav, R0);
1032  traits.acc(C4, alphav, R1);
1033  traits.acc(C8, alphav, R2);
1034  r0.storePacket(0 * Traits::ResPacketSize, R0);
1035  r0.storePacket(1 * Traits::ResPacketSize, R1);
1036  r0.storePacket(2 * Traits::ResPacketSize, R2);
1037 
1038  R0 = r1.loadPacket(0 * Traits::ResPacketSize);
1039  R1 = r1.loadPacket(1 * Traits::ResPacketSize);
1040  R2 = r1.loadPacket(2 * Traits::ResPacketSize);
1041  traits.acc(C1, alphav, R0);
1042  traits.acc(C5, alphav, R1);
1043  traits.acc(C9, alphav, R2);
1044  r1.storePacket(0 * Traits::ResPacketSize, R0);
1045  r1.storePacket(1 * Traits::ResPacketSize, R1);
1046  r1.storePacket(2 * Traits::ResPacketSize, R2);
1047 
1048  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1049  R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1050  R2 = r2.loadPacket(2 * Traits::ResPacketSize);
1051  traits.acc(C2, alphav, R0);
1052  traits.acc(C6, alphav, R1);
1053  traits.acc(C10, alphav, R2);
1054  r2.storePacket(0 * Traits::ResPacketSize, R0);
1055  r2.storePacket(1 * Traits::ResPacketSize, R1);
1056  r2.storePacket(2 * Traits::ResPacketSize, R2);
1057 
1058  R0 = r3.loadPacket(0 * Traits::ResPacketSize);
1059  R1 = r3.loadPacket(1 * Traits::ResPacketSize);
1060  R2 = r3.loadPacket(2 * Traits::ResPacketSize);
1061  traits.acc(C3, alphav, R0);
1062  traits.acc(C7, alphav, R1);
1063  traits.acc(C11, alphav, R2);
1064  r3.storePacket(0 * Traits::ResPacketSize, R0);
1065  r3.storePacket(1 * Traits::ResPacketSize, R1);
1066  r3.storePacket(2 * Traits::ResPacketSize, R2);
1067  }
1068  }
1069 
1070  // Deal with remaining columns of the rhs
1071  for(Index j2=packet_cols4; j2<cols; j2++)
1072  {
1073  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1074  {
1075  // One column at a time
1076  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1077  prefetch(&blA[0]);
1078 
1079  // gets res block as register
1080  AccPacket C0, C4, C8;
1081  traits.initAcc(C0);
1082  traits.initAcc(C4);
1083  traits.initAcc(C8);
1084 
1085  LinearMapper r0 = res.getLinearMapper(i, j2);
1086  r0.prefetch(0);
1087 
1088  // performs "inner" products
1089  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1090  LhsPacket A0, A1, A2;
1091 
1092  for(Index k=0; k<peeled_kc; k+=pk)
1093  {
1094  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1095  RhsPacket B_0;
1096 #define EIGEN_GEBGP_ONESTEP(K) \
1097  do { \
1098  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1099  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1100  traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1101  traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1102  traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1103  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1104  traits.madd(A0, B_0, C0, B_0); \
1105  traits.madd(A1, B_0, C4, B_0); \
1106  traits.madd(A2, B_0, C8, B_0); \
1107  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1108  } while(false)
1109 
1110  EIGEN_GEBGP_ONESTEP(0);
1111  EIGEN_GEBGP_ONESTEP(1);
1112  EIGEN_GEBGP_ONESTEP(2);
1113  EIGEN_GEBGP_ONESTEP(3);
1114  EIGEN_GEBGP_ONESTEP(4);
1115  EIGEN_GEBGP_ONESTEP(5);
1116  EIGEN_GEBGP_ONESTEP(6);
1117  EIGEN_GEBGP_ONESTEP(7);
1118 
1119  blB += pk*RhsProgress;
1120  blA += pk*3*Traits::LhsProgress;
1121 
1122  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1123  }
1124 
1125  // process remaining peeled loop
1126  for(Index k=peeled_kc; k<depth; k++)
1127  {
1128  RhsPacket B_0;
1129  EIGEN_GEBGP_ONESTEP(0);
1130  blB += RhsProgress;
1131  blA += 3*Traits::LhsProgress;
1132  }
1133 #undef EIGEN_GEBGP_ONESTEP
1134  ResPacket R0, R1, R2;
1135  ResPacket alphav = pset1<ResPacket>(alpha);
1136 
1137  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1138  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1139  R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1140  traits.acc(C0, alphav, R0);
1141  traits.acc(C4, alphav, R1);
1142  traits.acc(C8, alphav, R2);
1143  r0.storePacket(0 * Traits::ResPacketSize, R0);
1144  r0.storePacket(1 * Traits::ResPacketSize, R1);
1145  r0.storePacket(2 * Traits::ResPacketSize, R2);
1146  }
1147  }
1148  }
1149  }
1150 
1151  //---------- Process 2 * LhsProgress rows at once ----------
1152  if(mr>=2*Traits::LhsProgress)
1153  {
1154  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1155  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1156  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1157  // or because we are testing specific blocking sizes.
1158  Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
1159 
1160  for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1161  {
1162  Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1163  for(Index j2=0; j2<packet_cols4; j2+=nr)
1164  {
1165  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1166  {
1167 
1168  // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
1169  // stored into 2 x nr registers.
1170 
1171  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1172  prefetch(&blA[0]);
1173 
1174  // gets res block as register
1175  AccPacket C0, C1, C2, C3,
1176  C4, C5, C6, C7;
1177  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1178  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1179 
1180  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1181  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1182  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1183  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1184 
1185  r0.prefetch(prefetch_res_offset);
1186  r1.prefetch(prefetch_res_offset);
1187  r2.prefetch(prefetch_res_offset);
1188  r3.prefetch(prefetch_res_offset);
1189 
1190  // performs "inner" products
1191  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1192  prefetch(&blB[0]);
1193  LhsPacket A0, A1;
1194 
1195  for(Index k=0; k<peeled_kc; k+=pk)
1196  {
1197  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1198  RhsPacket B_0, B1, B2, B3, T0;
1199 
1200  // NOTE: the begin/end asm comments below work around bug 935!
1201  // but they are not enough for gcc>=6 without FMA (bug 1637)
1202  #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1203  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1204  #else
1205  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1206  #endif
1207  #define EIGEN_GEBGP_ONESTEP(K) \
1208  do { \
1209  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1210  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1211  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1212  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1213  traits.madd(A0, B_0, C0, T0); \
1214  traits.madd(A1, B_0, C4, B_0); \
1215  traits.madd(A0, B1, C1, T0); \
1216  traits.madd(A1, B1, C5, B1); \
1217  traits.madd(A0, B2, C2, T0); \
1218  traits.madd(A1, B2, C6, B2); \
1219  traits.madd(A0, B3, C3, T0); \
1220  traits.madd(A1, B3, C7, B3); \
1221  EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1222  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1223  } while(false)
1224 
1225  internal::prefetch(blB+(48+0));
1226  EIGEN_GEBGP_ONESTEP(0);
1227  EIGEN_GEBGP_ONESTEP(1);
1228  EIGEN_GEBGP_ONESTEP(2);
1229  EIGEN_GEBGP_ONESTEP(3);
1230  internal::prefetch(blB+(48+16));
1231  EIGEN_GEBGP_ONESTEP(4);
1232  EIGEN_GEBGP_ONESTEP(5);
1233  EIGEN_GEBGP_ONESTEP(6);
1234  EIGEN_GEBGP_ONESTEP(7);
1235 
1236  blB += pk*4*RhsProgress;
1237  blA += pk*(2*Traits::LhsProgress);
1238 
1239  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
1240  }
1241  // process remaining peeled loop
1242  for(Index k=peeled_kc; k<depth; k++)
1243  {
1244  RhsPacket B_0, B1, B2, B3, T0;
1245  EIGEN_GEBGP_ONESTEP(0);
1246  blB += 4*RhsProgress;
1247  blA += 2*Traits::LhsProgress;
1248  }
1249 #undef EIGEN_GEBGP_ONESTEP
1250 
1251  ResPacket R0, R1, R2, R3;
1252  ResPacket alphav = pset1<ResPacket>(alpha);
1253 
1254  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1255  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1256  R2 = r1.loadPacket(0 * Traits::ResPacketSize);
1257  R3 = r1.loadPacket(1 * Traits::ResPacketSize);
1258  traits.acc(C0, alphav, R0);
1259  traits.acc(C4, alphav, R1);
1260  traits.acc(C1, alphav, R2);
1261  traits.acc(C5, alphav, R3);
1262  r0.storePacket(0 * Traits::ResPacketSize, R0);
1263  r0.storePacket(1 * Traits::ResPacketSize, R1);
1264  r1.storePacket(0 * Traits::ResPacketSize, R2);
1265  r1.storePacket(1 * Traits::ResPacketSize, R3);
1266 
1267  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1268  R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1269  R2 = r3.loadPacket(0 * Traits::ResPacketSize);
1270  R3 = r3.loadPacket(1 * Traits::ResPacketSize);
1271  traits.acc(C2, alphav, R0);
1272  traits.acc(C6, alphav, R1);
1273  traits.acc(C3, alphav, R2);
1274  traits.acc(C7, alphav, R3);
1275  r2.storePacket(0 * Traits::ResPacketSize, R0);
1276  r2.storePacket(1 * Traits::ResPacketSize, R1);
1277  r3.storePacket(0 * Traits::ResPacketSize, R2);
1278  r3.storePacket(1 * Traits::ResPacketSize, R3);
1279  }
1280  }
1281 
1282  // Deal with remaining columns of the rhs
1283  for(Index j2=packet_cols4; j2<cols; j2++)
1284  {
1285  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1286  {
1287  // One column at a time
1288  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1289  prefetch(&blA[0]);
1290 
1291  // gets res block as register
1292  AccPacket C0, C4;
1293  traits.initAcc(C0);
1294  traits.initAcc(C4);
1295 
1296  LinearMapper r0 = res.getLinearMapper(i, j2);
1297  r0.prefetch(prefetch_res_offset);
1298 
1299  // performs "inner" products
1300  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1301  LhsPacket A0, A1;
1302 
1303  for(Index k=0; k<peeled_kc; k+=pk)
1304  {
1305  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
1306  RhsPacket B_0, B1;
1307 
1308 #define EIGEN_GEBGP_ONESTEP(K) \
1309  do { \
1310  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1311  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1312  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1313  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1314  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1315  traits.madd(A0, B_0, C0, B1); \
1316  traits.madd(A1, B_0, C4, B_0); \
1317  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1318  } while(false)
1319 
1320  EIGEN_GEBGP_ONESTEP(0);
1321  EIGEN_GEBGP_ONESTEP(1);
1322  EIGEN_GEBGP_ONESTEP(2);
1323  EIGEN_GEBGP_ONESTEP(3);
1324  EIGEN_GEBGP_ONESTEP(4);
1325  EIGEN_GEBGP_ONESTEP(5);
1326  EIGEN_GEBGP_ONESTEP(6);
1327  EIGEN_GEBGP_ONESTEP(7);
1328 
1329  blB += pk*RhsProgress;
1330  blA += pk*2*Traits::LhsProgress;
1331 
1332  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1333  }
1334 
1335  // process remaining peeled loop
1336  for(Index k=peeled_kc; k<depth; k++)
1337  {
1338  RhsPacket B_0, B1;
1339  EIGEN_GEBGP_ONESTEP(0);
1340  blB += RhsProgress;
1341  blA += 2*Traits::LhsProgress;
1342  }
1343 #undef EIGEN_GEBGP_ONESTEP
1344  ResPacket R0, R1;
1345  ResPacket alphav = pset1<ResPacket>(alpha);
1346 
1347  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1348  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1349  traits.acc(C0, alphav, R0);
1350  traits.acc(C4, alphav, R1);
1351  r0.storePacket(0 * Traits::ResPacketSize, R0);
1352  r0.storePacket(1 * Traits::ResPacketSize, R1);
1353  }
1354  }
1355  }
1356  }
1357  //---------- Process 1 * LhsProgress rows at once ----------
1358  if(mr>=1*Traits::LhsProgress)
1359  {
1360  // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
1361  for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
1362  {
1363  // loops on each largest micro vertical panel of rhs (depth * nr)
1364  for(Index j2=0; j2<packet_cols4; j2+=nr)
1365  {
1366  // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
1367  // stored into 1 x nr registers.
1368 
1369  const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1370  prefetch(&blA[0]);
1371 
1372  // gets res block as register
1373  AccPacket C0, C1, C2, C3;
1374  traits.initAcc(C0);
1375  traits.initAcc(C1);
1376  traits.initAcc(C2);
1377  traits.initAcc(C3);
1378 
1379  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1380  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1381  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1382  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1383 
1384  r0.prefetch(prefetch_res_offset);
1385  r1.prefetch(prefetch_res_offset);
1386  r2.prefetch(prefetch_res_offset);
1387  r3.prefetch(prefetch_res_offset);
1388 
1389  // performs "inner" products
1390  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1391  prefetch(&blB[0]);
1392  LhsPacket A0;
1393 
1394  for(Index k=0; k<peeled_kc; k+=pk)
1395  {
1396  EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
1397  RhsPacket B_0, B1, B2, B3;
1398 
1399 #define EIGEN_GEBGP_ONESTEP(K) \
1400  do { \
1401  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1402  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1403  traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1404  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1405  traits.madd(A0, B_0, C0, B_0); \
1406  traits.madd(A0, B1, C1, B1); \
1407  traits.madd(A0, B2, C2, B2); \
1408  traits.madd(A0, B3, C3, B3); \
1409  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1410  } while(false)
1411 
1412  internal::prefetch(blB+(48+0));
1413  EIGEN_GEBGP_ONESTEP(0);
1414  EIGEN_GEBGP_ONESTEP(1);
1415  EIGEN_GEBGP_ONESTEP(2);
1416  EIGEN_GEBGP_ONESTEP(3);
1417  internal::prefetch(blB+(48+16));
1418  EIGEN_GEBGP_ONESTEP(4);
1419  EIGEN_GEBGP_ONESTEP(5);
1420  EIGEN_GEBGP_ONESTEP(6);
1421  EIGEN_GEBGP_ONESTEP(7);
1422 
1423  blB += pk*4*RhsProgress;
1424  blA += pk*1*LhsProgress;
1425 
1426  EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
1427  }
1428  // process remaining peeled loop
1429  for(Index k=peeled_kc; k<depth; k++)
1430  {
1431  RhsPacket B_0, B1, B2, B3;
1432  EIGEN_GEBGP_ONESTEP(0);
1433  blB += 4*RhsProgress;
1434  blA += 1*LhsProgress;
1435  }
1436 #undef EIGEN_GEBGP_ONESTEP
1437 
1438  ResPacket R0, R1;
1439  ResPacket alphav = pset1<ResPacket>(alpha);
1440 
1441  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1442  R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1443  traits.acc(C0, alphav, R0);
1444  traits.acc(C1, alphav, R1);
1445  r0.storePacket(0 * Traits::ResPacketSize, R0);
1446  r1.storePacket(0 * Traits::ResPacketSize, R1);
1447 
1448  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1449  R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1450  traits.acc(C2, alphav, R0);
1451  traits.acc(C3, alphav, R1);
1452  r2.storePacket(0 * Traits::ResPacketSize, R0);
1453  r3.storePacket(0 * Traits::ResPacketSize, R1);
1454  }
1455 
1456  // Deal with remaining columns of the rhs
1457  for(Index j2=packet_cols4; j2<cols; j2++)
1458  {
1459  // One column at a time
1460  const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1461  prefetch(&blA[0]);
1462 
1463  // gets res block as register
1464  AccPacket C0;
1465  traits.initAcc(C0);
1466 
1467  LinearMapper r0 = res.getLinearMapper(i, j2);
1468 
1469  // performs "inner" products
1470  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1471  LhsPacket A0;
1472 
1473  for(Index k=0; k<peeled_kc; k+=pk)
1474  {
1475  EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
1476  RhsPacket B_0;
1477 
1478 #define EIGEN_GEBGP_ONESTEP(K) \
1479  do { \
1480  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1481  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1482  traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1483  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1484  traits.madd(A0, B_0, C0, B_0); \
1485  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1486  } while(false);
1487 
1488  EIGEN_GEBGP_ONESTEP(0);
1489  EIGEN_GEBGP_ONESTEP(1);
1490  EIGEN_GEBGP_ONESTEP(2);
1491  EIGEN_GEBGP_ONESTEP(3);
1492  EIGEN_GEBGP_ONESTEP(4);
1493  EIGEN_GEBGP_ONESTEP(5);
1494  EIGEN_GEBGP_ONESTEP(6);
1495  EIGEN_GEBGP_ONESTEP(7);
1496 
1497  blB += pk*RhsProgress;
1498  blA += pk*1*Traits::LhsProgress;
1499 
1500  EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
1501  }
1502 
1503  // process remaining peeled loop
1504  for(Index k=peeled_kc; k<depth; k++)
1505  {
1506  RhsPacket B_0;
1507  EIGEN_GEBGP_ONESTEP(0);
1508  blB += RhsProgress;
1509  blA += 1*Traits::LhsProgress;
1510  }
1511 #undef EIGEN_GEBGP_ONESTEP
1512  ResPacket R0;
1513  ResPacket alphav = pset1<ResPacket>(alpha);
1514  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1515  traits.acc(C0, alphav, R0);
1516  r0.storePacket(0 * Traits::ResPacketSize, R0);
1517  }
1518  }
1519  }
1520  //---------- Process remaining rows, 1 at once ----------
1521  if(peeled_mc1<rows)
1522  {
1523  // loop on each panel of the rhs
1524  for(Index j2=0; j2<packet_cols4; j2+=nr)
1525  {
1526  // loop on each row of the lhs (1*LhsProgress x depth)
1527  for(Index i=peeled_mc1; i<rows; i+=1)
1528  {
1529  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1530  prefetch(&blA[0]);
1531  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1532 
1533  // The following piece of code won't work for 512 bit registers
1534  // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
1535  // as nr (which is currently 4) for the return type.
1536  const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1537  if ((SwappedTraits::LhsProgress % 4) == 0 &&
1538  (SwappedTraits::LhsProgress <= 8) &&
1539  (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
1540  {
1541  SAccPacket C0, C1, C2, C3;
1542  straits.initAcc(C0);
1543  straits.initAcc(C1);
1544  straits.initAcc(C2);
1545  straits.initAcc(C3);
1546 
1547  const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1548  const Index endk = (depth/spk)*spk;
1549  const Index endk4 = (depth/(spk*4))*(spk*4);
1550 
1551  Index k=0;
1552  for(; k<endk4; k+=4*spk)
1553  {
1554  SLhsPacket A0,A1;
1555  SRhsPacket B_0,B_1;
1556 
1557  straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1558  straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1559 
1560  straits.loadRhsQuad(blA+0*spk, B_0);
1561  straits.loadRhsQuad(blA+1*spk, B_1);
1562  straits.madd(A0,B_0,C0,B_0);
1563  straits.madd(A1,B_1,C1,B_1);
1564 
1565  straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1566  straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1567  straits.loadRhsQuad(blA+2*spk, B_0);
1568  straits.loadRhsQuad(blA+3*spk, B_1);
1569  straits.madd(A0,B_0,C2,B_0);
1570  straits.madd(A1,B_1,C3,B_1);
1571 
1572  blB += 4*SwappedTraits::LhsProgress;
1573  blA += 4*spk;
1574  }
1575  C0 = padd(padd(C0,C1),padd(C2,C3));
1576  for(; k<endk; k+=spk)
1577  {
1578  SLhsPacket A0;
1579  SRhsPacket B_0;
1580 
1581  straits.loadLhsUnaligned(blB, A0);
1582  straits.loadRhsQuad(blA, B_0);
1583  straits.madd(A0,B_0,C0,B_0);
1584 
1585  blB += SwappedTraits::LhsProgress;
1586  blA += spk;
1587  }
1588  if(SwappedTraits::LhsProgress==8)
1589  {
1590  // Special case where we have to first reduce the accumulation register C0
1591  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1592  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1593  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1594  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1595 
1596  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1597  SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1598 
1599  if(depth-endk>0)
1600  {
1601  // We have to handle the last row of the rhs which corresponds to a half-packet
1602  SLhsPacketHalf a0;
1603  SRhsPacketHalf b0;
1604  straits.loadLhsUnaligned(blB, a0);
1605  straits.loadRhs(blA, b0);
1606  SAccPacketHalf c0 = predux_downto4(C0);
1607  straits.madd(a0,b0,c0,b0);
1608  straits.acc(c0, alphav, R);
1609  }
1610  else
1611  {
1612  straits.acc(predux_downto4(C0), alphav, R);
1613  }
1614  res.scatterPacket(i, j2, R);
1615  }
1616  else
1617  {
1618  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
1619  SResPacket alphav = pset1<SResPacket>(alpha);
1620  straits.acc(C0, alphav, R);
1621  res.scatterPacket(i, j2, R);
1622  }
1623  }
1624  else // scalar path
1625  {
1626  // get a 1 x 4 res block as registers
1627  ResScalar C0(0), C1(0), C2(0), C3(0);
1628 
1629  for(Index k=0; k<depth; k++)
1630  {
1631  LhsScalar A0;
1632  RhsScalar B_0, B_1;
1633 
1634  A0 = blA[k];
1635 
1636  B_0 = blB[0];
1637  B_1 = blB[1];
1638  CJMADD(cj,A0,B_0,C0, B_0);
1639  CJMADD(cj,A0,B_1,C1, B_1);
1640 
1641  B_0 = blB[2];
1642  B_1 = blB[3];
1643  CJMADD(cj,A0,B_0,C2, B_0);
1644  CJMADD(cj,A0,B_1,C3, B_1);
1645 
1646  blB += 4;
1647  }
1648  res(i, j2 + 0) += alpha * C0;
1649  res(i, j2 + 1) += alpha * C1;
1650  res(i, j2 + 2) += alpha * C2;
1651  res(i, j2 + 3) += alpha * C3;
1652  }
1653  }
1654  }
1655  // remaining columns
1656  for(Index j2=packet_cols4; j2<cols; j2++)
1657  {
1658  // loop on each row of the lhs (1*LhsProgress x depth)
1659  for(Index i=peeled_mc1; i<rows; i+=1)
1660  {
1661  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1662  prefetch(&blA[0]);
1663  // gets a 1 x 1 res block as registers
1664  ResScalar C0(0);
1665  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1666  for(Index k=0; k<depth; k++)
1667  {
1668  LhsScalar A0 = blA[k];
1669  RhsScalar B_0 = blB[k];
1670  CJMADD(cj, A0, B_0, C0, B_0);
1671  }
1672  res(i, j2) += alpha * C0;
1673  }
1674  }
1675  }
1676  }
1677 
1678 
1679 #undef CJMADD
1680 
1681 // pack a block of the lhs
1682 // The traversal is as follows (mr==4):
1683 // 0 4 8 12 ...
1684 // 1 5 9 13 ...
1685 // 2 6 10 14 ...
1686 // 3 7 11 15 ...
1687 //
1688 // 16 20 24 28 ...
1689 // 17 21 25 29 ...
1690 // 18 22 26 30 ...
1691 // 19 23 27 31 ...
1692 //
1693 // 32 33 34 35 ...
1694 // 36 37 38 39 ...
1695 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1696 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1697 {
1698  typedef typename DataMapper::LinearMapper LinearMapper;
1699  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1700 };
1701 
1702 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1703 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1704  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1705 {
1706  typedef typename packet_traits<Scalar>::type Packet;
1707  enum { PacketSize = packet_traits<Scalar>::size };
1708 
1709  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1710  EIGEN_UNUSED_VARIABLE(stride);
1711  EIGEN_UNUSED_VARIABLE(offset);
1712  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1713  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
1714  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1715  Index count = 0;
1716 
1717  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1718  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1719  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1720  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
1721  : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
1722 
1723  Index i=0;
1724 
1725  // Pack 3 packets
1726  if(Pack1>=3*PacketSize)
1727  {
1728  for(; i<peeled_mc3; i+=3*PacketSize)
1729  {
1730  if(PanelMode) count += (3*PacketSize) * offset;
1731 
1732  for(Index k=0; k<depth; k++)
1733  {
1734  Packet A, B, C;
1735  A = lhs.loadPacket(i+0*PacketSize, k);
1736  B = lhs.loadPacket(i+1*PacketSize, k);
1737  C = lhs.loadPacket(i+2*PacketSize, k);
1738  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1739  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1740  pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
1741  }
1742  if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
1743  }
1744  }
1745  // Pack 2 packets
1746  if(Pack1>=2*PacketSize)
1747  {
1748  for(; i<peeled_mc2; i+=2*PacketSize)
1749  {
1750  if(PanelMode) count += (2*PacketSize) * offset;
1751 
1752  for(Index k=0; k<depth; k++)
1753  {
1754  Packet A, B;
1755  A = lhs.loadPacket(i+0*PacketSize, k);
1756  B = lhs.loadPacket(i+1*PacketSize, k);
1757  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1758  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1759  }
1760  if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
1761  }
1762  }
1763  // Pack 1 packet
1764  if(Pack1>=1*PacketSize)
1765  {
1766  for(; i<peeled_mc1; i+=1*PacketSize)
1767  {
1768  if(PanelMode) count += (1*PacketSize) * offset;
1769 
1770  for(Index k=0; k<depth; k++)
1771  {
1772  Packet A;
1773  A = lhs.loadPacket(i+0*PacketSize, k);
1774  pstore(blockA+count, cj.pconj(A));
1775  count+=PacketSize;
1776  }
1777  if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
1778  }
1779  }
1780  // Pack scalars
1781  if(Pack2<PacketSize && Pack2>1)
1782  {
1783  for(; i<peeled_mc0; i+=Pack2)
1784  {
1785  if(PanelMode) count += Pack2 * offset;
1786 
1787  for(Index k=0; k<depth; k++)
1788  for(Index w=0; w<Pack2; w++)
1789  blockA[count++] = cj(lhs(i+w, k));
1790 
1791  if(PanelMode) count += Pack2 * (stride-offset-depth);
1792  }
1793  }
1794  for(; i<rows; i++)
1795  {
1796  if(PanelMode) count += offset;
1797  for(Index k=0; k<depth; k++)
1798  blockA[count++] = cj(lhs(i, k));
1799  if(PanelMode) count += (stride-offset-depth);
1800  }
1801 }
1802 
1803 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1804 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1805 {
1806  typedef typename DataMapper::LinearMapper LinearMapper;
1807  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1808 };
1809 
1810 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1811 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1812  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1813 {
1814  typedef typename packet_traits<Scalar>::type Packet;
1815  enum { PacketSize = packet_traits<Scalar>::size };
1816 
1817  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1818  EIGEN_UNUSED_VARIABLE(stride);
1819  EIGEN_UNUSED_VARIABLE(offset);
1820  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1821  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1822  Index count = 0;
1823 
1824 // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1825 // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1826 // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1827 
1828  int pack = Pack1;
1829  Index i = 0;
1830  while(pack>0)
1831  {
1832  Index remaining_rows = rows-i;
1833  Index peeled_mc = i+(remaining_rows/pack)*pack;
1834  for(; i<peeled_mc; i+=pack)
1835  {
1836  if(PanelMode) count += pack * offset;
1837 
1838  const Index peeled_k = (depth/PacketSize)*PacketSize;
1839  Index k=0;
1840  if(pack>=PacketSize)
1841  {
1842  for(; k<peeled_k; k+=PacketSize)
1843  {
1844  for (Index m = 0; m < pack; m += PacketSize)
1845  {
1846  PacketBlock<Packet> kernel;
1847  for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
1848  ptranspose(kernel);
1849  for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
1850  }
1851  count += PacketSize*pack;
1852  }
1853  }
1854  for(; k<depth; k++)
1855  {
1856  Index w=0;
1857  for(; w<pack-3; w+=4)
1858  {
1859  Scalar a(cj(lhs(i+w+0, k))),
1860  b(cj(lhs(i+w+1, k))),
1861  c(cj(lhs(i+w+2, k))),
1862  d(cj(lhs(i+w+3, k)));
1863  blockA[count++] = a;
1864  blockA[count++] = b;
1865  blockA[count++] = c;
1866  blockA[count++] = d;
1867  }
1868  if(pack%4)
1869  for(;w<pack;++w)
1870  blockA[count++] = cj(lhs(i+w, k));
1871  }
1872 
1873  if(PanelMode) count += pack * (stride-offset-depth);
1874  }
1875 
1876  pack -= PacketSize;
1877  if(pack<Pack2 && (pack+PacketSize)!=Pack2)
1878  pack = Pack2;
1879  }
1880 
1881  for(; i<rows; i++)
1882  {
1883  if(PanelMode) count += offset;
1884  for(Index k=0; k<depth; k++)
1885  blockA[count++] = cj(lhs(i, k));
1886  if(PanelMode) count += (stride-offset-depth);
1887  }
1888 }
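
// Illustrative note (not part of Eigen's source): in the RowMajor path above,
// each lhs.loadPacket(i+p+m, k) pulls PacketSize consecutive depth entries of
// one row, and ptranspose() flips that PacketSize x PacketSize register tile so
// that each packet then holds a single depth slice across PacketSize rows, i.e.
// the same interleaved order as the ColMajor packer. A minimal sketch, assuming
// 4-wide float packets (Packet4f):
//
//   float r0[4] = { 0, 1, 2, 3}, r1[4] = { 4, 5, 6, 7},
//         r2[4] = { 8, 9,10,11}, r3[4] = {12,13,14,15};
//   PacketBlock<Packet4f,4> t;
//   t.packet[0] = ploadu<Packet4f>(r0); t.packet[1] = ploadu<Packet4f>(r1);
//   t.packet[2] = ploadu<Packet4f>(r2); t.packet[3] = ploadu<Packet4f>(r3);
//   ptranspose(t);  // now t.packet[0]=={0,4,8,12}, t.packet[1]=={1,5,9,13}, ...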
1889 
1890 // copy a complete panel of the rhs
1891 // this version is optimized for column major matrices
1892 // The traversal order is as follows (nr==4):
1893 // 0 1 2 3 12 13 14 15 24 27
1894 // 4 5 6 7 16 17 18 19 25 28
1895 // 8 9 10 11 20 21 22 23 26 29
1896 // . . . . . . . . . .
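//
// Illustrative reference (not part of Eigen's source): ignoring vectorization,
// conjugation and PanelMode, the order above corresponds to the naive loops
// below (hypothetical helper; rhs(k,j) is the source mapper, blockB the packed
// destination):
//
//   Index count = 0;
//   for(Index j=0; j+4<=cols; j+=4)        // groups of nr==4 columns
//     for(Index k=0; k<depth; ++k)
//       for(Index w=0; w<4; ++w)
//         blockB[count++] = rhs(k, j+w);   // 0,1,2,3 then 4,5,6,7, ...
//   for(Index j=(cols/4)*4; j<cols; ++j)   // leftover columns, one at a time
//     for(Index k=0; k<depth; ++k)
//       blockB[count++] = rhs(k, j);       // 24,25,26 then 27,28,29, ...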
1897 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1898 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
1899 {
1900  typedef typename packet_traits<Scalar>::type Packet;
1901  typedef typename DataMapper::LinearMapper LinearMapper;
1902  enum { PacketSize = packet_traits<Scalar>::size };
1903  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
1904 };
1905 
1906 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1907 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
1908  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
1909 {
1910  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
1911  EIGEN_UNUSED_VARIABLE(stride);
1912  EIGEN_UNUSED_VARIABLE(offset);
1913  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1914  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1915  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
1916  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1917  Index count = 0;
1918  const Index peeled_k = (depth/PacketSize)*PacketSize;
1919 // if(nr>=8)
1920 // {
1921 // for(Index j2=0; j2<packet_cols8; j2+=8)
1922 // {
1923 // // skip what we have before
1924 // if(PanelMode) count += 8 * offset;
1925 // const Scalar* b0 = &rhs[(j2+0)*rhsStride];
1926 // const Scalar* b1 = &rhs[(j2+1)*rhsStride];
1927 // const Scalar* b2 = &rhs[(j2+2)*rhsStride];
1928 // const Scalar* b3 = &rhs[(j2+3)*rhsStride];
1929 // const Scalar* b4 = &rhs[(j2+4)*rhsStride];
1930 // const Scalar* b5 = &rhs[(j2+5)*rhsStride];
1931 // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
1932 // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
1933 // Index k=0;
1934 // if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
1935 // {
1936 // for(; k<peeled_k; k+=PacketSize) {
1937 // PacketBlock<Packet> kernel;
1938 // for (int p = 0; p < PacketSize; ++p) {
1939 // kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
1940 // }
1941 // ptranspose(kernel);
1942 // for (int p = 0; p < PacketSize; ++p) {
1943 // pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
1944 // count+=PacketSize;
1945 // }
1946 // }
1947 // }
1948 // for(; k<depth; k++)
1949 // {
1950 // blockB[count+0] = cj(b0[k]);
1951 // blockB[count+1] = cj(b1[k]);
1952 // blockB[count+2] = cj(b2[k]);
1953 // blockB[count+3] = cj(b3[k]);
1954 // blockB[count+4] = cj(b4[k]);
1955 // blockB[count+5] = cj(b5[k]);
1956 // blockB[count+6] = cj(b6[k]);
1957 // blockB[count+7] = cj(b7[k]);
1958 // count += 8;
1959 // }
1960 // // skip what we have after
1961 // if(PanelMode) count += 8 * (stride-offset-depth);
1962 // }
1963 // }
1964 
1965  if(nr>=4)
1966  {
1967  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
1968  {
1969  // skip what we have before
1970  if(PanelMode) count += 4 * offset;
1971  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
1972  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
1973  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
1974  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
1975 
1976  Index k=0;
1977  if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
1978  {
1979  for(; k<peeled_k; k+=PacketSize) {
1980  PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
1981  kernel.packet[0] = dm0.loadPacket(k);
1982  kernel.packet[1%PacketSize] = dm1.loadPacket(k);
1983  kernel.packet[2%PacketSize] = dm2.loadPacket(k);
1984  kernel.packet[3%PacketSize] = dm3.loadPacket(k);
1985  ptranspose(kernel);
1986  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
1987  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
1988  pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
1989  pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
1990  count+=4*PacketSize;
1991  }
1992  }
1993  for(; k<depth; k++)
1994  {
1995  blockB[count+0] = cj(dm0(k));
1996  blockB[count+1] = cj(dm1(k));
1997  blockB[count+2] = cj(dm2(k));
1998  blockB[count+3] = cj(dm3(k));
1999  count += 4;
2000  }
2001  // skip what we have after
2002  if(PanelMode) count += 4 * (stride-offset-depth);
2003  }
2004  }
2005 
2006  // copy the remaining columns one at a time (nr==1)
2007  for(Index j2=packet_cols4; j2<cols; ++j2)
2008  {
2009  if(PanelMode) count += offset;
2010  const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2011  for(Index k=0; k<depth; k++)
2012  {
2013  blockB[count] = cj(dm0(k));
2014  count += 1;
2015  }
2016  if(PanelMode) count += (stride-offset-depth);
2017  }
2018 }
2019 
2020 // this version is optimized for row major matrices
2021 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2022 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2023 {
2024  typedef typename packet_traits<Scalar>::type Packet;
2025  typedef typename DataMapper::LinearMapper LinearMapper;
2026  enum { PacketSize = packet_traits<Scalar>::size };
2027  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2028 };
2029 
2030 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2031 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2032  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2033 {
2034  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2035  EIGEN_UNUSED_VARIABLE(stride);
2036  EIGEN_UNUSED_VARIABLE(offset);
2037  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2038  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2039  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2040  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2041  Index count = 0;
2042 
2043 // if(nr>=8)
2044 // {
2045 // for(Index j2=0; j2<packet_cols8; j2+=8)
2046 // {
2047 // // skip what we have before
2048 // if(PanelMode) count += 8 * offset;
2049 // for(Index k=0; k<depth; k++)
2050 // {
2051 // if (PacketSize==8) {
2052 // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2053 // pstoreu(blockB+count, cj.pconj(A));
2054 // } else if (PacketSize==4) {
2055 // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2056 // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2057 // pstoreu(blockB+count, cj.pconj(A));
2058 // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2059 // } else {
2060 // const Scalar* b0 = &rhs[k*rhsStride + j2];
2061 // blockB[count+0] = cj(b0[0]);
2062 // blockB[count+1] = cj(b0[1]);
2063 // blockB[count+2] = cj(b0[2]);
2064 // blockB[count+3] = cj(b0[3]);
2065 // blockB[count+4] = cj(b0[4]);
2066 // blockB[count+5] = cj(b0[5]);
2067 // blockB[count+6] = cj(b0[6]);
2068 // blockB[count+7] = cj(b0[7]);
2069 // }
2070 // count += 8;
2071 // }
2072 // // skip what we have after
2073 // if(PanelMode) count += 8 * (stride-offset-depth);
2074 // }
2075 // }
2076  if(nr>=4)
2077  {
2078  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2079  {
2080  // skip what we have before
2081  if(PanelMode) count += 4 * offset;
2082  for(Index k=0; k<depth; k++)
2083  {
2084  if (PacketSize==4) {
2085  Packet A = rhs.loadPacket(k, j2);
2086  pstoreu(blockB+count, cj.pconj(A));
2087  count += PacketSize;
2088  } else {
2089  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2090  blockB[count+0] = cj(dm0(0));
2091  blockB[count+1] = cj(dm0(1));
2092  blockB[count+2] = cj(dm0(2));
2093  blockB[count+3] = cj(dm0(3));
2094  count += 4;
2095  }
2096  }
2097  // skip what we have after
2098  if(PanelMode) count += 4 * (stride-offset-depth);
2099  }
2100  }
2101  // copy the remaining columns one at a time (nr==1)
2102  for(Index j2=packet_cols4; j2<cols; ++j2)
2103  {
2104  if(PanelMode) count += offset;
2105  for(Index k=0; k<depth; k++)
2106  {
2107  blockB[count] = cj(rhs(k, j2));
2108  count += 1;
2109  }
2110  if(PanelMode) count += stride-offset-depth;
2111  }
2112 }
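
// Illustrative note (not part of Eigen's source): for a row-major rhs the nr==4
// values rhs(k, j2..j2+3) are already contiguous in memory, which is why the
// PacketSize==4 branch above reduces to one packet copy per k, whereas the
// ColMajor packer needs four strided linear mappers plus a transpose:
//
//   Packet A = rhs.loadPacket(k, j2);       // rhs(k,j2) ... rhs(k,j2+3)
//   pstoreu(blockB+count, cj.pconj(A));
//   count += PacketSize;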
2113 
2114 } // end namespace internal
2115 
2118 inline std::ptrdiff_t l1CacheSize()
2119 {
2120  std::ptrdiff_t l1, l2, l3;
2121  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2122  return l1;
2123 }
2124 
2127 inline std::ptrdiff_t l2CacheSize()
2128 {
2129  std::ptrdiff_t l1, l2, l3;
2130  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2131  return l2;
2132 }
2133 
2137 inline std::ptrdiff_t l3CacheSize()
2138 {
2139  std::ptrdiff_t l1, l2, l3;
2140  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2141  return l3;
2142 }
2143 
2149 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
2150 {
2151  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
2152 }
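
// Usage sketch (illustrative, not part of Eigen's source): the getters and the
// setter above are the public knobs feeding the GEBP blocking heuristics; all
// values are in bytes.
//
//   #include <Eigen/Core>
//   #include <iostream>
//   int main()
//   {
//     std::cout << Eigen::l1CacheSize() << " " << Eigen::l2CacheSize() << " "
//               << Eigen::l3CacheSize() << "\n";
//     // Override the detected values, e.g. for a 32KB L1 / 1MB L2 / 8MB L3 machine:
//     Eigen::setCpuCacheSizes(32*1024, 1024*1024, 8*1024*1024);
//     return 0;
//   }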
2153 
2154 } // end namespace Eigen
2155 
2156 #endif // EIGEN_GENERAL_BLOCK_PANEL_H