products/GeneralBlockPanelKernel.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11 #define EIGEN_GENERAL_BLOCK_PANEL_H
12 
13 
14 namespace Eigen {
15 
16 namespace internal {
17 
18 enum GEBPPacketSizeType {
19  GEBPPacketFull = 0,
20  GEBPPacketHalf,
21  GEBPPacketQuarter
22 };
23 
24 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
25 class gebp_traits;
26 
27 
29 inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
30 {
31  return a<=0 ? b : a;
32 }
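// For illustration (the numeric value is hypothetical): queryCacheSizes() reports a
// non-positive size when detection fails, in which case the helper falls back to the
// architecture default defined below:
//   manage_caching_sizes_helper(-1,    defaultL1CacheSize);  // -> defaultL1CacheSize
//   manage_caching_sizes_helper(49152, defaultL1CacheSize);  // -> 49152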
33 
34 #if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
35 #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
36 #else
37 #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
38 #endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
39 
40 #if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
41 #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
42 #else
43 #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
44 #endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
45 
46 #if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
47 #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
48 #else
49 #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
50 #endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
51 
52 #if EIGEN_ARCH_i386_OR_x86_64
53 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
54 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
55 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
56 #elif EIGEN_ARCH_PPC
57 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
58 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
59 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
60 #else
61 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
62 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
63 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
64 #endif
65 
66 #undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
67 #undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
68 #undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
69 
71 struct CacheSizes {
72  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
73  int l1CacheSize, l2CacheSize, l3CacheSize;
74  queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
75  m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
76  m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
77  m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
78  }
79 
80  std::ptrdiff_t m_l1;
81  std::ptrdiff_t m_l2;
82  std::ptrdiff_t m_l3;
83 };
84 
86 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
87 {
88  static CacheSizes m_cacheSizes;
89 
90  if(action==SetAction)
91  {
92  // set the CPU cache sizes and cache all block sizes from a global cache size in bytes
93  eigen_internal_assert(l1!=0 && l2!=0);
94  m_cacheSizes.m_l1 = *l1;
95  m_cacheSizes.m_l2 = *l2;
96  m_cacheSizes.m_l3 = *l3;
97  }
98  else if(action==GetAction)
99  {
100  eigen_internal_assert(l1!=0 && l2!=0);
101  *l1 = m_cacheSizes.m_l1;
102  *l2 = m_cacheSizes.m_l2;
103  *l3 = m_cacheSizes.m_l3;
104  }
105  else
106  {
107  eigen_internal_assert(false);
108  }
109 }
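// A minimal caller-side sketch (assuming the public setCpuCacheSizes()/l1CacheSize()
// helpers, which are thin wrappers around this function): overriding the detected
// sizes before running large products, then reading them back.
//   std::ptrdiff_t l1 = 32*1024, l2 = 1024*1024, l3 = 8*1024*1024;
//   internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
//   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);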
110 
111 /* Helper for computeProductBlockingSizes.
112  *
113  * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
114  * this function computes the blocking size parameters along the respective dimensions
115  * for matrix products and related algorithms. The blocking sizes depend on various
116  * parameters:
117  * - the L1 and L2 cache sizes,
118  * - the register level blocking sizes defined by gebp_traits,
119  * - the number of scalars that fit into a packet (when vectorization is enabled).
120  *
121  * \sa setCpuCacheSizes */
122 
123 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
124 void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
125 {
126  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
127 
128  // Explanations:
129  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
130  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
131  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
132  // at the register level. This small horizontal panel has to stay within L1 cache.
133  std::ptrdiff_t l1, l2, l3;
134  manage_caching_sizes(GetAction, &l1, &l2, &l3);
135  #ifdef EIGEN_VECTORIZE_AVX512
136  // We still need to find a rationale for this, but without this adjustment,
137  // performance with AVX512 is pretty bad, around 20% slower.
138  // One reason is that with an increasing packet size, the blocking size k
139  // has to become pretty small if we want one lhs panel to fit within L1.
140  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels is:
141  // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
142  // This is quite small for a good reuse of the accumulation registers.
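 // (Arithmetic check of the numbers above: 32*1024 / (3*64 + 4*8) = 32768 / 224 ~ 146,
 //  and rounding down to a multiple of 8 gives k = 144.)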
143  l1 *= 4;
144  #endif
145 
146  if (num_threads > 1) {
147  typedef typename Traits::ResScalar ResScalar;
148  enum {
149  kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
150  ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
151  kr = 8,
152  mr = Traits::mr,
153  nr = Traits::nr
154  };
155  // Increasing k gives us more time to prefetch the content of the "C"
156  // registers. However once the latency is hidden there is no point in
157  // increasing the value of k, so we'll cap it at 320 (value determined
158  // experimentally).
159  // To prevent k from vanishing, we make k_cache at least as big as kr
160  const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
161  if (k_cache < k) {
162  k = k_cache - (k_cache % kr);
163  eigen_internal_assert(k > 0);
164  }
165 
166  const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
167  const Index n_per_thread = numext::div_ceil(n, num_threads);
168  if (n_cache <= n_per_thread) {
169  // Don't exceed the capacity of the l2 cache.
170  eigen_internal_assert(n_cache >= static_cast<Index>(nr));
171  n = n_cache - (n_cache % nr);
172  eigen_internal_assert(n > 0);
173  } else {
174  n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
175  }
176 
177  if (l3 > l2) {
178  // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
179  const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
180  const Index m_per_thread = numext::div_ceil(m, num_threads);
181  if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
182  m = m_cache - (m_cache % mr);
183  eigen_internal_assert(m > 0);
184  } else {
185  m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
186  }
187  }
188  }
189  else {
190  // In unit tests we do not want to use extra large matrices,
191  // so we reduce the cache size to check the blocking strategy is not flawed
192 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
193  l1 = 9*1024;
194  l2 = 32*1024;
195  l3 = 512*1024;
196 #endif
197 
198  // Early return for small problems because the computations below are time consuming for them.
199  // Perhaps it would make more sense to consider k*n*m??
200  // Note that for very tiny problems, this function should be bypassed anyway
201  // because we use the coefficient-based implementation for them.
202  if((numext::maxi)(k,(numext::maxi)(m,n))<48)
203  return;
204 
205  typedef typename Traits::ResScalar ResScalar;
206  enum {
207  k_peeling = 8,
208  k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
209  k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
210  };
211 
212  // ---- 1st level of blocking on L1, yields kc ----
213 
214  // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
215  // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within the L1 cache.
216  // We also include a register-level block of the result (mr x nr).
217  // (In an ideal world only the lhs panel would stay in L1)
218  // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
219  const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
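 // (Worked example, assuming AVX+FMA doubles, i.e. mr=12, nr=4 and l1=32*1024:
 //  k_div = 12*8 + 4*8 = 128 and k_sub = 12*4*8 = 384, so
 //  max_kc = ((32768 - 384) / 128) & ~7 = 253 & ~7 = 248.)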
220  const Index old_k = k;
221  if(k>max_kc)
222  {
223  // We are really blocking on the third dimension:
224  // -> reduce blocking size to make sure the last block is as large as possible
225  // while keeping the same number of sweeps over the result.
226  k = (k%max_kc)==0 ? max_kc
227  : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
228 
229  eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
230  }
231 
232  // ---- 2nd level of blocking on max(L2,L3), yields nc ----
233 
234  // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
235  // actual_l2 = max(l2, l3/nb_core_sharing_l3)
236  // The number below is quite conservative: it is better to underestimate the cache size than to overestimate it.
237  // For instance, it corresponds to 6MB of L3 shared among 4 cores.
238  #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
239  const Index actual_l2 = l3;
240  #else
241  const Index actual_l2 = 1572864; // == 1.5 MB
242  #endif
243 
244  // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
245  // The second half is implicitly reserved to access the result and lhs coefficients.
246  // When k<max_kc, nc can grow arbitrarily. In practice, it seems to be fruitful
247  // to limit this growth: we bound nc to grow by a factor of 1.5.
248  // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
249  // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
250  Index max_nc;
251  const Index lhs_bytes = m * k * sizeof(LhsScalar);
252  const Index remaining_l1 = l1- k_sub - lhs_bytes;
253  if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
254  {
255  // L1 blocking
256  max_nc = remaining_l1 / (k*sizeof(RhsScalar));
257  }
258  else
259  {
260  // L2 blocking
261  max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
262  }
263  // WARNING Below, we assume that Traits::nr is a power of two.
264  Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
265  if(n>nc)
266  {
267  // We are really blocking over the columns:
268  // -> reduce blocking size to make sure the last block is as large as possible
269  // while keeping the same number of sweeps over the packed lhs.
270  // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
271  n = (n%nc)==0 ? nc
272  : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
273  }
274  else if(old_k==k)
275  {
276  // So far, no blocking at all, i.e., kc==k, and nc==n.
277  // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
278  // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
279  Index problem_size = k*n*sizeof(LhsScalar);
280  Index actual_lm = actual_l2;
281  Index max_mc = m;
282  if(problem_size<=1024)
283  {
284  // problem is small enough to keep in L1
285  // Let's choose m such that the lhs's block fits in 1/3 of L1
286  actual_lm = l1;
287  }
288  else if(l3!=0 && problem_size<=32768)
289  {
290  // we have both L2 and L3, and problem is small enough to be kept in L2
291  // Let's choose m such that the lhs's block fits in 1/3 of L2
292  actual_lm = l2;
293  max_mc = (numext::mini<Index>)(576,max_mc);
294  }
295  Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
296  if (mc > Traits::mr) mc -= mc % Traits::mr;
297  else if (mc==0) return;
298  m = (m%mc)==0 ? mc
299  : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
300  }
301  }
302 }
303 
304 template <typename Index>
305 inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
306 {
307 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
308  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
309  k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
310  m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
311  n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
312  return true;
313  }
314 #else
315  EIGEN_UNUSED_VARIABLE(k)
316  EIGEN_UNUSED_VARIABLE(m)
317  EIGEN_UNUSED_VARIABLE(n)
318 #endif
319  return false;
320 }
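// Example of the test hook above (the macro values are illustrative, not defaults):
// building with
//   -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZES=1
//   -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K=64
//   -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M=64
//   -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N=64
// caps k, m and n at 64 and makes computeProductBlockingSizes() below skip the heuristic.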
321 
338 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
339 void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
340 {
341  if (!useSpecificBlockingSizes(k, m, n)) {
342  evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
343  }
344 }
345 
346 template<typename LhsScalar, typename RhsScalar, typename Index>
347 inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
348 {
349  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
350 }
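// A minimal usage sketch (caller-side, the sizes are illustrative): ask the heuristic
// which panel sizes it would pick for a 2000 x 2000 x 2000 double product, single threaded.
//   Eigen::Index kc = 2000, mc = 2000, nc = 2000;
//   internal::computeProductBlockingSizes<double, double>(kc, mc, nc);
//   // kc, mc and nc now hold the depth/row/column blocking sizes used by gebp_kernel.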
351 
352 template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
353 struct RhsPanelHelper {
354  private:
355  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
356  public:
357  typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
358 };
359 
360 template <typename Packet>
361 struct QuadPacket
362 {
363  Packet B_0, B1, B2, B3;
364  const Packet& get(const FixedInt<0>&) const { return B_0; }
365  const Packet& get(const FixedInt<1>&) const { return B1; }
366  const Packet& get(const FixedInt<2>&) const { return B2; }
367  const Packet& get(const FixedInt<3>&) const { return B3; }
368 };
369 
370 template <int N, typename T1, typename T2, typename T3>
371 struct packet_conditional { typedef T3 type; };
372 
373 template <typename T1, typename T2, typename T3>
374 struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
375 
376 template <typename T1, typename T2, typename T3>
377 struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
378 
379 #define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
380  typedef typename packet_conditional<packet_size, \
381  typename packet_traits<name ## Scalar>::type, \
382  typename packet_traits<name ## Scalar>::half, \
383  typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
384  prefix ## name ## Packet
385 
386 #define PACKET_DECL_COND(name, packet_size) \
387  typedef typename packet_conditional<packet_size, \
388  typename packet_traits<name ## Scalar>::type, \
389  typename packet_traits<name ## Scalar>::half, \
390  typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
391  name ## Packet
392 
393 #define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
394  typedef typename packet_conditional<packet_size, \
395  typename packet_traits<Scalar>::type, \
396  typename packet_traits<Scalar>::half, \
397  typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
398  prefix ## ScalarPacket
399 
400 #define PACKET_DECL_COND_SCALAR(packet_size) \
401  typedef typename packet_conditional<packet_size, \
402  typename packet_traits<Scalar>::type, \
403  typename packet_traits<Scalar>::half, \
404  typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
405  ScalarPacket
406 
407 /* Vectorization logic
408  * real*real: unpack rhs to constant packets, ...
409  *
410  * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
411  * storing each res packet into two packets (2x2),
412  * at the end combine them: swap the second and addsub them
413  * cf*cf : same but with 2x4 blocks
414  * cplx*real : unpack rhs to constant packets, ...
415  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
416  */
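// Worked example for the cd*cd case above, with a = a_r + i*a_i and b = b_r + i*b_i:
//   "first"  accumulates a * (b_r,b_r) = (a_r*b_r, a_i*b_r)
//   "second" accumulates a * (b_i,b_i) = (a_r*b_i, a_i*b_i)
// Swapping "second" gives (a_i*b_i, a_r*b_i); an addsub (subtract in the real lane, add
// in the imaginary lane) then yields (a_r*b_r - a_i*b_i, a_i*b_r + a_r*b_i), i.e. exactly
// a*b. This final combination is performed once per accumulator, in acc() below.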
417 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
418 class gebp_traits
419 {
420 public:
421  typedef _LhsScalar LhsScalar;
422  typedef _RhsScalar RhsScalar;
423  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
424 
425  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
426  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
427  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
428 
429  enum {
430  ConjLhs = _ConjLhs,
431  ConjRhs = _ConjRhs,
432  Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
433  LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
434  RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
435  ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
436 
437  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
438 
439  // register block size along the N direction must be 1 or 4
440  nr = 4,
441 
442  // register block size along the M direction (currently, this one cannot be modified)
443  default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
444 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
445  && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
446  // we assume 16 registers or more
447  // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
448  // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
449  // Bug 1515: MSVC prior to v19.14 yields register spilling.
450  mr = Vectorizable ? 3*LhsPacketSize : default_mr,
451 #else
452  mr = default_mr,
453 #endif
454 
455  LhsProgress = LhsPacketSize,
456  RhsProgress = 1
457  };
458 
459 
463  typedef LhsPacket LhsPacket4Packing;
464 
466  typedef ResPacket AccPacket;
467 
468  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
469  {
470  p = pset1<ResPacket>(ResScalar(0));
471  }
472 
473  template<typename RhsPacketType>
474  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
475  {
476  dest = pset1<RhsPacketType>(*b);
477  }
478 
479  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
480  {
481  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
482  }
483 
484  template<typename RhsPacketType>
485  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
486  {
487  loadRhs(b, dest);
488  }
489 
490  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
491  {
492  }
493 
494  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
495  {
496  dest = ploadquad<RhsPacket>(b);
497  }
498 
499  template<typename LhsPacketType>
500  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
501  {
502  dest = pload<LhsPacketType>(a);
503  }
504 
505  template<typename LhsPacketType>
506  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
507  {
508  dest = ploadu<LhsPacketType>(a);
509  }
510 
511  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
512  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
513  {
514  conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
515  // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
516  // let gcc allocate the register in which to store the result of the pmul
517  // (in the case where there is no FMA) gcc fails to figure out how to avoid
518  // spilling register.
519 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
520  EIGEN_UNUSED_VARIABLE(tmp);
521  c = cj.pmadd(a,b,c);
522 #else
523  tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
524 #endif
525  }
526 
527  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
528  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
529  {
530  madd(a, b.get(lane), c, tmp, lane);
531  }
532 
533  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
534  {
535  r = pmadd(c,alpha,r);
536  }
537 
538  template<typename ResPacketHalf>
539  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
540  {
541  r = pmadd(c,alpha,r);
542  }
543 
544 };
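// A minimal sketch of how the kernels further below drive these traits for a single
// mr x 1 block (illustrative only; blA/blB/r0/alpha mirror the names used in gebp_kernel):
//   AccPacket C0;        traits.initAcc(C0);
//   LhsPacket A0;        traits.loadLhs(blA, A0);       // aligned packed lhs coefficients
//   RhsPacket B_0, T0;   traits.loadRhs(blB, B_0);      // broadcast one rhs coefficient
//   traits.madd(A0, B_0, C0, T0, fix<0>);               // C0 += A0 * B_0
//   ResPacket R = r0.template loadPacket<ResPacket>(0);
//   traits.acc(C0, pset1<ResPacket>(alpha), R);         // R += alpha * C0
//   r0.storePacket(0, R);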
545 
546 template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
547 class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
548 {
549 public:
550  typedef std::complex<RealScalar> LhsScalar;
551  typedef RealScalar RhsScalar;
552  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
553 
554  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
555  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
556  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
557 
558  enum {
559  ConjLhs = _ConjLhs,
560  ConjRhs = false,
561  Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
562  LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
563  RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
564  ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
565 
567  nr = 4,
568 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
569  // we assume 16 registers
570  mr = 3*LhsPacketSize,
571 #else
572  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
573 #endif
574 
575  LhsProgress = LhsPacketSize,
576  RhsProgress = 1
577  };
578 
582  typedef LhsPacket LhsPacket4Packing;
583 
585 
586  typedef ResPacket AccPacket;
587 
588  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
589  {
590  p = pset1<ResPacket>(ResScalar(0));
591  }
592 
593  template<typename RhsPacketType>
594  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
595  {
596  dest = pset1<RhsPacketType>(*b);
597  }
598 
599  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
600  {
601  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
602  }
603 
604  template<typename RhsPacketType>
605  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
606  {
607  loadRhs(b, dest);
608  }
609 
610  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
611  {}
612 
613  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
614  {
615  loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
616  }
617 
618  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
619  {
620  // FIXME we can do better!
621  // what we want here is a ploadheight
622  RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
623  dest = ploadquad<RhsPacket>(tmp);
624  }
625 
626  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
627  {
628  eigen_internal_assert(RhsPacketSize<=8);
629  dest = pset1<RhsPacket>(*b);
630  }
631 
632  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
633  {
634  dest = pload<LhsPacket>(a);
635  }
636 
637  template<typename LhsPacketType>
638  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
639  {
640  dest = ploadu<LhsPacketType>(a);
641  }
642 
643  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
644  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
645  {
646  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
647  }
648 
649  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
650  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
651  {
652 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
653  EIGEN_UNUSED_VARIABLE(tmp);
654  c.v = pmadd(a.v,b,c.v);
655 #else
656  tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
657 #endif
658  }
659 
660  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
661  {
662  c += a * b;
663  }
664 
665  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
666  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
667  {
668  madd(a, b.get(lane), c, tmp, lane);
669  }
670 
671  template <typename ResPacketType, typename AccPacketType>
672  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
673  {
674  conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
675  r = cj.pmadd(c,alpha,r);
676  }
677 
678 protected:
679 };
680 
681 template<typename Packet>
682 struct DoublePacket
683 {
684  Packet first;
685  Packet second;
686 };
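// In the complex*complex traits below, "first" accumulates the products with the broadcast
// real parts of the rhs and "second" the products with the broadcast imaginary parts;
// the two halves are only combined into actual complex values in acc().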
687 
688 template<typename Packet>
689 DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
690 {
691  DoublePacket<Packet> res;
692  res.first = padd(a.first, b.first);
693  res.second = padd(a.second,b.second);
694  return res;
695 }
696 
697 // note that for DoublePacket<RealPacket> the "4" in "downto4"
698 // corresponds to the number of complex numbers, so it means "8"
699 // in terms of real coefficients.
700 
701 template<typename Packet>
705 {
706  return a;
707 }
708 
709 template<typename Packet>
712  typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
713 {
714  // yes, that's pretty hackish :(
717  typedef typename packet_traits<Cplx>::type CplxPacket;
718  res.first = predux_half_dowto4(CplxPacket(a.first)).v;
719  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
720  return res;
721 }
722 
723 // same here, "quad" actually means "8" in terms of real coefficients
724 template<typename Scalar, typename RealPacket>
727 {
728  dest.first = pset1<RealPacket>(numext::real(*b));
729  dest.second = pset1<RealPacket>(numext::imag(*b));
730 }
731 
732 template<typename Scalar, typename RealPacket>
735 {
736  // yes, that's pretty hackish too :(
737  typedef typename NumTraits<Scalar>::Real RealScalar;
738  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
739  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
740  dest.first = ploadquad<RealPacket>(r);
741  dest.second = ploadquad<RealPacket>(i);
742 }
743 
744 
745 template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
747 };
748 // template<typename Packet>
749 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
750 // {
751 // DoublePacket<Packet> res;
752 // res.first = padd(a.first, b.first);
753 // res.second = padd(a.second,b.second);
754 // return res;
755 // }
756 
757 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
758 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
759 {
760 public:
761  typedef std::complex<RealScalar> Scalar;
762  typedef std::complex<RealScalar> LhsScalar;
763  typedef std::complex<RealScalar> RhsScalar;
764  typedef std::complex<RealScalar> ResScalar;
765 
766  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
767  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
768  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
769  PACKET_DECL_COND(Real, _PacketSize);
770  PACKET_DECL_COND_SCALAR(_PacketSize);
771 
772  enum {
773  ConjLhs = _ConjLhs,
774  ConjRhs = _ConjRhs,
777  ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
778  LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
779  RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
780  RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
781 
782  // FIXME: should depend on NumberOfRegisters
783  nr = 4,
784  mr = ResPacketSize,
785 
786  LhsProgress = ResPacketSize,
787  RhsProgress = 1
788  };
789 
791 
797 
798  // this actually holds 8 packets!
800 
801  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
802 
803  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
804  {
805  p.first = pset1<RealPacket>(RealScalar(0));
806  p.second = pset1<RealPacket>(RealScalar(0));
807  }
808 
809  // Scalar path
810  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
811  {
812  dest = pset1<ScalarPacket>(*b);
813  }
814 
815  // Vectorized path
816  template<typename RealPacketType>
817  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
818  {
819  dest.first = pset1<RealPacketType>(numext::real(*b));
820  dest.second = pset1<RealPacketType>(numext::imag(*b));
821  }
822 
823  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
824  {
825  loadRhs(b, dest.B_0);
826  loadRhs(b + 1, dest.B1);
827  loadRhs(b + 2, dest.B2);
828  loadRhs(b + 3, dest.B3);
829  }
830 
831  // Scalar path
832  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
833  {
834  loadRhs(b, dest);
835  }
836 
837  // Vectorized path
838  template<typename RealPacketType>
839  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
840  {
841  loadRhs(b, dest);
842  }
843 
844  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
845 
846  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
847  {
848  loadRhs(b,dest);
849  }
850  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
851  {
852  loadQuadToDoublePacket(b,dest);
853  }
854 
855  // nothing special here
856  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
857  {
858  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
859  }
860 
861  template<typename LhsPacketType>
862  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
863  {
864  dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
865  }
866 
867  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
870  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
871  {
872  c.first = padd(pmul(a,b.first), c.first);
873  c.second = padd(pmul(a,b.second),c.second);
874  }
875 
876  template<typename LaneIdType>
877  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
878  {
879  c = cj.pmadd(a,b,c);
880  }
881 
882  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
883  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
884  {
885  madd(a, b.get(lane), c, tmp, lane);
886  }
887 
888  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
889 
890  template<typename RealPacketType, typename ResPacketType>
891  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
892  {
893  // assemble c
894  ResPacketType tmp;
895  if((!ConjLhs)&&(!ConjRhs))
896  {
897  tmp = pcplxflip(pconj(ResPacketType(c.second)));
898  tmp = padd(ResPacketType(c.first),tmp);
899  }
900  else if((!ConjLhs)&&(ConjRhs))
901  {
902  tmp = pconj(pcplxflip(ResPacketType(c.second)));
903  tmp = padd(ResPacketType(c.first),tmp);
904  }
905  else if((ConjLhs)&&(!ConjRhs))
906  {
907  tmp = pcplxflip(ResPacketType(c.second));
908  tmp = padd(pconj(ResPacketType(c.first)),tmp);
909  }
910  else if((ConjLhs)&&(ConjRhs))
911  {
912  tmp = pcplxflip(ResPacketType(c.second));
913  tmp = psub(pconj(ResPacketType(c.first)),tmp);
914  }
915 
916  r = pmadd(tmp,alpha,r);
917  }
918 
919 protected:
920  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
921 };
922 
923 template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
924 class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
925 {
926 public:
927  typedef std::complex<RealScalar> Scalar;
928  typedef RealScalar LhsScalar;
929  typedef Scalar RhsScalar;
930  typedef Scalar ResScalar;
931 
932  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
933  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
934  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
935  PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
936  PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
937 
938 #undef PACKET_DECL_COND_SCALAR_PREFIX
939 #undef PACKET_DECL_COND_PREFIX
940 #undef PACKET_DECL_COND_SCALAR
941 #undef PACKET_DECL_COND
942 
943  enum {
944  ConjLhs = false,
945  ConjRhs = _ConjRhs,
948  LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
949  RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
950  ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
951 
953  // FIXME: should depend on NumberOfRegisters
954  nr = 4,
955  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
956 
957  LhsProgress = ResPacketSize,
958  RhsProgress = 1
959  };
960 
964  typedef LhsPacket LhsPacket4Packing;
966  typedef ResPacket AccPacket;
967 
968  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
969  {
970  p = pset1<ResPacket>(ResScalar(0));
971  }
972 
973  template<typename RhsPacketType>
974  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
975  {
976  dest = pset1<RhsPacketType>(*b);
977  }
978 
979  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
980  {
981  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
982  }
983 
984  template<typename RhsPacketType>
985  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
986  {
987  loadRhs(b, dest);
988  }
989 
990  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
991  {}
992 
993  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
994  {
995  dest = ploaddup<LhsPacket>(a);
996  }
997 
998  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
999  {
1000  dest = ploadquad<RhsPacket>(b);
1001  }
1002 
1003  template<typename LhsPacketType>
1004  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
1005  {
1006  dest = ploaddup<LhsPacketType>(a);
1007  }
1008 
1009  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
1010  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
1011  {
1012  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
1013  }
1014 
1015  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
1016  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
1017  {
1018 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1019  EIGEN_UNUSED_VARIABLE(tmp);
1020  c.v = pmadd(a,b.v,c.v);
1021 #else
1022  tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
1023 #endif
1024 
1025  }
1026 
1027  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
1028  {
1029  c += a * b;
1030  }
1031 
1032  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
1033  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
1034  {
1035  madd(a, b.get(lane), c, tmp, lane);
1036  }
1037 
1038  template <typename ResPacketType, typename AccPacketType>
1039  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
1040  {
1041  conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
1042  r = cj.pmadd(alpha,c,r);
1043  }
1044 
1045 protected:
1046 
1047 };
1048 
1049 /* optimized General packed Block * packed Panel product kernel
1050  *
1051  * Mixing type logic: C += A * B
1052  * | A | B | comments
1053  * |real |cplx | no vectorization yet, would require packing A with duplication
1054  * |cplx |real | easy vectorization
1055  */
1056 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1058 {
1062 
1063  typedef typename Traits::ResScalar ResScalar;
1064  typedef typename Traits::LhsPacket LhsPacket;
1065  typedef typename Traits::RhsPacket RhsPacket;
1066  typedef typename Traits::ResPacket ResPacket;
1067  typedef typename Traits::AccPacket AccPacket;
1069 
1071 
1073 
1079 
1084 
1089 
1090  typedef typename DataMapper::LinearMapper LinearMapper;
1091 
1092  enum {
1093  Vectorizable = Traits::Vectorizable,
1094  LhsProgress = Traits::LhsProgress,
1095  LhsProgressHalf = HalfTraits::LhsProgress,
1096  LhsProgressQuarter = QuarterTraits::LhsProgress,
1097  RhsProgress = Traits::RhsProgress,
1098  RhsProgressHalf = HalfTraits::RhsProgress,
1099  RhsProgressQuarter = QuarterTraits::RhsProgress,
1100  ResPacketSize = Traits::ResPacketSize
1101  };
1102 
1104  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1105  Index rows, Index depth, Index cols, ResScalar alpha,
1106  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
1107 };
1108 
1109 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
1112 {
1115 
1116  typedef typename Traits::ResScalar ResScalar;
1121 
1122  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1123  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1124  ResScalar alpha, SAccPacket &C0)
1125  {
1126  EIGEN_UNUSED_VARIABLE(res);
1127  EIGEN_UNUSED_VARIABLE(straits);
1128  EIGEN_UNUSED_VARIABLE(blA);
1129  EIGEN_UNUSED_VARIABLE(blB);
1130  EIGEN_UNUSED_VARIABLE(depth);
1131  EIGEN_UNUSED_VARIABLE(endk);
1134  EIGEN_UNUSED_VARIABLE(alpha);
1136  }
1137 };
1138 
1139 
1140 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1141 struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
1144 
1145  typedef typename Traits::ResScalar ResScalar;
1150 
1151  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1152  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1153  ResScalar alpha, SAccPacket &C0)
1154  {
1155  typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
1156  typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
1157  typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
1158  typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
1159 
1160  SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
1161  SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
1162 
1163  if (depth - endk > 0)
1164  {
1165  // We have to handle the last row(s) of the rhs, which
1166  // correspond to a half-packet
1167  SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
1168 
1169  for (Index kk = endk; kk < depth; kk++)
1170  {
1171  SLhsPacketQuarter a0;
1172  SRhsPacketQuarter b0;
1173  straits.loadLhsUnaligned(blB, a0);
1174  straits.loadRhs(blA, b0);
1175  straits.madd(a0,b0,c0,b0, fix<0>);
1176  blB += SwappedTraits::LhsProgress/4;
1177  blA += 1;
1178  }
1179  straits.acc(c0, alphav, R);
1180  }
1181  else
1182  {
1183  straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
1184  }
1185  res.scatterPacket(i, j2, R);
1186  }
1187 };
1188 
1189 template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1190 struct lhs_process_one_packet
1191 {
1192  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
1193 
1194  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1195  {
1196  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1197  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1198  traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
1199  traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
1200  traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1201  traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1202  traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1203  traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
1204  #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1205  __asm__ ("" : "+x,m" (*A0));
1206  #endif
1207  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1208  }
1209 
1210  EIGEN_STRONG_INLINE void operator()(
1211  const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
1212  Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
1213  int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
1214  {
1215  GEBPTraits traits;
1216 
1217  // loops on each largest micro horizontal panel of lhs
1218  // (LhsProgress x depth)
1219  for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
1220  {
1221  // loops on each largest micro vertical panel of rhs (depth * nr)
1222  for(Index j2=0; j2<packet_cols4; j2+=nr)
1223  {
1224  // We select a LhsProgress x nr micro block of res
1225  // which is entirely stored into 1 x nr registers.
1226 
1227  const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1228  prefetch(&blA[0]);
1229 
1230  // gets res block as register
1231  AccPacket C0, C1, C2, C3;
1232  traits.initAcc(C0);
1233  traits.initAcc(C1);
1234  traits.initAcc(C2);
1235  traits.initAcc(C3);
1236  // To improve instruction pipelining, let's double the accumulation registers:
1237  // even k will accumulate in C*, while odd k will accumulate in D*.
1238  // This trick is crucial to get good performance with FMA, otherwise it is
1239  // actually faster to perform separate MUL+ADD because of the naturally
1240  // better instruction-level parallelism.
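 // (Indicatively, an FMA has a latency of roughly 4-5 cycles: a single chain
 //  C0 = fma(A0,B,C0) can only retire one FMA per accumulator every ~4-5 cycles, whereas
 //  the independent C*/D* chains keep the FMA units busy. Exact figures vary per CPU.)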
1241  AccPacket D0, D1, D2, D3;
1242  traits.initAcc(D0);
1243  traits.initAcc(D1);
1244  traits.initAcc(D2);
1245  traits.initAcc(D3);
1246 
1247  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1248  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1249  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1250  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1251 
1252  r0.prefetch(prefetch_res_offset);
1253  r1.prefetch(prefetch_res_offset);
1254  r2.prefetch(prefetch_res_offset);
1255  r3.prefetch(prefetch_res_offset);
1256 
1257  // performs "inner" products
1258  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1259  prefetch(&blB[0]);
1260  LhsPacket A0, A1;
1261 
1262  for(Index k=0; k<peeled_kc; k+=pk)
1263  {
1264  EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
1265  RhsPacketx4 rhs_panel;
1266  RhsPacket T0;
1267 
1268  internal::prefetch(blB+(48+0));
1269  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1270  peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1271  peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1272  peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1273  internal::prefetch(blB+(48+16));
1274  peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1275  peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1276  peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1277  peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1278 
1279  blB += pk*4*RhsProgress;
1280  blA += pk*LhsProgress;
1281 
1282  EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
1283  }
1284  C0 = padd(C0,D0);
1285  C1 = padd(C1,D1);
1286  C2 = padd(C2,D2);
1287  C3 = padd(C3,D3);
1288 
1289  // process remaining peeled loop
1290  for(Index k=peeled_kc; k<depth; k++)
1291  {
1292  RhsPacketx4 rhs_panel;
1293  RhsPacket T0;
1294  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1295  blB += 4*RhsProgress;
1296  blA += LhsProgress;
1297  }
1298 
1299  ResPacket R0, R1;
1300  ResPacket alphav = pset1<ResPacket>(alpha);
1301 
1302  R0 = r0.template loadPacket<ResPacket>(0);
1303  R1 = r1.template loadPacket<ResPacket>(0);
1304  traits.acc(C0, alphav, R0);
1305  traits.acc(C1, alphav, R1);
1306  r0.storePacket(0, R0);
1307  r1.storePacket(0, R1);
1308 
1309  R0 = r2.template loadPacket<ResPacket>(0);
1310  R1 = r3.template loadPacket<ResPacket>(0);
1311  traits.acc(C2, alphav, R0);
1312  traits.acc(C3, alphav, R1);
1313  r2.storePacket(0, R0);
1314  r3.storePacket(0, R1);
1315  }
1316 
1317  // Deal with remaining columns of the rhs
1318  for(Index j2=packet_cols4; j2<cols; j2++)
1319  {
1320  // One column at a time
1321  const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1322  prefetch(&blA[0]);
1323 
1324  // gets res block as register
1325  AccPacket C0;
1326  traits.initAcc(C0);
1327 
1328  LinearMapper r0 = res.getLinearMapper(i, j2);
1329 
1330  // performs "inner" products
1331  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1332  LhsPacket A0;
1333 
1334  for(Index k= 0; k<peeled_kc; k+=pk)
1335  {
1336  EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
1337  RhsPacket B_0;
1338 
1339 #define EIGEN_GEBGP_ONESTEP(K) \
1340  do { \
1341  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1342  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1343  /* FIXME: why unaligned???? */ \
1344  traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
1345  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1346  traits.madd(A0, B_0, C0, B_0, fix<0>); \
1347  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1348  } while(false);
1349 
1350  EIGEN_GEBGP_ONESTEP(0);
1351  EIGEN_GEBGP_ONESTEP(1);
1352  EIGEN_GEBGP_ONESTEP(2);
1353  EIGEN_GEBGP_ONESTEP(3);
1354  EIGEN_GEBGP_ONESTEP(4);
1355  EIGEN_GEBGP_ONESTEP(5);
1356  EIGEN_GEBGP_ONESTEP(6);
1357  EIGEN_GEBGP_ONESTEP(7);
1358 
1359  blB += pk*RhsProgress;
1360  blA += pk*LhsProgress;
1361 
1362  EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
1363  }
1364 
1365  // process remaining peeled loop
1366  for(Index k=peeled_kc; k<depth; k++)
1367  {
1368  RhsPacket B_0;
1369  EIGEN_GEBGP_ONESTEP(0);
1370  blB += RhsProgress;
1371  blA += LhsProgress;
1372  }
1373 #undef EIGEN_GEBGP_ONESTEP
1374  ResPacket R0;
1375  ResPacket alphav = pset1<ResPacket>(alpha);
1376  R0 = r0.template loadPacket<ResPacket>(0);
1377  traits.acc(C0, alphav, R0);
1378  r0.storePacket(0, R0);
1379  }
1380  }
1381  }
1382 };
1383 
1384 template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1385 struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
1386 {
1387 
1388 EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1389  {
1390  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1391  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1392  traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
1393  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
1394  traits.madd(*A0, *B_0, *C0, *B_0);
1395  traits.madd(*A0, *B1, *C1, *B1);
1396  traits.madd(*A0, *B2, *C2, *B2);
1397  traits.madd(*A0, *B3, *C3, *B3);
1398  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1399  }
1400 };
1401 
1402 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1405  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1406  Index rows, Index depth, Index cols, ResScalar alpha,
1407  Index strideA, Index strideB, Index offsetA, Index offsetB)
1408  {
1409  Traits traits;
1410  SwappedTraits straits;
1411 
1412  if(strideA==-1) strideA = depth;
1413  if(strideB==-1) strideB = depth;
1415  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1416  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
1417  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
1418  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
1419  const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
1420  const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
1421  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
1422  const Index peeled_kc = depth & ~(pk-1);
1423  const int prefetch_res_offset = 32/sizeof(ResScalar);
1424 // const Index depth2 = depth & ~1;
1425 
1426  //---------- Process 3 * LhsProgress rows at once ----------
1427  // This corresponds to 3*LhsProgress x nr register blocks.
1428  // Usually, this makes sense only with FMA
1429  if(mr>=3*Traits::LhsProgress)
1430  {
1431  // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
1432  // and on each largest micro vertical panel of the rhs (depth * nr).
1433  // Blocking sizes, i.e., 'depth', have been computed so that the micro horizontal panel of the lhs fits in L1.
1434  // However, if depth is too small, we can extend the number of rows of these horizontal panels.
1435  // The actual number of rows is computed as follows:
1436  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1437  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1438  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1439  // or because we are testing specific blocking sizes.
1440  const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
1441  for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
1442  {
1443  const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
1444  for(Index j2=0; j2<packet_cols4; j2+=nr)
1445  {
1446  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1447  {
1448 
1449  // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
1450  // stored into 3 x nr registers.
1451 
1452  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
1453  prefetch(&blA[0]);
1454 
1455  // gets res block as register
1456  AccPacket C0, C1, C2, C3,
1457  C4, C5, C6, C7,
1458  C8, C9, C10, C11;
1459  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1460  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1461  traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
1462 
1463  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1464  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1465  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1466  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1467 
1468  r0.prefetch(0);
1469  r1.prefetch(0);
1470  r2.prefetch(0);
1471  r3.prefetch(0);
1472 
1473  // performs "inner" products
1474  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1475  prefetch(&blB[0]);
1476  LhsPacket A0, A1;
1477 
1478  for(Index k=0; k<peeled_kc; k+=pk)
1479  {
1480  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
1481  // 15 registers are taken (12 for acc, 2 for lhs).
1482  RhsPanel15 rhs_panel;
1483  RhsPacket T0;
1484  LhsPacket A2;
1485  #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
1486  // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1487  // without this workaround A0, A1, and A2 are loaded in the same register,
1488  // which is not good for pipelining
1489  #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1490  #else
1491  #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1492  #endif
1493 #define EIGEN_GEBP_ONESTEP(K) \
1494  do { \
1495  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1496  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1497  internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1498  if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1499  internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1500  } /* Bug 953 */ \
1501  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1502  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1503  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1504  EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1505  traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1506  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1507  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1508  traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1509  traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1510  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1511  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1512  traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1513  traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1514  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1515  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1516  traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1517  traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1518  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1519  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1520  traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1521  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1522  } while (false)
1523 
1524  internal::prefetch(blB);
1525  EIGEN_GEBP_ONESTEP(0);
1526  EIGEN_GEBP_ONESTEP(1);
1527  EIGEN_GEBP_ONESTEP(2);
1528  EIGEN_GEBP_ONESTEP(3);
1529  EIGEN_GEBP_ONESTEP(4);
1530  EIGEN_GEBP_ONESTEP(5);
1531  EIGEN_GEBP_ONESTEP(6);
1532  EIGEN_GEBP_ONESTEP(7);
1533 
1534  blB += pk*4*RhsProgress;
1535  blA += pk*3*Traits::LhsProgress;
1536 
1537  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1538  }
1539  // process remaining peeled loop
1540  for(Index k=peeled_kc; k<depth; k++)
1541  {
1542  RhsPanel15 rhs_panel;
1543  RhsPacket T0;
1544  LhsPacket A2;
1545  EIGEN_GEBP_ONESTEP(0);
1546  blB += 4*RhsProgress;
1547  blA += 3*Traits::LhsProgress;
1548  }
1549 
1550 #undef EIGEN_GEBP_ONESTEP
1551 
1552  ResPacket R0, R1, R2;
1553  ResPacket alphav = pset1<ResPacket>(alpha);
1554 
1555  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1556  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1557  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1558  traits.acc(C0, alphav, R0);
1559  traits.acc(C4, alphav, R1);
1560  traits.acc(C8, alphav, R2);
1561  r0.storePacket(0 * Traits::ResPacketSize, R0);
1562  r0.storePacket(1 * Traits::ResPacketSize, R1);
1563  r0.storePacket(2 * Traits::ResPacketSize, R2);
1564 
1565  R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1566  R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1567  R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1568  traits.acc(C1, alphav, R0);
1569  traits.acc(C5, alphav, R1);
1570  traits.acc(C9, alphav, R2);
1571  r1.storePacket(0 * Traits::ResPacketSize, R0);
1572  r1.storePacket(1 * Traits::ResPacketSize, R1);
1573  r1.storePacket(2 * Traits::ResPacketSize, R2);
1574 
1575  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1576  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1577  R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1578  traits.acc(C2, alphav, R0);
1579  traits.acc(C6, alphav, R1);
1580  traits.acc(C10, alphav, R2);
1581  r2.storePacket(0 * Traits::ResPacketSize, R0);
1582  r2.storePacket(1 * Traits::ResPacketSize, R1);
1583  r2.storePacket(2 * Traits::ResPacketSize, R2);
1584 
1585  R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1586  R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1587  R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1588  traits.acc(C3, alphav, R0);
1589  traits.acc(C7, alphav, R1);
1590  traits.acc(C11, alphav, R2);
1591  r3.storePacket(0 * Traits::ResPacketSize, R0);
1592  r3.storePacket(1 * Traits::ResPacketSize, R1);
1593  r3.storePacket(2 * Traits::ResPacketSize, R2);
1594  }
1595  }
1596 
1597  // Deal with remaining columns of the rhs
1598  for(Index j2=packet_cols4; j2<cols; j2++)
1599  {
1600  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1601  {
1602  // One column at a time
1603  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1604  prefetch(&blA[0]);
1605 
1606  // gets res block as register
1607  AccPacket C0, C4, C8;
1608  traits.initAcc(C0);
1609  traits.initAcc(C4);
1610  traits.initAcc(C8);
1611 
1612  LinearMapper r0 = res.getLinearMapper(i, j2);
1613  r0.prefetch(0);
1614 
1615  // performs "inner" products
1616  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1617  LhsPacket A0, A1, A2;
1618 
1619  for(Index k=0; k<peeled_kc; k+=pk)
1620  {
1621  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1622  RhsPacket B_0;
1623 #define EIGEN_GEBGP_ONESTEP(K) \
1624  do { \
1625  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1626  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1627  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1628  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1629  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1630  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1631  traits.madd(A0, B_0, C0, B_0, fix<0>); \
1632  traits.madd(A1, B_0, C4, B_0, fix<0>); \
1633  traits.madd(A2, B_0, C8, B_0, fix<0>); \
1634  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1635  } while (false)
1636 
1637  EIGEN_GEBGP_ONESTEP(0);
1638  EIGEN_GEBGP_ONESTEP(1);
1639  EIGEN_GEBGP_ONESTEP(2);
1640  EIGEN_GEBGP_ONESTEP(3);
1641  EIGEN_GEBGP_ONESTEP(4);
1642  EIGEN_GEBGP_ONESTEP(5);
1643  EIGEN_GEBGP_ONESTEP(6);
1644  EIGEN_GEBGP_ONESTEP(7);
1645 
1646  blB += int(pk) * int(RhsProgress);
1647  blA += int(pk) * 3 * int(Traits::LhsProgress);
1648 
1649  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1650  }
1651 
1652  // process remaining peeled loop
1653  for(Index k=peeled_kc; k<depth; k++)
1654  {
1655  RhsPacket B_0;
1656  EIGEN_GEBGP_ONESTEP(0);
1657  blB += RhsProgress;
1658  blA += 3*Traits::LhsProgress;
1659  }
1660 #undef EIGEN_GEBGP_ONESTEP
1661  ResPacket R0, R1, R2;
1662  ResPacket alphav = pset1<ResPacket>(alpha);
1663 
1664  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1665  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1666  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1667  traits.acc(C0, alphav, R0);
1668  traits.acc(C4, alphav, R1);
1669  traits.acc(C8, alphav, R2);
1670  r0.storePacket(0 * Traits::ResPacketSize, R0);
1671  r0.storePacket(1 * Traits::ResPacketSize, R1);
1672  r0.storePacket(2 * Traits::ResPacketSize, R2);
1673  }
1674  }
1675  }
1676  }
1677 
1678  //---------- Process 2 * LhsProgress rows at once ----------
1679  if(mr>=2*Traits::LhsProgress)
1680  {
1681  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1682  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1683  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1684  // or because we are testing specific blocking sizes.
1685  Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
1686 
1687  for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1688  {
1689  Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1690  for(Index j2=0; j2<packet_cols4; j2+=nr)
1691  {
1692  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1693  {
1694 
1695  // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
1696  // stored into 2 x nr registers.
1697 
1698  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1699  prefetch(&blA[0]);
1700 
1701  // gets res block as register
1702  AccPacket C0, C1, C2, C3,
1703  C4, C5, C6, C7;
1704  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1705  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1706 
1707  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1708  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1709  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1710  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1711 
1712  r0.prefetch(prefetch_res_offset);
1713  r1.prefetch(prefetch_res_offset);
1714  r2.prefetch(prefetch_res_offset);
1715  r3.prefetch(prefetch_res_offset);
1716 
1717  // performs "inner" products
1718  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1719  prefetch(&blB[0]);
1720  LhsPacket A0, A1;
1721 
1722  for(Index k=0; k<peeled_kc; k+=pk)
1723  {
1724  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1725  RhsPacketx4 rhs_panel;
1726  RhsPacket T0;
1727 
1728  // NOTE: the begin/end asm comments below work around bug 935!
1729  // but they are not enough for gcc>=6 without FMA (bug 1637)
1730  #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1731  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1732  #else
1733  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1734  #endif
1735 #define EIGEN_GEBGP_ONESTEP(K) \
1736  do { \
1737  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1738  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1739  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1740  traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
1741  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1742  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1743  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1744  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1745  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1746  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1747  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1748  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1749  EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1750  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1751  } while (false)
1752 
1753  internal::prefetch(blB+(48+0));
1754  EIGEN_GEBGP_ONESTEP(0);
1755  EIGEN_GEBGP_ONESTEP(1);
1756  EIGEN_GEBGP_ONESTEP(2);
1757  EIGEN_GEBGP_ONESTEP(3);
1758  internal::prefetch(blB+(48+16));
1759  EIGEN_GEBGP_ONESTEP(4);
1760  EIGEN_GEBGP_ONESTEP(5);
1761  EIGEN_GEBGP_ONESTEP(6);
1762  EIGEN_GEBGP_ONESTEP(7);
1763 
1764  blB += pk*4*RhsProgress;
1765  blA += pk*(2*Traits::LhsProgress);
1766 
1767  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
1768  }
1769  // process remaining peeled loop
1770  for(Index k=peeled_kc; k<depth; k++)
1771  {
1772  RhsPacketx4 rhs_panel;
1773  RhsPacket T0;
1774  EIGEN_GEBGP_ONESTEP(0);
1775  blB += 4*RhsProgress;
1776  blA += 2*Traits::LhsProgress;
1777  }
1778 #undef EIGEN_GEBGP_ONESTEP
1779 
1780  ResPacket R0, R1, R2, R3;
1781  ResPacket alphav = pset1<ResPacket>(alpha);
1782 
1783  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1784  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1785  R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1786  R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1787  traits.acc(C0, alphav, R0);
1788  traits.acc(C4, alphav, R1);
1789  traits.acc(C1, alphav, R2);
1790  traits.acc(C5, alphav, R3);
1791  r0.storePacket(0 * Traits::ResPacketSize, R0);
1792  r0.storePacket(1 * Traits::ResPacketSize, R1);
1793  r1.storePacket(0 * Traits::ResPacketSize, R2);
1794  r1.storePacket(1 * Traits::ResPacketSize, R3);
1795 
1796  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1797  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1798  R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1799  R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1800  traits.acc(C2, alphav, R0);
1801  traits.acc(C6, alphav, R1);
1802  traits.acc(C3, alphav, R2);
1803  traits.acc(C7, alphav, R3);
1804  r2.storePacket(0 * Traits::ResPacketSize, R0);
1805  r2.storePacket(1 * Traits::ResPacketSize, R1);
1806  r3.storePacket(0 * Traits::ResPacketSize, R2);
1807  r3.storePacket(1 * Traits::ResPacketSize, R3);
1808  }
1809  }
1810 
1811  // Deal with remaining columns of the rhs
1812  for(Index j2=packet_cols4; j2<cols; j2++)
1813  {
1814  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1815  {
1816  // One column at a time
1817  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1818  prefetch(&blA[0]);
1819 
1820  // gets res block as register
1821  AccPacket C0, C4;
1822  traits.initAcc(C0);
1823  traits.initAcc(C4);
1824 
1825  LinearMapper r0 = res.getLinearMapper(i, j2);
1826  r0.prefetch(prefetch_res_offset);
1827 
1828  // performs "inner" products
1829  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1830  LhsPacket A0, A1;
1831 
1832  for(Index k=0; k<peeled_kc; k+=pk)
1833  {
1834  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
1835  RhsPacket B_0, B1;
1836 
1837 #define EIGEN_GEBGP_ONESTEP(K) \
1838  do { \
1839  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1840  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1841  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1842  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1843  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1844  traits.madd(A0, B_0, C0, B1, fix<0>); \
1845  traits.madd(A1, B_0, C4, B_0, fix<0>); \
1846  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1847  } while(false)
1848 
1849  EIGEN_GEBGP_ONESTEP(0);
1850  EIGEN_GEBGP_ONESTEP(1);
1851  EIGEN_GEBGP_ONESTEP(2);
1852  EIGEN_GEBGP_ONESTEP(3);
1853  EIGEN_GEBGP_ONESTEP(4);
1854  EIGEN_GEBGP_ONESTEP(5);
1855  EIGEN_GEBGP_ONESTEP(6);
1856  EIGEN_GEBGP_ONESTEP(7);
1857 
1858  blB += int(pk) * int(RhsProgress);
1859  blA += int(pk) * 2 * int(Traits::LhsProgress);
1860 
1861  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1862  }
1863 
1864  // process remaining peeled loop
1865  for(Index k=peeled_kc; k<depth; k++)
1866  {
1867  RhsPacket B_0, B1;
1868  EIGEN_GEBGP_ONESTEP(0);
1869  blB += RhsProgress;
1870  blA += 2*Traits::LhsProgress;
1871  }
1872 #undef EIGEN_GEBGP_ONESTEP
1873  ResPacket R0, R1;
1874  ResPacket alphav = pset1<ResPacket>(alpha);
1875 
1876  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1877  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1878  traits.acc(C0, alphav, R0);
1879  traits.acc(C4, alphav, R1);
1880  r0.storePacket(0 * Traits::ResPacketSize, R0);
1881  r0.storePacket(1 * Traits::ResPacketSize, R1);
1882  }
1883  }
1884  }
1885  }
1886  //---------- Process 1 * LhsProgress rows at once ----------
1887  if(mr>=1*Traits::LhsProgress)
1888  {
1890  p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1891  }
1892  //---------- Process LhsProgressHalf rows at once ----------
1893  if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
1894  {
1896  p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1897  }
1898  //---------- Process LhsProgressQuarter rows at once ----------
1899  if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
1900  {
1902  p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1903  }
1904  //---------- Process remaining rows, 1 at once ----------
1905  if(peeled_mc_quarter<rows)
1906  {
1907  // loop on each panel of the rhs
1908  for(Index j2=0; j2<packet_cols4; j2+=nr)
1909  {
1910  // loop on each row of the lhs (1*LhsProgress x depth)
1911  for(Index i=peeled_mc_quarter; i<rows; i+=1)
1912  {
1913  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1914  prefetch(&blA[0]);
1915  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1916 
1917  // If LhsProgress is 8 or 16, it assumes that there is a
1918  // half or quarter packet, respectively, of the same size as
1919  // nr (which is currently 4) for the return type.
1920  const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1921  const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
1922  if ((SwappedTraits::LhsProgress % 4) == 0 &&
1923  (SwappedTraits::LhsProgress<=16) &&
1924  (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
1925  (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
1926  {
1927  SAccPacket C0, C1, C2, C3;
1928  straits.initAcc(C0);
1929  straits.initAcc(C1);
1930  straits.initAcc(C2);
1931  straits.initAcc(C3);
1932 
1933  const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1934  const Index endk = (depth/spk)*spk;
1935  const Index endk4 = (depth/(spk*4))*(spk*4);
1936 
1937  Index k=0;
1938  for(; k<endk4; k+=4*spk)
1939  {
1940  SLhsPacket A0,A1;
1941  SRhsPacket B_0,B_1;
1942 
1943  straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1944  straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1945 
1946  straits.loadRhsQuad(blA+0*spk, B_0);
1947  straits.loadRhsQuad(blA+1*spk, B_1);
1948  straits.madd(A0,B_0,C0,B_0, fix<0>);
1949  straits.madd(A1,B_1,C1,B_1, fix<0>);
1950 
1951  straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1952  straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1953  straits.loadRhsQuad(blA+2*spk, B_0);
1954  straits.loadRhsQuad(blA+3*spk, B_1);
1955  straits.madd(A0,B_0,C2,B_0, fix<0>);
1956  straits.madd(A1,B_1,C3,B_1, fix<0>);
1957 
1958  blB += 4*SwappedTraits::LhsProgress;
1959  blA += 4*spk;
1960  }
1961  C0 = padd(padd(C0,C1),padd(C2,C3));
1962  for(; k<endk; k+=spk)
1963  {
1964  SLhsPacket A0;
1965  SRhsPacket B_0;
1966 
1967  straits.loadLhsUnaligned(blB, A0);
1968  straits.loadRhsQuad(blA, B_0);
1969  straits.madd(A0,B_0,C0,B_0, fix<0>);
1970 
1971  blB += SwappedTraits::LhsProgress;
1972  blA += spk;
1973  }
1974  if(SwappedTraits::LhsProgress==8)
1975  {
1976  // Special case where we have to first reduce the accumulation register C0
1977  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1978  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1979  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1980  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1981 
1982  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1983  SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1984 
1985  if(depth-endk>0)
1986  {
1987  // We have to handle the last row of the rhs which corresponds to a half-packet
1988  SLhsPacketHalf a0;
1989  SRhsPacketHalf b0;
1990  straits.loadLhsUnaligned(blB, a0);
1991  straits.loadRhs(blA, b0);
1992  SAccPacketHalf c0 = predux_half_dowto4(C0);
1993  straits.madd(a0,b0,c0,b0, fix<0>);
1994  straits.acc(c0, alphav, R);
1995  }
1996  else
1997  {
1998  straits.acc(predux_half_dowto4(C0), alphav, R);
1999  }
2000  res.scatterPacket(i, j2, R);
2001  }
2002  else if (SwappedTraits::LhsProgress==16)
2003  {
2004  // Special case where we have to first reduce the
2005  // accumulation register C0. We specialize the block in
2006  // template form, so that LhsProgress < 16 paths don't
2007  // fail to compile
2009  p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2010  }
2011  else
2012  {
2013  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2014  SResPacket alphav = pset1<SResPacket>(alpha);
2015  straits.acc(C0, alphav, R);
2016  res.scatterPacket(i, j2, R);
2017  }
2018  }
2019  else // scalar path
2020  {
2021  // get a 1 x 4 res block as registers
2022  ResScalar C0(0), C1(0), C2(0), C3(0);
2023 
2024  for(Index k=0; k<depth; k++)
2025  {
2026  LhsScalar A0;
2027  RhsScalar B_0, B_1;
2028 
2029  A0 = blA[k];
2030 
2031  B_0 = blB[0];
2032  B_1 = blB[1];
2033  C0 = cj.pmadd(A0,B_0,C0);
2034  C1 = cj.pmadd(A0,B_1,C1);
2035 
2036  B_0 = blB[2];
2037  B_1 = blB[3];
2038  C2 = cj.pmadd(A0,B_0,C2);
2039  C3 = cj.pmadd(A0,B_1,C3);
2040 
2041  blB += 4;
2042  }
2043  res(i, j2 + 0) += alpha * C0;
2044  res(i, j2 + 1) += alpha * C1;
2045  res(i, j2 + 2) += alpha * C2;
2046  res(i, j2 + 3) += alpha * C3;
2047  }
2048  }
2049  }
2050  // remaining columns
2051  for(Index j2=packet_cols4; j2<cols; j2++)
2052  {
2053  // loop on each row of the lhs (1*LhsProgress x depth)
2054  for(Index i=peeled_mc_quarter; i<rows; i+=1)
2055  {
2056  const LhsScalar* blA = &blockA[i*strideA+offsetA];
2057  prefetch(&blA[0]);
2058  // gets a 1 x 1 res block as registers
2059  ResScalar C0(0);
2060  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2061  for(Index k=0; k<depth; k++)
2062  {
2063  LhsScalar A0 = blA[k];
2064  RhsScalar B_0 = blB[k];
2065  C0 = cj.pmadd(A0, B_0, C0);
2066  }
2067  res(i, j2) += alpha * C0;
2068  }
2069  }
2070  }
2071  }
2072 
2073 
2074 // pack a block of the lhs
2075 // The traversal is as follows (mr==4):
2076 // 0 4 8 12 ...
2077 // 1 5 9 13 ...
2078 // 2 6 10 14 ...
2079 // 3 7 11 15 ...
2080 //
2081 // 16 20 24 28 ...
2082 // 17 21 25 29 ...
2083 // 18 22 26 30 ...
2084 // 19 23 27 31 ...
2085 //
2086 // 32 33 34 35 ...
2087 // 36 37 38 39 ...
2088 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2089 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2090 {
2091  typedef typename DataMapper::LinearMapper LinearMapper;
2092  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2093 };
2094 
2095 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2096 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2097   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2098 {
2099  typedef typename unpacket_traits<Packet>::half HalfPacket;
2100  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2101  enum { PacketSize = unpacket_traits<Packet>::size,
2102  HalfPacketSize = unpacket_traits<HalfPacket>::size,
2103  QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2104  HasHalf = (int)HalfPacketSize < (int)PacketSize,
2105  HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2106 
2107  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
2108  EIGEN_UNUSED_VARIABLE(stride);
2109  EIGEN_UNUSED_VARIABLE(offset);
2110  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2111  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
2112  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2113  Index count = 0;
2114 
2115  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
2116  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
2117  const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2118  const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
2119  const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
2120  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2121  const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
2122  : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
2123 
2124  Index i=0;
2125 
2126  // Pack 3 packets
2127  if(Pack1>=3*PacketSize)
2128  {
2129  for(; i<peeled_mc3; i+=3*PacketSize)
2130  {
2131  if(PanelMode) count += (3*PacketSize) * offset;
2132 
2133  for(Index k=0; k<depth; k++)
2134  {
2135  Packet A, B, C;
2136  A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2137  B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2138  C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
2139  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2140  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2141  pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
2142  }
2143  if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
2144  }
2145  }
2146  // Pack 2 packets
2147  if(Pack1>=2*PacketSize)
2148  {
2149  for(; i<peeled_mc2; i+=2*PacketSize)
2150  {
2151  if(PanelMode) count += (2*PacketSize) * offset;
2152 
2153  for(Index k=0; k<depth; k++)
2154  {
2155  Packet A, B;
2156  A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2157  B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2158  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2159  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2160  }
2161  if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
2162  }
2163  }
2164  // Pack 1 packets
2165  if(Pack1>=1*PacketSize)
2166  {
2167  for(; i<peeled_mc1; i+=1*PacketSize)
2168  {
2169  if(PanelMode) count += (1*PacketSize) * offset;
2170 
2171  for(Index k=0; k<depth; k++)
2172  {
2173  Packet A;
2174  A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2175  pstore(blockA+count, cj.pconj(A));
2176  count+=PacketSize;
2177  }
2178  if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
2179  }
2180  }
2181  // Pack half packets
2182  if(HasHalf && Pack1>=HalfPacketSize)
2183  {
2184  for(; i<peeled_mc_half; i+=HalfPacketSize)
2185  {
2186  if(PanelMode) count += (HalfPacketSize) * offset;
2187 
2188  for(Index k=0; k<depth; k++)
2189  {
2190  HalfPacket A;
2191  A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2192  pstoreu(blockA+count, cj.pconj(A));
2193  count+=HalfPacketSize;
2194  }
2195  if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
2196  }
2197  }
2198  // Pack quarter packets
2199  if(HasQuarter && Pack1>=QuarterPacketSize)
2200  {
2201  for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2202  {
2203  if(PanelMode) count += (QuarterPacketSize) * offset;
2204 
2205  for(Index k=0; k<depth; k++)
2206  {
2207  QuarterPacket A;
2208  A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2209  pstoreu(blockA+count, cj.pconj(A));
2210  count+=QuarterPacketSize;
2211  }
2212  if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
2213  }
2214  }
2215  // Pack2 may be *smaller* than PacketSize—that happens for
2216  // products like real * complex, where we have to go half the
2217  // progress on the lhs in order to duplicate those operands to
2218  // address both real & imaginary parts on the rhs. This portion will
2219  // pack those half ones until they match the number expected on the
2220  // last peeling loop at this point (for the rhs).
2221  if(Pack2<PacketSize && Pack2>1)
2222  {
2223  for(; i<peeled_mc0; i+=last_lhs_progress)
2224  {
2225  if(PanelMode) count += last_lhs_progress * offset;
2226 
2227  for(Index k=0; k<depth; k++)
2228  for(Index w=0; w<last_lhs_progress; w++)
2229  blockA[count++] = cj(lhs(i+w, k));
2230 
2231  if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
2232  }
2233  }
2234  // Pack scalars
2235  for(; i<rows; i++)
2236  {
2237  if(PanelMode) count += offset;
2238  for(Index k=0; k<depth; k++)
2239  blockA[count++] = cj(lhs(i, k));
2240  if(PanelMode) count += (stride-offset-depth);
2241  }
2242 }
2243 
2244 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2245 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2246 {
2247  typedef typename DataMapper::LinearMapper LinearMapper;
2248  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2249 };
2250 
2251 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2252 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2253   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2254 {
2255  typedef typename unpacket_traits<Packet>::half HalfPacket;
2256  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2257  enum { PacketSize = unpacket_traits<Packet>::size,
2258  HalfPacketSize = unpacket_traits<HalfPacket>::size,
2259  QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2260  HasHalf = (int)HalfPacketSize < (int)PacketSize,
2261  HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2262 
2263  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
2264  EIGEN_UNUSED_VARIABLE(stride);
2265  EIGEN_UNUSED_VARIABLE(offset);
2266  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2267  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2268  Index count = 0;
2269  bool gone_half = false, gone_quarter = false, gone_last = false;
2270 
2271  Index i = 0;
2272  int pack = Pack1;
2273  int psize = PacketSize;
2274  while(pack>0)
2275  {
2276  Index remaining_rows = rows-i;
2277  Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
2278  Index starting_pos = i;
2279  for(; i<peeled_mc; i+=pack)
2280  {
2281  if(PanelMode) count += pack * offset;
2282 
2283  Index k=0;
2284  if(pack>=psize && psize >= QuarterPacketSize)
2285  {
2286  const Index peeled_k = (depth/psize)*psize;
2287  for(; k<peeled_k; k+=psize)
2288  {
2289  for (Index m = 0; m < pack; m += psize)
2290  {
2291  if (psize == PacketSize) {
2292  PacketBlock<Packet> kernel;
2293  for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
2294  ptranspose(kernel);
2295  for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2296  } else if (HasHalf && psize == HalfPacketSize) {
2297  gone_half = true;
2298  PacketBlock<HalfPacket> kernel_half;
2299  for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
2300  ptranspose(kernel_half);
2301  for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
2302  } else if (HasQuarter && psize == QuarterPacketSize) {
2303  gone_quarter = true;
2304  PacketBlock<QuarterPacket> kernel_quarter;
2305  for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
2306  ptranspose(kernel_quarter);
2307  for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
2308  }
2309  }
2310  count += psize*pack;
2311  }
2312  }
2313 
2314  for(; k<depth; k++)
2315  {
2316  Index w=0;
2317  for(; w<pack-3; w+=4)
2318  {
2319  Scalar a(cj(lhs(i+w+0, k))),
2320  b(cj(lhs(i+w+1, k))),
2321  c(cj(lhs(i+w+2, k))),
2322  d(cj(lhs(i+w+3, k)));
2323  blockA[count++] = a;
2324  blockA[count++] = b;
2325  blockA[count++] = c;
2326  blockA[count++] = d;
2327  }
2328  if(pack%4)
2329  for(;w<pack;++w)
2330  blockA[count++] = cj(lhs(i+w, k));
2331  }
2332 
2333  if(PanelMode) count += pack * (stride-offset-depth);
2334  }
2335 
2336  pack -= psize;
2337  Index left = rows - i;
2338  if (pack <= 0) {
2339  if (!gone_last &&
2340  (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2341  ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2342  (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2343  psize /= 2;
2344  pack = psize;
2345  continue;
2346  }
2347  // Pack2 may be *smaller* than PacketSize—that happens for
2348  // products like real * complex, where we have to go half the
2349  // progress on the lhs in order to duplicate those operands to
2350  // address both real & imaginary parts on the rhs. This portion will
2351  // pack those half ones until they match the number expected on the
2352  // last peeling loop at this point (for the rhs).
2353  if (Pack2 < PacketSize && !gone_last) {
2354  gone_last = true;
2355  psize = pack = left & ~1;
2356  }
2357  }
2358  }
2359 
2360  for(; i<rows; i++)
2361  {
2362  if(PanelMode) count += offset;
2363  for(Index k=0; k<depth; k++)
2364  blockA[count++] = cj(lhs(i, k));
2365  if(PanelMode) count += (stride-offset-depth);
2366  }
2367 }
2368 
2369 // copy a complete panel of the rhs
2370 // this version is optimized for column major matrices
2371 // The traversal order is as follows (nr==4):
2372 // 0 1 2 3 12 13 14 15 24 27
2373 // 4 5 6 7 16 17 18 19 25 28
2374 // 8 9 10 11 20 21 22 23 26 29
2375 // . . . . . . . . . .
2376 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2377 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2378 {
2379  typedef typename packet_traits<Scalar>::type Packet;
2380  typedef typename DataMapper::LinearMapper LinearMapper;
2381  enum { PacketSize = packet_traits<Scalar>::size };
2382  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2383 };
2384 
2385 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2386 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2387   ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2388 {
2389  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
2390  EIGEN_UNUSED_VARIABLE(stride);
2391  EIGEN_UNUSED_VARIABLE(offset);
2392  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2393  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2394  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2395  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2396  Index count = 0;
2397  const Index peeled_k = (depth/PacketSize)*PacketSize;
2398 // if(nr>=8)
2399 // {
2400 // for(Index j2=0; j2<packet_cols8; j2+=8)
2401 // {
2402 // // skip what we have before
2403 // if(PanelMode) count += 8 * offset;
2404 // const Scalar* b0 = &rhs[(j2+0)*rhsStride];
2405 // const Scalar* b1 = &rhs[(j2+1)*rhsStride];
2406 // const Scalar* b2 = &rhs[(j2+2)*rhsStride];
2407 // const Scalar* b3 = &rhs[(j2+3)*rhsStride];
2408 // const Scalar* b4 = &rhs[(j2+4)*rhsStride];
2409 // const Scalar* b5 = &rhs[(j2+5)*rhsStride];
2410 // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
2411 // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
2412 // Index k=0;
2413 // if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
2414 // {
2415 // for(; k<peeled_k; k+=PacketSize) {
2416 // PacketBlock<Packet> kernel;
2417 // for (int p = 0; p < PacketSize; ++p) {
2418 // kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
2419 // }
2420 // ptranspose(kernel);
2421 // for (int p = 0; p < PacketSize; ++p) {
2422 // pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
2423 // count+=PacketSize;
2424 // }
2425 // }
2426 // }
2427 // for(; k<depth; k++)
2428 // {
2429 // blockB[count+0] = cj(b0[k]);
2430 // blockB[count+1] = cj(b1[k]);
2431 // blockB[count+2] = cj(b2[k]);
2432 // blockB[count+3] = cj(b3[k]);
2433 // blockB[count+4] = cj(b4[k]);
2434 // blockB[count+5] = cj(b5[k]);
2435 // blockB[count+6] = cj(b6[k]);
2436 // blockB[count+7] = cj(b7[k]);
2437 // count += 8;
2438 // }
2439 // // skip what we have after
2440 // if(PanelMode) count += 8 * (stride-offset-depth);
2441 // }
2442 // }
2443 
2444  if(nr>=4)
2445  {
2446  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2447  {
2448  // skip what we have before
2449  if(PanelMode) count += 4 * offset;
2450  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2451  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2452  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2453  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2454 
2455  Index k=0;
2456  if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
2457  {
2458  for(; k<peeled_k; k+=PacketSize) {
2459  PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
2460  kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
2461  kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
2462  kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
2463  kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
2464  ptranspose(kernel);
2465  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
2466  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
2467  pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
2468  pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
2469  count+=4*PacketSize;
2470  }
2471  }
2472  for(; k<depth; k++)
2473  {
2474  blockB[count+0] = cj(dm0(k));
2475  blockB[count+1] = cj(dm1(k));
2476  blockB[count+2] = cj(dm2(k));
2477  blockB[count+3] = cj(dm3(k));
2478  count += 4;
2479  }
2480  // skip what we have after
2481  if(PanelMode) count += 4 * (stride-offset-depth);
2482  }
2483  }
2484 
2485  // copy the remaining columns one at a time (nr==1)
2486  for(Index j2=packet_cols4; j2<cols; ++j2)
2487  {
2488  if(PanelMode) count += offset;
2489  const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2490  for(Index k=0; k<depth; k++)
2491  {
2492  blockB[count] = cj(dm0(k));
2493  count += 1;
2494  }
2495  if(PanelMode) count += (stride-offset-depth);
2496  }
2497 }
2498 
2499 // this version is optimized for row major matrices
2500 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2501 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2502 {
2503  typedef typename packet_traits<Scalar>::type Packet;
2504  typedef typename unpacket_traits<Packet>::half HalfPacket;
2505  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2506  typedef typename DataMapper::LinearMapper LinearMapper;
2507  enum { PacketSize = packet_traits<Scalar>::size,
2508  HalfPacketSize = unpacket_traits<HalfPacket>::size,
2509  QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
2510  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
2511  {
2512  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2513  EIGEN_UNUSED_VARIABLE(stride);
2514  EIGEN_UNUSED_VARIABLE(offset);
2515  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2516  const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
2517  const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
2518  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2519  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2520  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2521  Index count = 0;
2522 
2523  // if(nr>=8)
2524  // {
2525  // for(Index j2=0; j2<packet_cols8; j2+=8)
2526  // {
2527  // // skip what we have before
2528  // if(PanelMode) count += 8 * offset;
2529  // for(Index k=0; k<depth; k++)
2530  // {
2531  // if (PacketSize==8) {
2532  // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2533  // pstoreu(blockB+count, cj.pconj(A));
2534  // } else if (PacketSize==4) {
2535  // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2536  // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2537  // pstoreu(blockB+count, cj.pconj(A));
2538  // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2539  // } else {
2540  // const Scalar* b0 = &rhs[k*rhsStride + j2];
2541  // blockB[count+0] = cj(b0[0]);
2542  // blockB[count+1] = cj(b0[1]);
2543  // blockB[count+2] = cj(b0[2]);
2544  // blockB[count+3] = cj(b0[3]);
2545  // blockB[count+4] = cj(b0[4]);
2546  // blockB[count+5] = cj(b0[5]);
2547  // blockB[count+6] = cj(b0[6]);
2548  // blockB[count+7] = cj(b0[7]);
2549  // }
2550  // count += 8;
2551  // }
2552  // // skip what we have after
2553  // if(PanelMode) count += 8 * (stride-offset-depth);
2554  // }
2555  // }
2556  if(nr>=4)
2557  {
2558  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2559  {
2560  // skip what we have before
2561  if(PanelMode) count += 4 * offset;
2562  for(Index k=0; k<depth; k++)
2563  {
2564  if (PacketSize==4) {
2565  Packet A = rhs.template loadPacket<Packet>(k, j2);
2566  pstoreu(blockB+count, cj.pconj(A));
2567  count += PacketSize;
2568  } else if (HasHalf && HalfPacketSize==4) {
2569  HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
2570  pstoreu(blockB+count, cj.pconj(A));
2571  count += HalfPacketSize;
2572  } else if (HasQuarter && QuarterPacketSize==4) {
2573  QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
2574  pstoreu(blockB+count, cj.pconj(A));
2575  count += QuarterPacketSize;
2576  } else {
2577  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2578  blockB[count+0] = cj(dm0(0));
2579  blockB[count+1] = cj(dm0(1));
2580  blockB[count+2] = cj(dm0(2));
2581  blockB[count+3] = cj(dm0(3));
2582  count += 4;
2583  }
2584  }
2585  // skip what we have after
2586  if(PanelMode) count += 4 * (stride-offset-depth);
2587  }
2588  }
2589  // copy the remaining columns one at a time (nr==1)
2590  for(Index j2=packet_cols4; j2<cols; ++j2)
2591  {
2592  if(PanelMode) count += offset;
2593  for(Index k=0; k<depth; k++)
2594  {
2595  blockB[count] = cj(rhs(k, j2));
2596  count += 1;
2597  }
2598  if(PanelMode) count += stride-offset-depth;
2599  }
2600  }
2601 };
2602 
2603 } // end namespace internal
2604 
2607 inline std::ptrdiff_t l1CacheSize()
2608 {
2609  std::ptrdiff_t l1, l2, l3;
2610  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2611  return l1;
2612 }
2613 
2616 inline std::ptrdiff_t l2CacheSize()
2617 {
2618  std::ptrdiff_t l1, l2, l3;
2619  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2620  return l2;
2621 }
2622 
2626 inline std::ptrdiff_t l3CacheSize()
2627 {
2628  std::ptrdiff_t l1, l2, l3;
2629  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2630  return l3;
2631 }
2632 
2638 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
2639 {
2640  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
2641 }
2642 
2643 } // end namespace Eigen
2644 
2645 #endif // EIGEN_GENERAL_BLOCK_PANEL_H
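
Usage note (editorial sketch): the cache-size helpers declared near the end of this file, l1CacheSize(), l2CacheSize(), l3CacheSize() and setCpuCacheSizes(), are the public entry points behind the blocking heuristics used by this kernel. A minimal sketch of querying and overriding them is shown below; the byte values passed to setCpuCacheSizes() are illustrative placeholders, not tuned recommendations.

#include <Eigen/Core>
#include <iostream>

int main()
{
  // Query the cache sizes Eigen detected (or fell back to) for GEBP blocking.
  std::cout << "L1: " << Eigen::l1CacheSize()
            << "  L2: " << Eigen::l2CacheSize()
            << "  L3: " << Eigen::l3CacheSize() << " bytes\n";

  // Override them, e.g. when the runtime cache query is unreliable on the target.
  // The byte values below are illustrative placeholders only.
  Eigen::setCpuCacheSizes(64*1024, 1024*1024, 8*1024*1024);
  return 0;
}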