TensorContractionThreadPool.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
12 
13 // evaluator for thread pool device
14 #ifdef EIGEN_USE_THREADS
15 
16 namespace Eigen {
17 
18 #ifdef EIGEN_USE_SIMPLE_THREAD_POOL
19 namespace internal {
20 
21 template<typename LhsScalar, typename LhsMapper, typename Index>
22 struct packLhsArg {
23  LhsScalar* blockA;
24  const LhsMapper& lhs;
25  const Index m_start;
26  const Index k_start;
27  const Index mc;
28  const Index kc;
29 };
30 
31 template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
32 struct packRhsAndKernelArg {
33  const MaxSizeVector<LhsScalar*>* blockAs;
34  RhsScalar* blockB;
35  const RhsMapper& rhs;
36  OutputMapper& output;
37  const Index m;
38  const Index k;
39  const Index n;
40  const Index mc;
41  const Index kc;
42  const Index nc;
43  const Index num_threads;
44  const Index num_blockAs;
45  const Index max_m;
46  const Index k_block_idx;
47  const Index m_block_idx;
48  const Index n_block_idx;
49  const Index m_blocks;
50  const Index n_blocks;
51  MaxSizeVector<Notification*>* kernel_notifications;
52  const MaxSizeVector<Notification*>* lhs_notifications;
53  const bool need_to_pack;
54 };
55 
56 } // end namespace internal
57 #endif // EIGEN_USE_SIMPLE_THREAD_POOL
58 
59 template<typename Indices, typename LeftArgType, typename RightArgType>
60 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
61  public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
62 
63  typedef ThreadPoolDevice Device;
64 
65  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
66  typedef TensorContractionEvaluatorBase<Self> Base;
67 
68  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
69  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
70  typedef typename XprType::Index Index;
71  typedef typename XprType::CoeffReturnType CoeffReturnType;
72  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
73 
74  enum {
75  Layout = TensorEvaluator<LeftArgType, Device>::Layout,
76  };
77 
78  // Most of the code is assuming that both input tensors are ColMajor. If the
79  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
80  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
81  // will pretend B is LHS and A is RHS.
82  typedef typename internal::conditional<
83  static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
84  typedef typename internal::conditional<
85  static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
86 
87  static const int LDims =
88  internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
89  static const int RDims =
90  internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
91  static const int ContractDims = internal::array_size<Indices>::value;
92 
93  typedef array<Index, LDims> left_dim_mapper_t;
94  typedef array<Index, RDims> right_dim_mapper_t;
95 
96  typedef array<Index, ContractDims> contract_t;
97  typedef array<Index, LDims - ContractDims> left_nocontract_t;
98  typedef array<Index, RDims - ContractDims> right_nocontract_t;
99 
100  static const int NumDims = LDims + RDims - 2 * ContractDims;
101 
102  typedef DSizes<Index, NumDims> Dimensions;
103 
104  // typedefs needed in evalTo
105  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
106  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
107  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
108 
109  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
110  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
111 
112  TensorEvaluator(const XprType& op, const Device& device) :
113  Base(op, device) {}
114 
115 #ifndef EIGEN_USE_SIMPLE_THREAD_POOL
116  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
117  bool rhs_inner_dim_reordered, int Alignment>
118  void evalProduct(Scalar* buffer) const {
119  typedef internal::TensorContractionInputMapper<
120  LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
121  contract_t, internal::packet_traits<LhsScalar>::size,
122  lhs_inner_dim_contiguous, false, Unaligned>
123  LhsMapper;
124  typedef internal::TensorContractionInputMapper<
125  RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
126  contract_t, internal::packet_traits<RhsScalar>::size,
127  rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
128  RhsMapper;
129  typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
130  typedef internal::gemm_pack_lhs<LhsScalar, Index,
131  typename LhsMapper::SubMapper, Traits::mr,
132  Traits::LhsProgress, ColMajor>
133  LhsPacker;
134  typedef internal::gemm_pack_rhs<
135  RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
136  RhsPacker;
137  typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
138  Traits::mr, Traits::nr, false, false>
139  GebpKernel;
140 
141  const Index m = this->m_i_size;
142  const Index n = this->m_j_size;
143  const Index k = this->m_k_size;
144  if (m == 0 || n == 0 || k == 0) return;
145 
146  // Compute a set of algorithm parameters:
147  // - kernel block sizes (bm, bn, bk)
148  // - task grain sizes (number of kernels executed per task: gm, gn)
149  // - number of threads
150  // - sharding by row/column
151  // - parallel packing or first lhs then rhs
152  // and some derived parameters:
153  // - number of tasks (nm, nn, nk)
154  // - number of kernels (nm0, nn0)
155  // Unfortunately, all these parameters are tightly interdependent.
156  // So in some cases we first compute approximate values, then compute other
157  // values based on these approximations and then refine the approximations.
158 
159  // There are lots of heuristics here. There is some reasoning behind them,
160  // but ultimately they are just tuned on contraction benchmarks for
161  // different input configurations, thread counts and instruction sets.
162  // So feel free to question any of them.
163 
164  // Compute whether we want to shard by row or by column.
165  // This is a first approximation; it will be refined later. Since we don't
166  // know the number of threads yet, we use 2, because what we are most
167  // interested in at this point is whether it makes sense to use
168  // parallelization at all or not.
169  bool shard_by_col = shardByCol(m, n, 2);
170 
171  // First approximation of kernel blocking sizes.
172  // Again, we don't know the number of threads yet, so we use 2.
173  Index bm, bn, bk;
174  if (shard_by_col) {
175  internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
176  internal::ShardByCol>
177  blocking(k, m, n, 2);
178  bm = blocking.mc();
179  bn = blocking.nc();
180  bk = blocking.kc();
181  } else {
182  internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
183  internal::ShardByRow>
184  blocking(k, m, n, 2);
185  bm = blocking.mc();
186  bn = blocking.nc();
187  bk = blocking.kc();
188  }
189 
190  // Compute optimal number of threads.
191  // Note: we use bk instead of k here because we are interested in the amount
192  // of _parallelizable_ computation, and computations are not parallelizable
193  // across the k dimension.
194  const TensorOpCost cost =
195  contractionCost(m, n, bm, bn, bk, shard_by_col, false);
196  int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
197  static_cast<double>(n) * m, cost, this->m_device.numThreads());
198 
199  // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
200  // model is not tuned. Remove this when the cost model is tuned.
201  if (n == 1) num_threads = 1;
202 
203  if (num_threads == 1) {
204  // The single-threaded algorithm should be faster in this case.
205  if (n == 1)
206  this->template evalGemv<lhs_inner_dim_contiguous,
207  rhs_inner_dim_contiguous,
208  rhs_inner_dim_reordered, Alignment>(buffer);
209  else
210  this->template evalGemm<lhs_inner_dim_contiguous,
211  rhs_inner_dim_contiguous,
212  rhs_inner_dim_reordered, Alignment>(buffer);
213  return;
214  }
215 
216  // Now that we know the number of threads, recalculate sharding and blocking.
217  shard_by_col = shardByCol(m, n, num_threads);
218  if (shard_by_col) {
219  internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
220  internal::ShardByCol>
221  blocking(k, m, n, num_threads);
222  bm = blocking.mc();
223  bn = blocking.nc();
224  bk = blocking.kc();
225  } else {
226  internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
227  internal::ShardByRow>
228  blocking(k, m, n, num_threads);
229  bm = blocking.mc();
230  bn = blocking.nc();
231  bk = blocking.kc();
232  }
233 
234  // Number of kernels for each dimension.
235  Index nm0 = divup(m, bm);
236  Index nn0 = divup(n, bn);
237  Index nk = divup(k, bk);
238 
239  // Calculate task grain size (number of kernels executed per task).
240  // This task size coarsening serves two purposes:
241  // 1. It reduces per-task overheads including synchronization overheads.
242  // 2. It allows better cache use (reuse the same packed rhs in several
243  // consecutive kernels).
244  Index gm = 1;
245  Index gn = 1;
246  // If we are sharding by column, then we prefer to reduce rows first.
247  if (shard_by_col) {
248  gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
249  gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
250  } else {
251  gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
252  gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
253  }
254  // Number of tasks in each dimension.
255  Index nm = divup(nm0, gm);
256  Index nn = divup(nn0, gn);
257 
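 // Illustrative example of how these quantities relate (the block sizes here
 // are made-up numbers, not necessarily what TensorContractionBlocking picks):
 // for m = n = k = 1024 with bm = 64, bn = 128, bk = 256, we get
 // nm0 = divup(1024, 64) = 16, nn0 = divup(1024, 128) = 8, nk = divup(1024, 256) = 4
 // kernels per dimension; with grain sizes gm = 2 and gn = 1 this becomes
 // nm = divup(16, 2) = 8 and nn = divup(8, 1) = 8, i.e. 64 tasks per k slice.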
258  // Last but not least, decide whether we want to issue both lhs and rhs
259  // packing in parallel, or issue lhs packing first and then issue rhs
260  // packing when lhs packing completes (for !shard_by_col lhs and rhs are
261  // swapped). Parallel packing allows more parallelism (for both packing and
262  // kernels), while sequential packing provides better locality (once
263  // a thread finishes rhs packing it proceeds to kernels with that rhs).
264  // First, we are interested in parallel packing if there are few tasks.
265  bool parallel_pack = num_threads >= nm * nn;
266  // Also do parallel packing if all data fits into L2$.
267  if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
268  l2CacheSize() * num_threads)
269  parallel_pack = true;
270  // But don't do it if we will use each rhs only once. Locality seems to be
271  // more important in this case.
272  if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
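 // Numeric illustration of the L2 test above (the cache size is an assumption;
 // l2CacheSize() reports the real value at runtime): with float scalars,
 // m = n = 1024 and bk = 256, the packed data is 1024*256*4 = 1 MiB for lhs
 // plus 1 MiB for rhs. With 8 threads and a 256 KiB L2 per core the budget is
 // 2 MiB, so 2 MiB <= 2 MiB holds and parallel packing would be enabled.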
273 
274  LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
275  this->m_i_strides, this->m_left_contracting_strides,
276  this->m_k_strides);
277 
278  RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
279  this->m_j_strides, this->m_right_contracting_strides,
280  this->m_k_strides);
281 
282  Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
283  OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
284  k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
285  shard_by_col, parallel_pack)
286  .run();
287  }
288 
289  // Context coordinates a single parallel gemm operation.
290  template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
291  typename LhsMapper, typename RhsMapper, typename OutputMapper>
292  class Context {
293  public:
294  Context(const Device& device, int num_threads, LhsMapper& lhs,
295  RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
296  Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
297  Index gn, Index nm0, Index nn0, bool shard_by_col,
298  bool parallel_pack)
299  : device_(device),
300  lhs_(lhs),
301  rhs_(rhs),
302  buffer_(buffer),
303  output_(buffer, tm),
304  num_threads_(num_threads),
305  shard_by_col_(shard_by_col),
306  parallel_pack_(parallel_pack),
307  m_(tm),
308  n_(tn),
309  k_(tk),
310  bm_(bm),
311  bn_(bn),
312  bk_(bk),
313  nm_(nm),
314  nn_(nn),
315  nk_(nk),
316  gm_(gm),
317  gn_(gn),
318  nm0_(nm0),
319  nn0_(nn0)
320  {
321  for (Index x = 0; x < P; x++) {
322  // Normal number of notifications for k slice switch is
323  // nm_ + nn_ + nm_ * nn_. However, the first P - 1 slices will receive only
324  // nm_ + nn_ notifications, because they will not receive notifications
325  // from preceding kernels.
326  state_switch_[x] =
327  x == 0
328  ? 1
329  : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
330  (x == P - 1 ? nm_ * nn_ : 0);
331  state_packing_ready_[x] =
332  parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
333  state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
334  for (Index m = 0; m < nm_; m++) {
335  state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
336  // Kernels generally receive 3 notifications (previous kernel + 2
337  // packing), but the first slice won't get notifications from previous
338  // kernels.
339  for (Index n = 0; n < nn_; n++)
340  state_kernel_[x][m][n].store(
341  (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
342  std::memory_order_relaxed);
343  }
344  }
345 
346  // Allocate memory for packed rhs/lhs matrices.
347  size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
348  size_t lhs_size =
349  divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
350  size_t rhs_size =
351  divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
352  packed_mem_ = static_cast<char*>(internal::aligned_malloc(
353  (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
354  char* mem = static_cast<char*>(packed_mem_);
355  for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
356  packed_lhs_[x].resize(nm0_);
357  for (Index m = 0; m < nm0_; m++) {
358  packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
359  mem += lhs_size;
360  }
361  packed_rhs_[x].resize(nn0_);
362  for (Index n = 0; n < nn0_; n++) {
363  packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
364  mem += rhs_size;
365  }
366  }
367  }
368 
369  ~Context() {
370  for (Index x = 0; x < P; x++) {
371  for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
372  delete[] state_kernel_[x];
373  }
374  internal::aligned_free(packed_mem_);
375  }
376 
377  void run() {
378  // Kick off packing of the first slice.
379  signal_switch(0, 1);
380  // Wait for overall completion.
381  // TODO(dvyukov): this wait can lead to deadlock.
382  // If nthreads contractions are concurrently submitted from worker
383  // threads, this wait will block all worker threads and the system will
384  // deadlock.
385  done_.Wait();
386  }
387 
388  private:
389  Notification done_;
390  const Device& device_;
391  LhsMapper& lhs_;
392  RhsMapper& rhs_;
393  Scalar* const buffer_;
394  OutputMapper output_;
395  const int num_threads_;
396  const bool shard_by_col_;
397  const bool parallel_pack_;
398  // Matrix sizes.
399  const Index m_;
400  const Index n_;
401  const Index k_;
402  // Block sizes.
403  const Index bm_;
404  const Index bn_;
405  const Index bk_;
406  // Number of tasks.
407  const Index nm_;
408  const Index nn_;
409  const Index nk_;
410  // Task grain sizes (number of kernels executed per task).
411  const Index gm_;
412  const Index gn_;
413  // Number of blocks (this is different from nm_/nn_ because of task size
414  // coarsening).
415  const Index nm0_;
416  const Index nn0_;
417 
418  // Parallelization strategy.
419  //
420  // Blocks related to the same k block can run in parallel because they write
421  // to different output blocks. So we parallelize within k slices; this
422  // gives us a parallelism level of m x n. Before we can start any kernels
423  // related to the k-th slice, we need to issue m lhs packing tasks and n rhs
424  // packing tasks.
425  //
426  // However, there is a bottleneck when we are finishing kernels for k-th
427  // slice (at the very end there is only 1 runnable kernel). To mitigate this
428  // bottleneck we allow kernels from k-th and k+1-th slices to run in
429  // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
430  // output block, so they must not run in parallel.
431  //
432  // This gives us the following dependency graph.
433  // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs
434  // packing tasks.
435  // Kernel (m, n, k) can start when:
436  // - kernel (m, n, k-1) has finished
437  // - lhs packing (m, k) has finished
438  // - rhs packing (n, k) has finished
439  // Lhs/rhs packing can start when:
440  // - all k-1 packing has finished (artificially imposed to limit amount of
441  // parallel packing)
442  //
443  // On top of that we limit runnable tasks to two consecutive k slices.
444  // This is done to limit amount of memory we need for packed lhs/rhs
445  // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
446  //
447  // state_switch_ tracks when we are ready to switch to the next k slice.
448  // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
449  // These variables roll over 3 consecutive k slices: the first two we are
450  // actively executing, plus one to track completion of kernels in the second
451  // slice.
452  static const Index P = 3;
453  void* packed_mem_;
454  std::vector<LhsScalar*> packed_lhs_[P - 1];
455  std::vector<RhsScalar*> packed_rhs_[P - 1];
456  std::atomic<uint8_t>** state_kernel_[P];
457  // state_switch_ is frequently modified by worker threads, while other
458  // fields are read-only after constructor. Let's move it to a separate cache
459  // line to reduce cache-coherency traffic.
460  char pad_[128];
461  std::atomic<Index> state_packing_ready_[P];
462  std::atomic<Index> state_switch_[P];
463 
464  void pack_lhs(Index m, Index k) {
465  const Index mend = m * gm_ + gm(m);
466  for (Index m1 = m * gm_; m1 < mend; m1++)
467  LhsPacker()(packed_lhs_[k % (P - 1)][m1],
468  lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
469 
470  if (!parallel_pack_ && shard_by_col_) {
471  signal_packing(k);
472  } else {
473  signal_switch(k + 1);
474  for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
475  }
476  }
477 
478  void pack_rhs(Index n, Index k) {
479  const Index nend = n * gn_ + gn(n);
480  for (Index n1 = n * gn_; n1 < nend; n1++) {
481  if (k == 0) {
482  // Zero the output memory in parallel.
483  // On a 10000x2x10000 mm, zeroing can easily take half of the time.
484  // Zero (bn x m) row. Safe to do here because all kernels that will
485  // write to this memory depend on completion of this task.
486  // Note: don't call device_.memset() here. device_.memset() blocks on
487  // thread pool worker thread, which can lead to underutilization and
488  // deadlocks.
489  memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
490  }
491  RhsPacker()(packed_rhs_[k % (P - 1)][n1],
492  rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
493  }
494 
495  if (parallel_pack_ || shard_by_col_) {
496  signal_switch(k + 1);
497  for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
498  } else {
499  signal_packing(k);
500  }
501  }
502 
503  void kernel(Index m, Index n, Index k) {
504  // Note: order of iteration matters here. Iteration over m is innermost
505  // because we want to reuse the same packed rhs in consecutive tasks
506  // (rhs fits into L2$ while lhs only into L3$).
507  const Index nend = n * gn_ + gn(n);
508  const Index mend = m * gm_ + gm(m);
509  if (shard_by_col_) {
510  for (Index n1 = n * gn_; n1 < nend; n1++) {
511  for (Index m1 = m * gm_; m1 < mend; m1++)
512  GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
513  packed_lhs_[k % (P - 1)][m1],
514  packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
515  Scalar(1), -1, -1, 0, 0);
516  }
517  } else {
518  for (Index m1 = m * gm_; m1 < mend; m1++)
519  for (Index n1 = n * gn_; n1 < nend; n1++) {
520  GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
521  packed_lhs_[k % (P - 1)][m1],
522  packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
523  Scalar(1), -1, -1, 0, 0);
524  }
525  }
526  signal_kernel(m, n, k + 1, false);
527  signal_switch(k + 2);
528  }
529 
530  void signal_packing(Index k) {
531  eigen_assert(!parallel_pack_);
532  Index s = state_packing_ready_[k % P].fetch_sub(1);
533  eigen_assert(s > 0);
534  if (s != 1) return;
535  state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
536  enqueue_packing(k, shard_by_col_);
537  }
538 
539  void signal_kernel(Index m, Index n, Index k, bool sync) {
540  std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
541  Index s = state->load();
542  eigen_assert(s > 0);
543  if (s != 1 && state->fetch_sub(1) != 1) return;
544  state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
545  if (sync)
546  kernel(m, n, k);
547  else
548  device_.enqueueNoNotification([=]() { kernel(m, n, k); });
549  }
550 
551  void signal_switch(Index k, Index v = 1) {
552  Index s = state_switch_[k % P].fetch_sub(v);
553  eigen_assert(s >= v);
554  if (s != v) return;
555 
556  // Ready to switch to the next k slice.
557  // Reset counter for the next iteration.
558  state_switch_[k % P] =
559  (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
560  nm_ * nn_;
561  if (k < nk_) {
562  // Issue lhs/rhs packing. Their completion will in turn kick off
563  // kernels.
564  if (parallel_pack_) {
565  enqueue_packing(k, !shard_by_col_);
566  enqueue_packing(k, shard_by_col_);
567  } else if (shard_by_col_) {
568  enqueue_packing(k, false);
569  } else {
570  enqueue_packing(k, true);
571  }
572 
573  // Termination handling.
574  // Because kernel completion signals k + 2 switch, we need to finish nk
575  // + 2 slices without issuing any tasks on nk + 1 slice. So here we
576  // pretend that all nk + 1 packing tasks just finish instantly; so that
577  // nk + 2 switch only waits for completion of nk kernels.
578  } else if (k == nk_) {
579  signal_switch(k + 1,
580  parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
581  } else {
582  done_.Notify();
583  }
584  }
585 
586  // Enqueue all rhs/lhs packing for k-th slice.
587  void enqueue_packing(Index k, bool rhs) {
588  enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
589  }
590 
591  void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
592  if (end - start == 1) {
593  if (rhs)
594  pack_rhs(start, k);
595  else
596  pack_lhs(start, k);
597  } else {
598  Index mid = (start + end) / 2;
599  device_.enqueueNoNotification(
600  [=]() { enqueue_packing_helper(mid, end, k, rhs); });
601  device_.enqueueNoNotification(
602  [=]() { enqueue_packing_helper(start, mid, k, rhs); });
603  }
604  }
605 
606  // Block sizes with accounting for potentially incomplete last block.
607  Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
608  Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
609  Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
610  // Task grain sizes accounting for potentially incomplete last task.
611  Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
612  Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
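 // Worked example of the last-block formulas above (numbers are illustrative):
 // with m_ = 100 and bm_ = 32, nm0_ = divup(100, 32) = 4, so the last block has
 // bm(3) = 100 + 32 - 32*4 = 4 rows. Likewise, with nm0_ = 10 kernels and
 // gm_ = 3, nm_ = divup(10, 3) = 4 tasks and the last task covers
 // gm(3) = 10 + 3 - 3*4 = 1 kernel.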
613 
614  Context(const Context&) = delete;
615  void operator=(const Context&) = delete;
616  };
617 
618  // Decide whether we want to shard m x n contraction by columns or by rows.
619  static bool shardByCol(Index m, Index n, Index num_threads) {
620  // Note: we are comparing both n and m against Traits::nr; it is not
621  // a mistake. We are trying to figure out how both n and m will fit into
622  // the main sharding dimension.
623 
624  // Sharding by column is the default
625  // ... unless there is enough data for vectorization over rows
626  if (m / num_threads >= Traits::nr &&
627  // and not enough data for vectorization over columns
628  (n / num_threads < Traits::nr ||
629  // ... or barely enough data for vectorization over columns,
630  // but it is not evenly divisible across threads
631  (n / num_threads < 4 * Traits::nr &&
632  (n % (num_threads * Traits::nr)) != 0 &&
633  // ... and it is evenly divisible across threads for rows
634  ((m % (num_threads * Traits::nr)) == 0 ||
635  // ... or it is not evenly divisible for both dimensions but
636  // there is much more data over rows so that corner effects are
637  // mitigated.
638  (m / n >= 6)))))
639  return false;
640  // Also shard by row if the matrix is substantially elongated over the row
641  // dimension.
642  if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
643  return true;
644  }
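 // Example of the second test (assuming Traits::nr == 4 and 4 threads):
 // m = 4096, n = 64 falls through the first test, but n / num_threads = 16 is
 // less than 16 * 4 and m = 4096 > n * 32 = 2048, so the contraction is
 // sharded by row.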
645 
646  Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
647  int num_threads, bool shard_by_col) const {
648  Index gm = 1;
649  Index gm1 = 1;
650  Index nm0 = divup(m, bm);
651  Index nm1 = nm0;
652  for (;;) {
653  // Find the next candidate for m grain size. It needs to result in a
654  // different number of blocks. E.g., if we have 10 kernels, we want to try
655  // 5 and 10, but not 6, 7, 8 and 9.
656  while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
657  if (gm1 > nm0) break;
658  // Check the candidate.
659  int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
660  shard_by_col);
661  if (res < 0) break;
662  nm1 = divup(nm0, gm1);
663  if (res == 0) continue;
664  // Commit new grain size.
665  gm = gm1;
666  }
667  return gm;
668  }
669 
670  Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
671  int num_threads, bool shard_by_col) const {
672  Index gn = 1;
673  Index gn1 = 1;
674  Index nn0 = divup(n, bn);
675  Index nn1 = nn0;
676  for (;;) {
677  while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
678  if (gn1 > nn0) break;
679  int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
680  shard_by_col);
681  if (res < 0) break;
682  nn1 = divup(nn0, gn1);
683  if (res == 0) continue;
684  gn = gn1;
685  }
686  return gn;
687  }
688 
689  // checkGrain checks whether grain (gm, gn) is suitable and is better than
690  // (oldgm, oldgn).
691  int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
692  Index gn, Index oldgm, Index oldgn, int num_threads,
693  bool shard_by_col) const {
694  const TensorOpCost cost =
695  contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
696  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
697  static_cast<double>(bm) * gm * bn * gn, cost);
698  // If the task is too small, then we agree on it regardless of anything
699  // else. Otherwise synchronization overheads will dominate.
700  if (taskSize < 1) return 1;
701  // If it is too large, then we reject it and all larger tasks.
702  if (taskSize > 2) return -1;
703  // Now we are in presumably good task size range.
704  // The main deciding factor here is parallelism. Consider that we have 12
705  // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
706  // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4
707  // of cores will be busy), while grain size 3 gives us 4 tasks, which gives
708  // us parallelism of 1 (we can load all cores).
709  Index nm0 = divup(m, bm);
710  Index nn0 = divup(n, bn);
711  Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
712  double new_parallelism = static_cast<double>(new_tasks) /
713  (divup<int>(new_tasks, num_threads) * num_threads);
714  Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
715  double old_parallelism = static_cast<double>(old_tasks) /
716  (divup<int>(old_tasks, num_threads) * num_threads);
717  if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
718  return 0;
719  }
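 // Spelling out the parallelism calculation from the comment above
 // (12 kernels, 4 threads): grain 2 gives new_tasks = 6 and
 // 6 / (divup(6, 4) * 4) = 6 / 8 = 0.75; grain 4 gives 3 / (divup(3, 4) * 4)
 // = 3 / 4 = 0.75; grain 3 gives 4 / (divup(4, 4) * 4) = 4 / 4 = 1.0,
 // so grain 3 wins.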
720 
721 #else // EIGEN_USE_SIMPLE_THREAD_POOL
722 
723  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
724  void evalProduct(Scalar* buffer) const {
725  if (this->m_j_size == 1) {
726  this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
727  return;
728  }
729 
730  evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
731  }
732 
733  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
734  void evalGemm(Scalar* buffer) const {
735  // columns in left side, rows in right side
736  const Index k = this->m_k_size;
737 
738  // rows in left side
739  const Index m = this->m_i_size;
740 
741  // columns in right side
742  const Index n = this->m_j_size;
743 
744  // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
745  this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
746 
747 
748  const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
749  const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
750 
751  typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
752  LeftEvaluator, left_nocontract_t,
753  contract_t, lhs_packet_size,
754  lhs_inner_dim_contiguous,
755  false, Unaligned> LhsMapper;
756 
757  typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
758  RightEvaluator, right_nocontract_t,
759  contract_t, rhs_packet_size,
760  rhs_inner_dim_contiguous,
761  rhs_inner_dim_reordered, Unaligned> RhsMapper;
762 
763  typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
764 
765  // TODO: packing could be faster sometimes if we supported row major tensor mappers
766  typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
767  Traits::LhsProgress, ColMajor> LhsPacker;
768  typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
769 
770  // TODO: replace false, false with conjugate values?
771  typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
772  Traits::mr, Traits::nr, false, false> GebpKernel;
773 
774  typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
775  typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
776 
777  // initialize data mappers
778  LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
779  this->m_left_contracting_strides, this->m_k_strides);
780 
781  RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
782  this->m_right_contracting_strides, this->m_k_strides);
783 
784  OutputMapper output(buffer, m);
785 
786  // compute block sizes (which depend on number of threads)
787  const Index num_threads = this->m_device.numThreads();
788  internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
789  Index mc = blocking.mc();
790  Index nc = blocking.nc();
791  Index kc = blocking.kc();
792  eigen_assert(mc <= m);
793  eigen_assert(nc <= n);
794  eigen_assert(kc <= k);
795 
796 #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
797  const Index k_blocks = CEIL_DIV(k, kc);
798  const Index n_blocks = CEIL_DIV(n, nc);
799  const Index m_blocks = CEIL_DIV(m, mc);
800  const Index sizeA = mc * kc;
801  const Index sizeB = kc * nc;
802 
803  /* cout << "m: " << m << " n: " << n << " k: " << k << endl;
804  cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
805  cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
806  cout << "num threads: " << num_threads << endl;
807  */
808 
809  // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
810  // aren't 16 byte aligned segfaults will happen due to SIMD instructions
811  // note: You can get away with allocating just a single blockA and offsets and meet
812  // the alignment requirements with the assumption that
813  // (Traits::mr * sizeof(ResScalar)) % 16 == 0
814  const Index numBlockAs = numext::mini(num_threads, m_blocks);
815  MaxSizeVector<LhsScalar *> blockAs(num_threads);
816  for (int i = 0; i < num_threads; i++) {
817  blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
818  }
819 
820  // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
821  // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
822  // Other options: (1) reuse memory when a thread finishes. con: tricky
823  // (2) allocate block B memory in each thread. con: overhead
824  MaxSizeVector<RhsScalar *> blockBs(n_blocks);
825  for (int i = 0; i < n_blocks; i++) {
826  blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
827  }
828 
829  // lhs_notifications starts with all null Notifications
830  MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
831 
832  // this should really be numBlockAs * n_blocks;
833  const Index num_kernel_notifications = num_threads * n_blocks;
834  MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
835  nullptr);
836 
837  for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
838  const Index k_start = k_block_idx * kc;
839  // make sure we don't overshoot right edge of left matrix
840  const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
841 
842  for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
843  const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
844 
845  for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
846  const Index m_start = mt_block_idx * mc;
847  const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
848  eigen_assert(actual_mc > 0);
849 
850  Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
851 
852  for (int i = 0; i < n_blocks; ++i) {
853  Index notification_id = (blockAId * n_blocks + i);
854  // Wait for any current kernels using this slot to complete
855  // before using it.
856  if (kernel_notifications[notification_id]) {
857  wait_until_ready(kernel_notifications[notification_id]);
858  delete kernel_notifications[notification_id];
859  }
860  kernel_notifications[notification_id] = new Notification();
861  }
862  const packLArg arg = {
863  blockAs[blockAId], // blockA
864  lhs, // lhs
865  m_start, // m
866  k_start, // k
867  actual_mc, // mc
868  actual_kc, // kc
869  };
870 
871  // Delete any existing notification since we may be
872  // replacing it. The algorithm should ensure that there are
873  // no existing waiters on this notification.
874  delete lhs_notifications[blockAId];
875  lhs_notifications[blockAId] =
876  this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
877  }
878 
879  // now start kernels.
880  const Index m_base_start = m_block_idx * mc;
881  const bool need_to_pack = m_block_idx == 0;
882 
883  for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
884  const Index n_start = n_block_idx * nc;
885  const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
886 
887  // first make sure the previous kernels are all done before overwriting rhs. Also wait if
888  // we're going to start a new k. In both cases need_to_pack is true.
889  if (need_to_pack) {
890  for (Index i = num_blocks; i < num_threads; ++i) {
891  Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
892  Index future_id = (blockAId * n_blocks + n_block_idx);
893  wait_until_ready(kernel_notifications[future_id]);
894  }
895  }
896 
897  packRKArg arg = {
898  &blockAs, // blockA
899  blockBs[n_block_idx], // blockB
900  rhs, // rhs
901  output, // output
902  m_base_start, // m
903  k_start, // k
904  n_start, // n
905  mc, // mc
906  actual_kc, // kc
907  actual_nc, // nc
908  num_threads,
909  numBlockAs,
910  m,
911  k_block_idx,
912  m_block_idx,
913  n_block_idx, // n_block_idx
914  m_blocks, // m_blocks
915  n_blocks, // n_blocks
916  &kernel_notifications, // kernel notifications
917  &lhs_notifications, // lhs notifications
918  need_to_pack, // need_to_pack
919  };
920 
921  // We asynchronously kick off this function, which ends up
922  // notifying the appropriate kernel_notifications objects,
923  // which this thread waits on before exiting.
924  this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
925  }
926  }
927  }
928 
929  // Make sure all the kernels are done.
930  for (size_t i = 0; i < kernel_notifications.size(); ++i) {
931  wait_until_ready(kernel_notifications[i]);
932  delete kernel_notifications[i];
933  }
934 
935  // No need to wait for lhs notifications since they should have
936  // already been waited on. Just clean them up.
937  for (size_t i = 0; i < lhs_notifications.size(); ++i) {
938  delete lhs_notifications[i];
939  }
940 
941  // deallocate all of the memory for both A and B's
942  for (size_t i = 0; i < blockAs.size(); i++) {
943  this->m_device.deallocate(blockAs[i]);
944  }
945  for (size_t i = 0; i < blockBs.size(); i++) {
946  this->m_device.deallocate(blockBs[i]);
947  }
948 
949 #undef CEIL_DIV
950  }
951 
952  /*
953  * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
954  * the LHS block, check that all of the kernels that worked on the same
955  * mt_block_idx in the previous m_block are done.
956  */
957  template <typename packLArg, typename LhsPacker>
958  static void packLhs(const packLArg arg) {
959  // perform actual packing
960  LhsPacker pack_lhs;
961  pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
962  }
963 
964  /*
965  * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
966  * all kernels in the previous block are done.
967  * Then for each LHS future, we wait on the future and then call GEBP
968  * on the area packed by the future (which starts at
969  * blockA + future_idx * mt * kc) on the LHS and with the full packed
970  * RHS block.
971  * The output of this GEBP is written to output(m + i * mt, n).
972  */
973  template <typename packRKArg, typename RhsPacker, typename GebpKernel>
974  static void packRhsAndKernel(packRKArg arg) {
975  if (arg.need_to_pack) {
976  RhsPacker pack_rhs;
977  pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
978  }
979 
980  GebpKernel gebp;
981  for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
982  const Index m_base_start = arg.m + arg.mc*mt_block_idx;
983  if (m_base_start < arg.max_m) {
984  Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
985  wait_until_ready((*arg.lhs_notifications)[blockAId]);
986  const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
987  gebp(arg.output.getSubMapper(m_base_start, arg.n),
988  (*arg.blockAs)[blockAId], arg.blockB,
989  actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
990 
991  // Notify that the kernel is done.
992  const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
993  (*arg.kernel_notifications)[set_idx]->Notify();
994  }
995  }
996  }
997 #endif // EIGEN_USE_SIMPLE_THREAD_POOL
998 
999  TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
1000  bool shard_by_col, bool prepacked) const {
1001  const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
1002  PacketType<RhsScalar, Device>::size);
1003  const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
1004  const double kd = static_cast<double>(bk);
1005  // Peak VFMA bandwidth is 0.5. However, if we do not have enough data for
1006  // vectorization, bandwidth drops. The 4.0 and 2.0 bandwidths were determined
1007  // experimentally.
1008  double computeBandwidth = bk == 1 ? 4.0 :
1009  (shard_by_col ? bn : bm) < Traits::nr ||
1010  (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
1011 #ifndef EIGEN_VECTORIZE_FMA
1012  // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on the latest Intel processors.
1013  // However, for MULPS/ADDPS we have a dependent sequence of 2 such instructions,
1014  // so the overall bandwidth is 1.0.
1015  if (computeBandwidth == 0.5) computeBandwidth = 1.0;
1016 #endif
1017  // Computations.
1018  TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
1019  // Output stores.
1020  cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
1021  if (prepacked) {
1022  // Packing and kernels are executed in different tasks. When we calculate
1023  // task grain size we look only at kernel cost assuming that kernel
1024  // is more expensive than packing.
1025  return cost;
1026  }
1027  // Lhs/rhs loads + computations.
1028  TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
1029  TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
1030  // Lhs packing memory cost does not contribute considerably to overall
1031  // execution time because lhs is prefetched early and accessed sequentially.
1032  if (shard_by_col)
1033  lhsCost.dropMemoryCost();
1034  else
1035  rhsCost.dropMemoryCost();
1036  return cost + lhsCost + rhsCost;
1037  }
1038 };
1039 
1040 } // end namespace Eigen
1041 
1042 #endif // EIGEN_USE_THREADS
1043 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
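
For orientation, here is a minimal usage sketch (not part of the file above) showing how this evaluator gets exercised: contracting two 2-D float tensors on a ThreadPoolDevice. The tensor sizes and the thread count of 4 are arbitrary choices for illustration.

 #define EIGEN_USE_THREADS
 #include <unsupported/Eigen/CXX11/Tensor>

 int main() {
   Eigen::Tensor<float, 2> a(128, 64);
   Eigen::Tensor<float, 2> b(64, 256);
   Eigen::Tensor<float, 2> c(128, 256);
   a.setRandom();
   b.setRandom();

   // Contract the second dimension of a with the first dimension of b,
   // i.e. an ordinary matrix product.
   Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};

   // Assigning through a ThreadPoolDevice dispatches to the evaluator
   // defined in this header.
   Eigen::ThreadPool pool(4);
   Eigen::ThreadPoolDevice device(&pool, 4);
   c.device(device) = a.contract(b, dims);
   return 0;
 }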