tensor_benchmarks.h
#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_

typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

#include "unsupported/Eigen/CXX11/Tensor"
#include "benchmark.h"

#define BENCHMARK_RANGE(bench, lo, hi) \
  BENCHMARK(bench)->Range(lo, hi)
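// Illustrative only, not part of the original header: assuming benchmark.h
// provides a Google-benchmark-style BENCHMARK macro, a driver file would
// define a wrapper and register it over a range of problem sizes, e.g.
//   static void BM_memcpy(int iters, int N);
//   BENCHMARK_RANGE(BM_memcpy, 10, 5000);  // runs BM_memcpy with N in 10..5000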

using Eigen::Tensor;
using Eigen::TensorMap;

// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
template <typename Device, typename T> class BenchmarkSuite {
 public:
  BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
      : m_(m), k_(k), n_(n), device_(device) {
    initialize();
  }

  BenchmarkSuite(const Device& device, size_t m)
      : m_(m), k_(m), n_(m), device_(device) {
    initialize();
  }

  ~BenchmarkSuite() {
    device_.deallocate(a_);
    device_.deallocate(b_);
    device_.deallocate(c_);
  }

  void memcpy(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
    }
    // Record the number of values copied per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void typeCasting(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    if (sizeof(T) >= sizeof(int)) {
      sizes[0] = m_;
      sizes[1] = k_;
    } else {
      // Shrink the int view so it covers the same number of bytes as the T view.
      sizes[0] = m_ * sizeof(T) / sizeof(int);
      sizes[1] = k_ * sizeof(T) / sizeof(int);
    }
    const TensorMap<Tensor<int, 2>, Eigen::Aligned> A((int*)a_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.template cast<T>();
    }
    // Record the number of values copied per second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void random(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = C.random();
    }
    // Record the number of random numbers generated per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void slicing(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
    const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
    const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
    const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
    const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.slice(first_quadrant, quarter_sizes).device(device_) =
          A.slice(first_quadrant, quarter_sizes);
      C.slice(second_quadrant, quarter_sizes).device(device_) =
          B.slice(second_quadrant, quarter_sizes);
      C.slice(third_quadrant, quarter_sizes).device(device_) =
          A.slice(third_quadrant, quarter_sizes);
      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
          B.slice(fourth_quadrant, quarter_sizes);
    }
    // Record the number of values copied from the rhs slices to the lhs slices
    // each second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void rowChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1>, Eigen::Aligned> C(c_, output_size);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % k_, 0);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

  void colChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1>, Eigen::Aligned> C(c_, output_size);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % n_, 1);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

  void shuffling(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

    Eigen::array<int, 2> shuffle;
    shuffle[0] = 1;
    shuffle[1] = 0;

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.shuffle(shuffle);
    }
    // Record the number of values shuffled from A and copied to B each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void padding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_-3;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#if defined(EIGEN_HAS_INDEX_LIST)
    Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
                         Eigen::type2indexpair<2, 1> > paddings;
#else
    Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
    paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
    paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.pad(paddings);
    }
    // Record the number of values copied from the padded tensor A each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void striding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = m_;
    size_b[1] = k_/2;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 2> strides;
    strides[0] = 1;
    strides[1] = 2;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.stride(strides);
    }
    // Record the number of values copied from the strided tensor A each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void broadcasting(int num_iters) {
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = 1;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_c;
    size_c[0] = m_;
    size_c[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<int, 2> broadcast;
    broadcast[0] = 1;
    broadcast[1] = n_;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
    broadcast.set(1, n_);
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.broadcast(broadcast);
    }
    // Record the number of values broadcast from A and copied to C each second
    finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
  }

  void coeffWiseOp(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A * A.constant(static_cast<T>(3.14)) +
                          B * B.constant(static_cast<T>(2.7));
    }
    // Record the number of FLOPs executed per second (2 multiplications and
    // 1 addition per value)
    finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
  }

  void algebraicFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void transcendentalFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.exp() + B.log();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  // Row reduction
  void rowReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1>, Eigen::Aligned> C(c_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 0;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Column reduction
  void colReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = k_;
    TensorMap<Tensor<T, 1>, Eigen::Aligned> C(c_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 1;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Full reduction
  void fullReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 0> output_size;
    TensorMap<Tensor<T, 0>, Eigen::Aligned> C(c_, output_size);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Do a contraction which is equivalent to a matrix multiplication
  void contraction(int num_iters) {
    Eigen::array<TensorIndex, 2> sizeA;
    sizeA[0] = m_;
    sizeA[1] = k_;
    Eigen::array<TensorIndex, 2> sizeB;
    sizeB[0] = k_;
    sizeB[1] = n_;
    Eigen::array<TensorIndex, 2> sizeC;
    sizeC[0] = m_;
    sizeC[1] = n_;

    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);

    typedef typename Tensor<T, 2>::DimensionPair DimPair;
    Eigen::array<DimPair, 1> dims;
    dims[0] = DimPair(1, 0);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.contract(B, dims);
    }
    // Record the number of FLOPs executed per second (k_ multiplications and
    // k_ additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
  }
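
  // Illustrative note (not in the original source): DimPair(1, 0) contracts
  // dimension 1 of A (length k_) against dimension 0 of B (also length k_),
  // i.e. C(i, j) = sum over l of A(i, l) * B(l, j), an m_ x n_ matrix
  // product. That is where the 2 * m_ * n_ * k_ FLOP count above comes from.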

  void convolution(int num_iters, int kernel_x, int kernel_y) {
    Eigen::array<TensorIndex, 2> input_sizes;
    input_sizes[0] = m_;
    input_sizes[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
    Eigen::array<TensorIndex, 2> kernel_sizes;
    kernel_sizes[0] = kernel_x;
    kernel_sizes[1] = kernel_y;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
    Eigen::array<TensorIndex, 2> result_sizes;
    result_sizes[0] = m_ - kernel_x + 1;
    result_sizes[1] = n_ - kernel_y + 1;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
    Eigen::array<TensorIndex, 2> dims;
    dims[0] = 0;
    dims[1] = 1;

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.convolve(B, dims);
    }
    // Record the number of FLOPs executed per second (kernel_x * kernel_y
    // multiplications and additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) *
        (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
  }

 private:
  void initialize() {
    a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));

    // Initialize the content of the memory pools to prevent asan from
    // complaining.
    device_.memset(a_, 12, m_ * k_ * sizeof(T));
    device_.memset(b_, 23, k_ * n_ * sizeof(T));
    device_.memset(c_, 31, m_ * n_ * sizeof(T));

    //BenchmarkUseRealTime();
  }

  inline void finalizeBenchmark(int64_t num_items) {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
      device_.synchronize();
    }
#endif
    StopBenchmarkTiming();
    SetBenchmarkFlopsProcessed(num_items);
  }

  TensorIndex m_;
  TensorIndex k_;
  TensorIndex n_;
  T* a_;
  T* b_;
  T* c_;
  Device device_;
};

#endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
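
// Minimal usage sketch, not part of the original file: assuming the
// benchmark.h harness above and a single-threaded Eigen::DefaultDevice, a
// benchmark driver .cc would wrap a suite method and register it like so
// (the wrapper name BM_memcpy_float is illustrative):
//
//   static void BM_memcpy_float(int iters, int N) {
//     Eigen::DefaultDevice device;
//     BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, N);
//     suite.memcpy(iters);
//   }
//   BENCHMARK_RANGE(BM_memcpy_float, 10, 5000);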