tensor_benchmarks.h
#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_

typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

#include "unsupported/Eigen/CXX11/Tensor"
#include "benchmark.h"

#define BENCHMARK_RANGE(bench, lo, hi) \
  BENCHMARK(bench)->Range(lo, hi)

using Eigen::Tensor;
using Eigen::TensorMap;

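// Illustrative sketch (added commentary, not part of the original header):
// benchmark drivers typically wrap one BenchmarkSuite method in a free
// function and register it with the BENCHMARK_RANGE macro above. The wrapper
// name BM_memcpy and the use of Eigen::DefaultDevice below are assumptions
// for illustration only; real drivers may use a different device type.
//
//   static void BM_memcpy(int iters, int N) {
//     StopBenchmarkTiming();  // exclude setup from the timed region
//     Eigen::DefaultDevice device;
//     BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, N);
//     suite.memcpy(iters);
//   }
//   BENCHMARK_RANGE(BM_memcpy, 10, 5000);
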
// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
template <typename Device, typename T> class BenchmarkSuite {
 public:
  BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
      : m_(m), k_(k), n_(n), device_(device) {
    initialize();
  }

  BenchmarkSuite(const Device& device, size_t m)
      : m_(m), k_(m), n_(m), device_(device) {
    initialize();
  }

  BenchmarkSuite(const Device& device, size_t m, size_t k)
      : m_(1), k_(k), n_(m), device_(device) {
    initialize();
  }

  ~BenchmarkSuite() {
    device_.deallocate(a_);
    device_.deallocate(b_);
    device_.deallocate(c_);
  }

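  // Note (added commentary, not in the original header): the constructors
  // above set the benchmark shape as m x k x n, as a cube m x m x m, and, for
  // the (m, k) overload, as 1 x k with n_ = m, which appears to be intended
  // for vector-times-matrix style contractions.
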
  void memcpy(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
    }
    // Record the number of values copied per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

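  // Note (added commentary): finalizeBenchmark() is handed the total number
  // of items processed, so the figure reported through
  // SetBenchmarkFlopsProcessed() is really "items per second" (here, values
  // copied per second) rather than floating-point operations.
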
  void typeCasting(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    if (sizeof(T) >= sizeof(int)) {
      sizes[0] = m_;
      sizes[1] = k_;
    } else {
      sizes[0] = m_ * sizeof(T) / sizeof(int);
      sizes[1] = k_ * sizeof(T) / sizeof(int);
    }
    const TensorMap<Tensor<int, 2>, Eigen::Aligned> A((int*)a_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.template cast<T>();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.template cast<T>();
    }
    // Record the number of values copied per second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

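  // Note (added commentary): when T is narrower than int, the sizes above are
  // scaled by sizeof(T) / sizeof(int), presumably so that the int-typed view
  // of a_ never exceeds the m_ * k_ * sizeof(T) bytes allocated for it in
  // initialize().
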
  void random(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = C.random();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = C.random();
    }
    // Record the number of random numbers generated per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void slicing(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
    const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
    const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
    const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
    const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.slice(first_quadrant, quarter_sizes).device(device_) =
          A.slice(first_quadrant, quarter_sizes);
      C.slice(second_quadrant, quarter_sizes).device(device_) =
          B.slice(second_quadrant, quarter_sizes);
      C.slice(third_quadrant, quarter_sizes).device(device_) =
          A.slice(third_quadrant, quarter_sizes);
      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
          B.slice(fourth_quadrant, quarter_sizes);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.slice(first_quadrant, quarter_sizes).device(device_) =
          A.slice(first_quadrant, quarter_sizes);
      C.slice(second_quadrant, quarter_sizes).device(device_) =
          B.slice(second_quadrant, quarter_sizes);
      C.slice(third_quadrant, quarter_sizes).device(device_) =
          A.slice(third_quadrant, quarter_sizes);
      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
          B.slice(fourth_quadrant, quarter_sizes);
    }
    // Record the number of values copied from the rhs slices to the lhs
    // slices per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void rowChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1>, Eigen::Aligned> C(c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.chip(iter % k_, 0);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % k_, 0);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

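  // Note (added commentary): chip(i, 0) on the k_ x n_ map B selects row i as
  // a 1-D tensor of length n_, so this function and colChip below measure the
  // cost of extracting a single row or column per iteration.
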
  void colChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1>, Eigen::Aligned> C(c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.chip(iter % n_, 1);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % n_, 1);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

  void shuffling(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

    Eigen::array<int, 2> shuffle;
    shuffle[0] = 1;
    shuffle[1] = 0;
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.shuffle(shuffle);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.shuffle(shuffle);
    }
    // Record the number of values shuffled from A and copied to B per second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

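  // Note (added commentary): the shuffle {1, 0} swaps the two dimensions, so
  // this benchmark is effectively a 2-D transpose of the m_ x k_ map A into
  // the k_ x m_ map B.
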
  void padding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_-3;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#if defined(EIGEN_HAS_INDEX_LIST)
    Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
                         Eigen::type2indexpair<2, 1> > paddings;
#else
    Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
    paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
    paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.pad(paddings);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.pad(paddings);
    }
    // Record the number of values copied from the padded tensor A per second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

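  // Note (added commentary): A is mapped as m_ x (k_-3) and padded by (0, 0)
  // in dimension 0 and (2, 1) in dimension 1, so the padded expression is
  // m_ x k_; this matches the k_ x m_ map B because m_ == k_ is asserted
  // above.
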
  void striding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = m_;
    size_b[1] = k_/2;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 2> strides;
    strides[0] = 1;
    strides[1] = 2;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
#endif

#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      B.device(device_) = A.stride(strides);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.stride(strides);
    }
    // Record the number of values copied from the strided tensor A per second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void broadcasting(int num_iters) {
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = 1;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_c;
    size_c[0] = m_;
    size_c[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<int, 2> broadcast;
    broadcast[0] = 1;
    broadcast[1] = n_;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
    broadcast.set(1, n_);
#endif

#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.broadcast(broadcast);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.broadcast(broadcast);
    }
    // Record the number of values broadcast from A and copied to C per second
    finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
  }

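  // Note (added commentary): A is an m_ x 1 column; broadcasting by {1, n_}
  // replicates that column n_ times, producing the m_ x n_ expression that is
  // assigned to C. The IndexList variant encodes the constant factor 1 at
  // compile time and keeps only the runtime factor n_ as a value.
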
  void coeffWiseOp(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A * A.constant(static_cast<T>(3.14)) +
                          B * B.constant(static_cast<T>(2.7));
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A * A.constant(static_cast<T>(3.14)) +
                          B * B.constant(static_cast<T>(2.7));
    }
    // Record the number of FLOPs executed per second (2 multiplications and
    // 1 addition per value)
    finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
  }

  void algebraicFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void transcendentalFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.exp() + B.log();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.exp() + B.log();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  // Row reduction
  void rowReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1>, Eigen::Aligned> C(c_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 0;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

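  // Note (added commentary): summing the k_ x n_ map B along dimension 0
  // collapses the k_ rows into a single length-n_ result; colReduction below
  // does the same along dimension 1. With EIGEN_HAS_INDEX_LIST the reduced
  // dimension is a compile-time constant, which gives the compiler more room
  // to optimize the reduction.
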
  // Column reduction
  void colReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = k_;
    TensorMap<Tensor<T, 1>, Eigen::Aligned> A(a_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 1;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
#endif
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      A.device(device_) = B.sum(sum_along_dim);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      A.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Full reduction
  void fullReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 0> output_size;
    TensorMap<Tensor<T, 0>, Eigen::Aligned> C(c_, output_size);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = B.sum();
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // do a contraction which is equivalent to a matrix multiplication
  void contraction(int num_iters) {
    contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
  }

  void contractionRowMajor(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
  }

  void contractionRowMajorAT(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
  }

  void contractionRowMajorBT(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
  }

  void contractionRowMajorABT(int num_iters) {
    contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
  }

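  // Note (added commentary): the two boolean arguments of the private helper
  // select whether A and/or B are treated as transposed ("AT", "BT", "ABT"),
  // and the template argument selects the storage order, so each wrapper above
  // benchmarks one GEMM-like variant of C = op(A) * op(B) with A of size
  // m_ x k_ and B of size k_ x n_.
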
  void convolution(int num_iters, int kernel_x, int kernel_y) {
    Eigen::array<TensorIndex, 2> input_sizes;
    input_sizes[0] = m_;
    input_sizes[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
    Eigen::array<TensorIndex, 2> kernel_sizes;
    kernel_sizes[0] = kernel_x;
    kernel_sizes[1] = kernel_y;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
    Eigen::array<TensorIndex, 2> result_sizes;
    result_sizes[0] = m_ - kernel_x + 1;
    result_sizes[1] = n_ - kernel_y + 1;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
    Eigen::array<TensorIndex, 2> dims;
    dims[0] = 0;
    dims[1] = 1;
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.convolve(B, dims);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.convolve(B, dims);
    }
    // Record the number of FLOPs executed per second (kernel_size
    // multiplications and additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) *
        (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
  }

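  // Note (added commentary): the FLOP count above charges each of the
  // (m_ - kernel_x + 1) * (n_ - kernel_y + 1) output values with one multiply
  // and one add per kernel tap, i.e. 2 * kernel_x * kernel_y operations.
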
 private:
  // do a contraction which is equivalent to a matrix multiplication
  template<int Layout>
  void contraction(int num_iters, bool trans_a, bool trans_b) {
    Eigen::array<TensorIndex, 2> sizeA;
    sizeA[0] = (trans_a ? k_ : m_);
    sizeA[1] = (trans_a ? m_ : k_);
    Eigen::array<TensorIndex, 2> sizeB;
    sizeB[0] = (trans_b ? n_ : k_);
    sizeB[1] = (trans_b ? k_ : n_);
    Eigen::array<TensorIndex, 2> sizeC;
    sizeC[0] = m_;
    sizeC[1] = n_;

    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
    TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);

    typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
    Eigen::array<DimPair, 1> dims;
    TensorIndex a_contract_dim = (trans_a ? 0 : 1);
    TensorIndex b_contract_dim = (trans_b ? 1 : 0);
    dims[0] = DimPair(a_contract_dim, b_contract_dim);
#ifdef EIGEN_USE_SYCL // warmup for sycl
    for (int iter = 0; iter < 10; ++iter) {
      C.device(device_) = A.contract(B, dims);
    }
#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.contract(B, dims);
    }
    // Record the number of FLOPs executed per second (k_ multiplications and
    // additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
  }

  void initialize() {
    a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));

    // Initialize the content of the memory pools to prevent asan from
    // complaining.
    device_.memset(a_, 12, m_ * k_ * sizeof(T));
    device_.memset(b_, 23, k_ * n_ * sizeof(T));
    device_.memset(c_, 31, m_ * n_ * sizeof(T));
  }

572 
573  inline void finalizeBenchmark(int64_t num_items) {
574 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
576  device_.synchronize();
577  }
578 #elif defined(EIGEN_USE_SYCL)
580  device_.synchronize();
581  }
582 
583 #endif
585  SetBenchmarkFlopsProcessed(num_items);
586  }
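  // Note (added commentary): on GPU and SYCL devices the queue is drained via
  // device_.synchronize() before the timer is stopped, so the measured time
  // covers kernel completion rather than just expression submission.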

  TensorIndex m_;
  TensorIndex k_;
  TensorIndex n_;
  T* a_;
  T* b_;
  T* c_;
  Device device_;
};
#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_