tensor_contract_sycl_bench.cc
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016
5 // Mehdi Goli Codeplay Software Ltd.
6 // Ralph Potter Codeplay Software Ltd.
7 // Luke Iwanski Codeplay Software Ltd.
8 // Contact: <eigen@codeplay.com>
9 //
10 // This Source Code Form is subject to the terms of the Mozilla
11 // Public License v. 2.0. If a copy of the MPL was not distributed
12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
13 #ifndef EIGEN_BENCH_CONTRACT_SYCL
14 #define EIGEN_BENCH_CONTRACT_SYCL
15 #define EIGEN_TEST_NO_LONGDOUBLE
16 #define EIGEN_TEST_NO_COMPLEX
17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
18 #include <SYCL/sycl.hpp>
19 #include <fstream>
20 #include <iostream>
21 #include <chrono>
22 #include <ctime>
23 
24 #include <unsupported/Eigen/CXX11/Tensor>
25 
26 using Eigen::array;
27 using Eigen::SyclDevice;
28 using Eigen::Tensor;
29 using Eigen::TensorMap;
// Global result log. finalizeBenchmark() writes every result line to this
// stream in addition to std::cout. The stream is constructed with the relative
// file name "Result.txt".
30 std::ofstream out("Result.txt");
31 
// Returns the current time as reported by std::chrono::system_clock.
//
// The benchmarks call this immediately before and after the timed loop and
// pass both time points to finalizeBenchmark(), which subtracts them.
std::chrono::time_point<std::chrono::system_clock> get_time(){
  return std::chrono::system_clock::now();
}
36 
37 template<typename Start, typename End, typename TensorIndex>
38 void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
39 
40  std::chrono::duration<double> elapsed_seconds = end-start;
41  std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
42  static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
43  out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
44  static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
45 }
46 
47 // do a contraction which is equivalent to a matrix multiplication
// Benchmarks a rank-2 tensor contraction C = A * B on `device_` and reports
// the result through finalizeBenchmark() under the label "contraction".
//
// Template parameters:
//   T           - scalar element type of the three buffers.
//   Device      - device type; must provide allocate, memset, deallocate and
//                 synchronize, and be accepted by C.device().
//   TensorIndex - integer type of the sizes and of the iteration count.
//
// Parameters:
//   device_    - device that owns the buffers and runs the contraction.
//   num_iters  - number of timed contractions.
//   m_, k_, n_ - A is sized (m_, k_), B is (k_, n_) and C is (m_, n_).
//
// NOTE(review): the declarations of sizeA, sizeB, sizeC, C and dims are not
// visible in this listing (the embedded line numbers skip 63, 66, 69, 75 and
// 78); they appear to have been dropped when the page was extracted.
48 template<typename T, typename Device, typename TensorIndex>
49 void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
// Raw device pointers backing A, B and C.
50  T* a_;
51  T* b_;
52  T* c_;
// Allocation sizes are given in bytes.
53  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
54  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
55  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
56 
57  // Initialize the content of the memory pools to prevent asan from
58  // complaining.
59  device_.memset(a_, 12, m_ * k_ * sizeof(T));
60  device_.memset(b_, 23, k_ * n_ * sizeof(T));
61  device_.memset(c_, 31, m_ * n_ * sizeof(T));
62 
64  sizeA[0] = m_;
65  sizeA[1] = k_;
67  sizeB[0] = k_;
68  sizeB[1] = n_;
70  sizeC[0] = m_;
71  sizeC[1] = n_;
72 
// A and B are read-only views over the device buffers.
73  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
74  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
76 
77  typedef typename Tensor<T, 2>::DimensionPair DimPair;
// Pair dimension 1 of A (size k_) with dimension 0 of B (size k_).
79  dims[0] = DimPair(1, 0);
// Ten untimed warm-up runs, compiled in only when EIGEN_USE_SYCL is defined.
80 #ifdef EIGEN_USE_SYCL // warmup for sycl
81  for (int iter = 0; iter < 10; ++iter) {
82  C.device(device_) = A.contract(B, dims);
83  }
84 #endif
// Timed section: num_iters back-to-back contractions.
85  auto start = get_time();
86  for (int iter = 0; iter < num_iters; ++iter) {
87  C.device(device_) = A.contract(B, dims);
88  }
// NOTE(review): `end` is taken before device_.synchronize() is called below;
// if the device executes submitted work asynchronously, the measured interval
// may not cover all of the contractions - confirm against the device in use.
89  auto end = get_time();
90  // Record the number of FLOPs executed per second (size_ multiplications and
91  // additions for each value in the resulting tensor)
92  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
// NOTE(review): the buffers are released before the final synchronize();
// verify that this ordering is safe for the device implementation.
93  device_.deallocate(a_);
94  device_.deallocate(b_);
95  device_.deallocate(c_);
96  device_.synchronize();
97 }
98 
99 
100 
101 // do a contraction which is equivalent to a matrix multiplication
// Same benchmark as contraction(), reported under the label
// "contractionRowMajor": A is sized (m_, k_), B is (k_, n_), C is (m_, n_),
// and dimension 1 of A is paired with dimension 0 of B.
//
// Template parameters:
//   T           - scalar element type of the three buffers.
//   Device      - device type; must provide allocate, memset, deallocate and
//                 synchronize, and be accepted by C.device().
//   TensorIndex - integer type of the sizes and of the iteration count.
//
// Parameters:
//   device_    - device that owns the buffers and runs the contraction.
//   num_iters  - number of timed contractions.
//   m_, k_, n_ - problem sizes, see above.
//
// NOTE(review): the declarations of sizeA, sizeB, sizeC, the A/B/C tensor maps
// and dims are not visible in this listing (the embedded line numbers skip
// 117, 120, 123, 127-129 and 132). The row-major layout implied by the
// function name therefore cannot be confirmed from here.
102 template<typename T, typename Device, typename TensorIndex>
103 void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
// Raw device pointers backing A, B and C.
104  T* a_;
105  T* b_;
106  T* c_;
// Allocation sizes are given in bytes.
107  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
108  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
109  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
110 
111  // Initialize the content of the memory pools to prevent asan from
112  // complaining.
113  device_.memset(a_, 12, m_ * k_ * sizeof(T));
114  device_.memset(b_, 23, k_ * n_ * sizeof(T));
115  device_.memset(c_, 31, m_ * n_ * sizeof(T));
116 
118  sizeA[0] = m_;
119  sizeA[1] = k_;
121  sizeB[0] = k_;
122  sizeB[1] = n_;
124  sizeC[0] = m_;
125  sizeC[1] = n_;
126 
130 
131  typedef typename Tensor<T, 2>::DimensionPair DimPair;
// Pair dimension 1 of A (size k_) with dimension 0 of B (size k_).
133  dims[0] = DimPair(1, 0);
// Ten untimed warm-up runs, compiled in only when EIGEN_USE_SYCL is defined.
134 #ifdef EIGEN_USE_SYCL // warmup for sycl
135  for (int iter = 0; iter < 10; ++iter) {
136  C.device(device_) = A.contract(B, dims);
137  }
138 #endif
// Timed section: num_iters back-to-back contractions.
139  auto start = get_time();
140  for (int iter = 0; iter < num_iters; ++iter) {
141  C.device(device_) = A.contract(B, dims);
142  }
// NOTE(review): `end` is taken before device_.synchronize() is called below;
// if the device executes submitted work asynchronously, the measured interval
// may not cover all of the contractions - confirm against the device in use.
143  auto end = get_time();
144  // Record the number of FLOPs executed per second (size_ multiplications and
145  // additions for each value in the resulting tensor)
146  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
// NOTE(review): the buffers are released before the final synchronize();
// verify that this ordering is safe for the device implementation.
147  device_.deallocate(a_);
148  device_.deallocate(b_);
149  device_.deallocate(c_);
150  device_.synchronize();
151 }
152 
153 
// Benchmarks a contraction in which the A operand is stored transposed:
// A is sized (k_, m_), B is (k_, n_), C is (m_, n_), and dimension 0 of A is
// paired with dimension 0 of B. Results are reported under the label
// "contractionAT".
//
// Template parameters:
//   T           - scalar element type of the three buffers.
//   Device      - device type; must provide allocate, memset, deallocate and
//                 synchronize, and be accepted by C.device().
//   TensorIndex - integer type of the sizes and of the iteration count.
//
// Parameters:
//   device_    - device that owns the buffers and runs the contraction.
//   num_iters  - number of timed contractions.
//   m_, k_, n_ - problem sizes, see above.
//
// NOTE(review): the declarations of sizeA, sizeB, sizeC, the A/B/C tensor maps
// and dims are not visible in this listing (the embedded line numbers skip
// 168, 171, 174, 178-180 and 183), so the tensor layout cannot be confirmed
// from here.
154 template<typename T, typename Device, typename TensorIndex>
155 void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
// Raw device pointers backing A, B and C.
156  T* a_;
157  T* b_;
158  T* c_;
// Allocation sizes are given in bytes.
159  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
160  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
161  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
162 
163  // Initialize the content of the memory pools to prevent asan from
164  // complaining.
165  device_.memset(a_, 12, m_ * k_ * sizeof(T));
166  device_.memset(b_, 23, k_ * n_ * sizeof(T));
167  device_.memset(c_, 31, m_ * n_ * sizeof(T));
// A is described as (k_, m_), i.e. with its two extents swapped relative to
// contraction().
169  sizeA[0] = k_;
170  sizeA[1] = m_;
172  sizeB[0] = k_;
173  sizeB[1] = n_;
175  sizeC[0] = m_;
176  sizeC[1] = n_;
177 
181 
182  typedef typename Tensor<T, 2>::DimensionPair DimPair;
// Pair dimension 0 of A (size k_) with dimension 0 of B (size k_).
184  dims[0] = DimPair(0, 0);
// Ten untimed warm-up runs, compiled in only when EIGEN_USE_SYCL is defined.
185 #ifdef EIGEN_USE_SYCL // warmup for sycl
186  for (int iter = 0; iter < 10; ++iter) {
187  C.device(device_) = A.contract(B, dims);
188  }
189 #endif
// Timed section: num_iters back-to-back contractions.
190  auto start = get_time();
191  for (int iter = 0; iter < num_iters; ++iter) {
192  C.device(device_) = A.contract(B, dims);
193  }
// NOTE(review): `end` is taken before device_.synchronize() is called below;
// if the device executes submitted work asynchronously, the measured interval
// may not cover all of the contractions - confirm against the device in use.
194  auto end = get_time();
195  // Record the number of FLOPs executed per second (size_ multiplications and
196  // additions for each value in the resulting tensor)
197  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
// NOTE(review): the buffers are released before the final synchronize();
// verify that this ordering is safe for the device implementation.
198  device_.deallocate(a_);
199  device_.deallocate(b_);
200  device_.deallocate(c_);
201  device_.synchronize();
202 
203 }
204 
// Benchmarks a contraction in which the B operand is stored transposed:
// A is sized (m_, k_), B is (n_, k_), C is (m_, n_), and dimension 1 of A is
// paired with dimension 1 of B. Results are reported under the label
// "contractionBT".
//
// Template parameters:
//   T           - scalar element type of the three buffers.
//   Device      - device type; must provide allocate, memset, deallocate and
//                 synchronize, and be accepted by C.device().
//   TensorIndex - integer type of the sizes and of the iteration count.
//
// Parameters:
//   device_    - device that owns the buffers and runs the contraction.
//   num_iters  - number of timed contractions.
//   m_, k_, n_ - problem sizes, see above.
//
// NOTE(review): the declarations of sizeA, sizeB, sizeC, the A/B/C tensor maps
// and dims are not visible in this listing (the embedded line numbers skip
// 220, 223, 226, 230-232 and 235), so the tensor layout cannot be confirmed
// from here.
205 template<typename T, typename Device, typename TensorIndex>
206 void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
// Raw device pointers backing A, B and C.
207  T* a_;
208  T* b_;
209  T* c_;
// Allocation sizes are given in bytes.
210  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
211  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
212  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
213 
214  // Initialize the content of the memory pools to prevent asan from
215  // complaining.
216  device_.memset(a_, 12, m_ * k_ * sizeof(T));
217  device_.memset(b_, 23, k_ * n_ * sizeof(T));
218  device_.memset(c_, 31, m_ * n_ * sizeof(T));
219 
221  sizeA[0] = m_;
222  sizeA[1] = k_;
// B is described as (n_, k_), i.e. with its two extents swapped relative to
// contraction().
224  sizeB[0] = n_;
225  sizeB[1] = k_;
227  sizeC[0] = m_;
228  sizeC[1] = n_;
229 
233 
234  typedef typename Tensor<T, 2>::DimensionPair DimPair;
// Pair dimension 1 of A (size k_) with dimension 1 of B (size k_).
236  dims[0] = DimPair(1, 1);
// Ten untimed warm-up runs, compiled in only when EIGEN_USE_SYCL is defined.
237 #ifdef EIGEN_USE_SYCL // warmup for sycl
238  for (int iter = 0; iter < 10; ++iter) {
239  C.device(device_) = A.contract(B, dims);
240  }
241 #endif
// Timed section: num_iters back-to-back contractions.
242  auto start = get_time();
243  for (int iter = 0; iter < num_iters; ++iter) {
244  C.device(device_) = A.contract(B, dims);
245  }
// NOTE(review): `end` is taken before device_.synchronize() is called below;
// if the device executes submitted work asynchronously, the measured interval
// may not cover all of the contractions - confirm against the device in use.
246  auto end = get_time();
247  // Record the number of FLOPs executed per second (size_ multiplications and
248  // additions for each value in the resulting tensor)
249  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
// NOTE(review): the buffers are released before the final synchronize();
// verify that this ordering is safe for the device implementation.
250  device_.deallocate(a_);
251  device_.deallocate(b_);
252  device_.deallocate(c_);
253  device_.synchronize();
254 
255 }
256 
// Benchmarks a contraction in which both operands are stored transposed:
// A is sized (k_, m_), B is (n_, k_), C is (m_, n_), and dimension 0 of A is
// paired with dimension 1 of B. Results are reported under the label
// "contractionABT".
//
// Template parameters:
//   T           - scalar element type of the three buffers.
//   Device      - device type; must provide allocate, memset, deallocate and
//                 synchronize, and be accepted by C.device().
//   TensorIndex - integer type of the sizes and of the iteration count.
//
// Parameters:
//   device_    - device that owns the buffers and runs the contraction.
//   num_iters  - number of timed contractions.
//   m_, k_, n_ - problem sizes, see above.
//
// NOTE(review): the declarations of sizeA, sizeB, sizeC, the A/B/C tensor maps
// and dims are not visible in this listing (the embedded line numbers skip
// 272, 275, 278, 282-284 and 287), so the tensor layout cannot be confirmed
// from here.
257 template<typename T, typename Device, typename TensorIndex>
258 void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
// Raw device pointers backing A, B and C.
259  T* a_;
260  T* b_;
261  T* c_;
// Allocation sizes are given in bytes.
262  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
263  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
264  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
265 
266  // Initialize the content of the memory pools to prevent asan from
267  // complaining.
268  device_.memset(a_, 12, m_ * k_ * sizeof(T));
269  device_.memset(b_, 23, k_ * n_ * sizeof(T));
270  device_.memset(c_, 31, m_ * n_ * sizeof(T));
271 
// Both A (k_, m_) and B (n_, k_) have their extents swapped relative to
// contraction().
273  sizeA[0] = k_;
274  sizeA[1] = m_;
276  sizeB[0] = n_;
277  sizeB[1] = k_;
279  sizeC[0] = m_;
280  sizeC[1] = n_;
281 
285 
286  typedef typename Tensor<T, 2>::DimensionPair DimPair;
// Pair dimension 0 of A (size k_) with dimension 1 of B (size k_).
288  dims[0] = DimPair(0, 1);
// Ten untimed warm-up runs, compiled in only when EIGEN_USE_SYCL is defined.
289 #ifdef EIGEN_USE_SYCL // warmup for sycl
290  for (int iter = 0; iter < 10; ++iter) {
291  C.device(device_) = A.contract(B, dims);
292  }
293 #endif
// Timed section: num_iters back-to-back contractions.
294  auto start = get_time();
295  for (int iter = 0; iter < num_iters; ++iter) {
296  C.device(device_) = A.contract(B, dims);
297  }
// NOTE(review): `end` is taken before device_.synchronize() is called below;
// if the device executes submitted work asynchronously, the measured interval
// may not cover all of the contractions - confirm against the device in use.
298  auto end = get_time();
299  // Record the number of FLOPs executed per second (size_ multiplications and
300  // additions for each value in the resulting tensor)
301  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
// NOTE(review): the buffers are released before the final synchronize();
// verify that this ordering is safe for the device implementation.
302  device_.deallocate(a_);
303  device_.deallocate(b_);
304  device_.deallocate(c_);
305  device_.synchronize();
306 }
307 
308 int main() {
309  cl::sycl::gpu_selector selector;
310  Eigen::QueueInterface queue(selector);
311  Eigen::SyclDevice device(&queue);
312  int64_t num_iters =20;
313  for(int64_t m = 32; m <= 4096; m *= 2)
314  for(int64_t k = 32; k <= 4096; k *= 2)
315  for(int64_t n = 32; n <= 4096; n*= 2){
316  (contraction<float>(device, num_iters, m, k, n));
317  (contractionRowMajor<float>(device, num_iters, m, k, n));
318  (contractionAT<float>(device, num_iters, m, k, n));
319  (contractionBT<float>(device, num_iters, m, k, n));
320  (contractionABT<float>(device, num_iters, m, k, n));
321  }
322  return 0;
323  }
324 
325 #endif // EIGEN_BENCH_CONTRACT_SYCL
Matrix< SCALARB, Dynamic, Dynamic, opt_B > B
Definition: bench_gemm.cpp:49
Matrix3f m
int array[24]
std::chrono::time_point< std::chrono::system_clock > get_time()
std::ofstream out("Result.txt")
int n
void contractionBT(const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
void contractionAT(const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
iterator iter(handle obj)
Definition: pytypes.h:2273
void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_, TensorIndex num_iters, std::string name)
Matrix< SCALARA, Dynamic, Dynamic, opt_A > A
Definition: bench_gemm.cpp:48
A tensor expression mapping an existing array of data.
Tensor< float, 1 >::DimensionPair DimPair
signed __int64 int64_t
Definition: ms_stdint.h:94
Array< double, 1, 3 > e(1./3., 0.5, 2.)
void contractionRowMajor(const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
TensorDevice< TensorMap< PlainObjectType, Options_, MakePointer_ >, DeviceType > device(const DeviceType &dev)
Definition: TensorBase.h:1145
Matrix< Scalar, Dynamic, Dynamic > C
Definition: bench_gemm.cpp:50
static EIGEN_DEPRECATED const end_t end
void contraction(const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
Annotation for function names.
Definition: attr.h:48
int TensorIndex
void contractionABT(const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
The tensor class.
Definition: Tensor.h:63


gtsam
Author(s):
autogenerated on Tue Jul 4 2023 02:36:38