13 #ifndef EIGEN_BENCH_CONTRACT_SYCL
14 #define EIGEN_BENCH_CONTRACT_SYCL
15 #define EIGEN_TEST_NO_LONGDOUBLE
16 #define EIGEN_TEST_NO_COMPLEX
17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
18 #include <SYCL/sycl.hpp>
24 #include <unsupported/Eigen/CXX11/Tensor>
27 using Eigen::SyclDevice;
// File-scope output stream opened on "Result.txt". The benchmark report line
// that is printed to std::cout is also written to this stream (see the
// `out << ...` statement further down).
30 std::ofstream
out(
"Result.txt");
// Returns the current time as a std::chrono::system_clock time point.
// NOTE(review): system_clock can be adjusted while the program runs; if the
// returned values are subtracted to measure an interval (as the
// elapsed_seconds computation below suggests), std::chrono::steady_clock
// would be the safer clock — confirm before changing the return type.
32 std::chrono::time_point<std::chrono::system_clock>
get_time(){
// NOTE(review): `start` and `end` are declared here but the return statement
// does not use them; they look like leftovers that can be removed.
33 std::chrono::time_point<std::chrono::system_clock> start,
end;
34 return std::chrono::system_clock::now();
// Reporting code: turns a pair of time points into an elapsed time in seconds
// and emits one throughput line per benchmark run.
// Start / End are presumably the types of `start` and `end`, and TensorIndex
// the type of m_, n_ and k_ — NOTE(review): confirm against the signature.
37 template<
typename Start,
typename End,
typename TensorIndex>
// Elapsed time between the two time points, as a double number of seconds.
40 std::chrono::duration<double> elapsed_seconds =
end-start;
// Console report: kernel name, the three problem sizes, and the throughput.
// Throughput = (2 * m_ * n_ * k_ * num_iters) / seconds * 1e-9, labelled
// GFLOP/s. The leading static_cast<int64_t>(2) makes the product evaluate in
// 64-bit arithmetic for narrower integer operands. The quotient is narrowed
// to float before it is scaled by 1e-9.
// NOTE(review): the factor 2 presumably counts one multiply and one add per
// inner-product step — confirm.
// NOTE(review): the scale literal is split over two lines here (`1` then
// `e-9`); it has to read `1e-9` for the expression to compile.
41 std::cout <<
"Kernel Name : " <<
name <<
", M : " << m_ <<
", N : " << n_ <<
", K : " << k_ <<
" GFLOP/s : " <<
42 static_cast<float>((
static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1
e-9 <<
"\n";
// Same report line, written to the file-scope stream `out` ("Result.txt").
// NOTE(review): the expression duplicates the one above token for token;
// computing the value once and streaming it twice would keep them in sync.
43 out <<
"Kernel Name : " <<
name <<
", M : " << m_ <<
", N : " << n_ <<
", K : " << k_ <<
" GFLOP/s : " <<
44 static_cast<float>((
static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1
e-9 <<
"\n";
// Benchmark code, presumably the body of `contraction` (the first variant the
// driver loop calls) — NOTE(review): confirm against the signature.
// T is the element type of the three buffers; Device and TensorIndex are
// presumably the types of device_ and of the m_ / k_ / n_ extents.
48 template<
typename T,
typename Device,
typename TensorIndex>
// Allocate the device buffers: a_ holds m_*k_ elements of T, b_ holds k_*n_,
// c_ holds m_*n_. allocate() receives a byte count; its result is cast to T*.
53 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
54 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
55 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Fill each buffer over its full byte size with a constant (12, 23, 31).
// NOTE(review): presumably only to give the buffers defined contents; the
// particular values look arbitrary.
59 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
60 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
61 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// SYCL builds only: evaluate the contraction once ahead of the evaluation
// below, as a warm-up.
80 #ifdef EIGEN_USE_SYCL // warmup for sycl
82 C.device(device_) =
A.contract(
B, dims);
// Contract A with B over the index pairs in `dims` and evaluate the result
// into C on device_. A, B, C and dims are declared outside these lines.
// NOTE(review): presumably the timed evaluation — confirm.
87 C.device(device_) =
A.contract(
B, dims);
// Release the three device buffers, then synchronize the device.
// NOTE(review): synchronize() is called after the deallocations — confirm
// that this ordering is intended.
93 device_.deallocate(a_);
94 device_.deallocate(b_);
95 device_.deallocate(c_);
96 device_.synchronize();
// Benchmark code, presumably the body of `contractionRowMajor` (second variant
// called by the driver loop) — NOTE(review): confirm against the signature.
// NOTE(review): the allocation, fill, contraction and cleanup statements here
// match the other benchmark variants in this file token for token; whatever
// makes this variant different is carried by A, B, C and dims, which are set
// up outside these lines. The shared part is a candidate for a common helper.
102 template<
typename T,
typename Device,
typename TensorIndex>
// Device buffers sized in bytes: m_*k_, k_*n_ and m_*n_ elements of T.
107 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
108 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
109 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Constant fill (12, 23, 31) across the whole byte size of each buffer.
113 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
114 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
115 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// Warm-up evaluation, compiled only when EIGEN_USE_SYCL is defined.
134 #ifdef EIGEN_USE_SYCL // warmup for sycl
136 C.device(device_) =
A.contract(
B, dims);
// Contraction of A with B over `dims`, evaluated into C on device_.
// NOTE(review): presumably the timed evaluation — confirm.
141 C.device(device_) =
A.contract(
B, dims);
// Free the buffers, then synchronize the device.
// NOTE(review): confirm that synchronizing after deallocation is intended.
147 device_.deallocate(a_);
148 device_.deallocate(b_);
149 device_.deallocate(c_);
150 device_.synchronize();
// Benchmark code, presumably the body of `contractionAT` (third variant called
// by the driver loop) — NOTE(review): confirm against the signature.
// NOTE(review): the statements below are the same as in the other variants;
// the variant-specific index pairs / tensor setup (A, B, C, dims) are defined
// outside these lines.
154 template<
typename T,
typename Device,
typename TensorIndex>
// Allocate a_ (m_*k_ elements), b_ (k_*n_) and c_ (m_*n_); sizes are passed
// to allocate() in bytes and the returned pointer is cast to T*.
159 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
160 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
161 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Fill the buffers with the constants 12, 23 and 31 respectively, covering
// each buffer's full byte size.
165 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
166 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
167 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// One extra evaluation as warm-up when EIGEN_USE_SYCL is defined.
185 #ifdef EIGEN_USE_SYCL // warmup for sycl
187 C.device(device_) =
A.contract(
B, dims);
// Evaluate A.contract(B, dims) into C on device_.
// NOTE(review): presumably the timed evaluation — confirm.
192 C.device(device_) =
A.contract(
B, dims);
// Cleanup: deallocate all three buffers, then synchronize the device.
// NOTE(review): confirm the deallocate-before-synchronize order is intended.
198 device_.deallocate(a_);
199 device_.deallocate(b_);
200 device_.deallocate(c_);
201 device_.synchronize();
// Benchmark code, presumably the body of `contractionBT` (fourth variant
// called by the driver loop) — NOTE(review): confirm against the signature.
// NOTE(review): buffer handling and the contraction statements are identical
// to the other variants; the difference must come from how A, B, C and dims
// are set up outside these lines.
205 template<
typename T,
typename Device,
typename TensorIndex>
// Three device allocations, in bytes: m_*k_*sizeof(T) for a_,
// k_*n_*sizeof(T) for b_, m_*n_*sizeof(T) for c_.
210 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
211 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
212 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Give every buffer constant contents (12, 23, 31) over its whole byte size.
216 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
217 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
218 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// Warm-up evaluation for builds that define EIGEN_USE_SYCL.
237 #ifdef EIGEN_USE_SYCL // warmup for sycl
239 C.device(device_) =
A.contract(
B, dims);
// Contract A with B over `dims`; the result is evaluated into C on device_.
// NOTE(review): presumably the timed evaluation — confirm.
244 C.device(device_) =
A.contract(
B, dims);
// Return the buffers to the device allocator and synchronize afterwards.
// NOTE(review): confirm that synchronizing after deallocation is intended.
250 device_.deallocate(a_);
251 device_.deallocate(b_);
252 device_.deallocate(c_);
253 device_.synchronize();
// Benchmark code, presumably the body of `contractionABT` (fifth and last
// variant called by the driver loop) — NOTE(review): confirm against the
// signature.
// NOTE(review): as with the other variants, only A, B, C and dims (defined
// outside these lines) can distinguish this benchmark; the statements below
// are shared verbatim.
257 template<
typename T,
typename Device,
typename TensorIndex>
// Buffers for the two operands and the result: m_*k_, k_*n_ and m_*n_
// elements of T, requested from device_ as byte counts.
262 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
263 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
264 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Constant fill of all three buffers (values 12, 23, 31).
268 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
269 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
270 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// Only with EIGEN_USE_SYCL: run the contraction once as a warm-up.
289 #ifdef EIGEN_USE_SYCL // warmup for sycl
291 C.device(device_) =
A.contract(
B, dims);
// The contraction itself: A contracted with B over `dims`, written to C via
// device_. NOTE(review): presumably the timed evaluation — confirm.
296 C.device(device_) =
A.contract(
B, dims);
// Deallocate a_, b_ and c_, then synchronize the device.
// NOTE(review): confirm the deallocate-before-synchronize order is intended.
302 device_.deallocate(a_);
303 device_.deallocate(b_);
304 device_.deallocate(c_);
305 device_.synchronize();
// Driver code: select a device with cl::sycl::gpu_selector, wrap the selector
// in an Eigen::QueueInterface, and build the one Eigen::SyclDevice that is
// passed to every benchmark call below.
309 cl::sycl::gpu_selector selector;
310 Eigen::QueueInterface queue(selector);
311 Eigen::SyclDevice device(&queue);
// Sweep k over the powers of two from 32 to 4096 inclusive.
// m, n and num_iters are defined outside these lines.
314 for(
int64_t k = 32; k <= 4096; k *= 2)
// Run the five benchmark variants with float elements for the current sizes.
// Argument order is (device, num_iters, m, k, n): k is passed before n.
316 (contraction<float>(device, num_iters,
m, k,
n));
317 (contractionRowMajor<float>(device, num_iters,
m, k,
n));
318 (contractionAT<float>(device, num_iters,
m, k,
n));
319 (contractionBT<float>(device, num_iters,
m, k,
n));
320 (contractionABT<float>(device, num_iters,
m, k,
n));
325 #endif // EIGEN_BENCH_CONTRACT_SYCL