13 #ifndef EIGEN_BENCH_CONTRACT_SYCL 14 #define EIGEN_BENCH_CONTRACT_SYCL 15 #define EIGEN_TEST_NO_LONGDOUBLE 16 #define EIGEN_TEST_NO_COMPLEX 17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t 18 #include <SYCL/sycl.hpp> 24 #include <unsupported/Eigen/CXX11/Tensor> 27 using Eigen::SyclDevice;
// Global output file stream opened on "Result.txt"; the benchmark report lines
// written with `out << ...` further down in this file go to that stream.
30 std::ofstream
out(
"Result.txt");
/// Returns the current time point of std::chrono::system_clock.
///
/// The previous body also declared two time_point locals (`start`, `end`) that
/// were never read; they have been removed.
std::chrono::time_point<std::chrono::system_clock> get_time() {
  return std::chrono::system_clock::now();
}
37 template<
typename Start,
typename End,
typename TensorIndex>
40 std::chrono::duration<double> elapsed_seconds = end-start;
41 std::cout <<
"Kernel Name : " << name <<
", M : " << m_ <<
", N : " << n_ <<
", K : " << k_ <<
" GFLOP/s : " <<
42 static_cast<float>((
static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1
e-9 <<
"\n";
43 out <<
"Kernel Name : " << name <<
", M : " << m_ <<
", N : " << n_ <<
", K : " << k_ <<
" GFLOP/s : " <<
44 static_cast<float>((
static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1
e-9 <<
"\n";
// Benchmark routine templated on element type T, Device and TensorIndex.
// NOTE(review): the signature line and several statements (buffer declarations,
// tensor setup, `dims`, the timed loop) are absent from this listing. This is
// presumably `contraction`, judging by the call order in the driver; verify
// against the complete file.
48 template<
typename T,
typename Device,
typename TensorIndex>
// Obtain storage through device_.allocate: m_*k_ elements for a_, k_*n_ for
// b_ and m_*n_ for c_, each multiplied by sizeof(T) to give a byte count.
53 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
54 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
55 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Fill the three buffers via device_.memset with the constants 12, 23 and 31.
59 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
60 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
61 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// When EIGEN_USE_SYCL is defined the contraction is evaluated once up front as
// a warm-up.
80 #ifdef EIGEN_USE_SYCL // warmup for sycl 82 C.
device(device_) = A.contract(B, dims);
// Evaluate A.contract(B, dims) into C on device_ (presumably the timed
// statement; the surrounding loop is not visible here).
87 C.
device(device_) = A.contract(B, dims);
// Release the three buffers, then synchronize the device.
// NOTE(review): deallocate is called before synchronize; confirm the device
// tolerates releasing buffers while work may still be queued.
93 device_.deallocate(a_);
94 device_.deallocate(b_);
95 device_.deallocate(c_);
96 device_.synchronize();
// Benchmark routine templated on element type T, Device and TensorIndex.
// NOTE(review): the signature line and several statements (buffer declarations,
// tensor setup, `dims`, the timed loop) are absent from this listing. This is
// presumably `contractionRowMajor`, judging by the call order in the driver;
// verify against the complete file.
102 template<
typename T,
typename Device,
typename TensorIndex>
// Obtain storage through device_.allocate: m_*k_ elements for a_, k_*n_ for
// b_ and m_*n_ for c_, each multiplied by sizeof(T) to give a byte count.
107 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
108 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
109 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Fill the three buffers via device_.memset with the constants 12, 23 and 31.
113 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
114 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
115 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// When EIGEN_USE_SYCL is defined the contraction is evaluated once up front as
// a warm-up.
134 #ifdef EIGEN_USE_SYCL // warmup for sycl 136 C.
device(device_) = A.contract(B, dims);
// Evaluate A.contract(B, dims) into C on device_ (presumably the timed
// statement; the surrounding loop is not visible here).
141 C.
device(device_) = A.contract(B, dims);
// Release the three buffers, then synchronize the device.
// NOTE(review): deallocate is called before synchronize; confirm the device
// tolerates releasing buffers while work may still be queued.
147 device_.deallocate(a_);
148 device_.deallocate(b_);
149 device_.deallocate(c_);
150 device_.synchronize();
// Benchmark routine templated on element type T, Device and TensorIndex.
// NOTE(review): the signature line and several statements (buffer declarations,
// tensor setup, `dims`, the timed loop) are absent from this listing. This is
// presumably `contractionAT`, judging by the call order in the driver; verify
// against the complete file.
154 template<
typename T,
typename Device,
typename TensorIndex>
// Obtain storage through device_.allocate: m_*k_ elements for a_, k_*n_ for
// b_ and m_*n_ for c_, each multiplied by sizeof(T) to give a byte count.
159 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
160 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
161 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Fill the three buffers via device_.memset with the constants 12, 23 and 31.
165 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
166 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
167 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// When EIGEN_USE_SYCL is defined the contraction is evaluated once up front as
// a warm-up.
185 #ifdef EIGEN_USE_SYCL // warmup for sycl 187 C.
device(device_) = A.contract(B, dims);
// Evaluate A.contract(B, dims) into C on device_ (presumably the timed
// statement; the surrounding loop is not visible here).
192 C.
device(device_) = A.contract(B, dims);
// Release the three buffers, then synchronize the device.
// NOTE(review): deallocate is called before synchronize; confirm the device
// tolerates releasing buffers while work may still be queued.
198 device_.deallocate(a_);
199 device_.deallocate(b_);
200 device_.deallocate(c_);
201 device_.synchronize();
// Benchmark routine templated on element type T, Device and TensorIndex.
// NOTE(review): the signature line and several statements (buffer declarations,
// tensor setup, `dims`, the timed loop) are absent from this listing. This is
// presumably `contractionBT`, judging by the call order in the driver; verify
// against the complete file.
205 template<
typename T,
typename Device,
typename TensorIndex>
// Obtain storage through device_.allocate: m_*k_ elements for a_, k_*n_ for
// b_ and m_*n_ for c_, each multiplied by sizeof(T) to give a byte count.
210 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
211 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
212 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Fill the three buffers via device_.memset with the constants 12, 23 and 31.
216 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
217 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
218 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// When EIGEN_USE_SYCL is defined the contraction is evaluated once up front as
// a warm-up.
237 #ifdef EIGEN_USE_SYCL // warmup for sycl 239 C.
device(device_) = A.contract(B, dims);
// Evaluate A.contract(B, dims) into C on device_ (presumably the timed
// statement; the surrounding loop is not visible here).
244 C.
device(device_) = A.contract(B, dims);
// Release the three buffers, then synchronize the device.
// NOTE(review): deallocate is called before synchronize; confirm the device
// tolerates releasing buffers while work may still be queued.
250 device_.deallocate(a_);
251 device_.deallocate(b_);
252 device_.deallocate(c_);
253 device_.synchronize();
// Benchmark routine templated on element type T, Device and TensorIndex.
// NOTE(review): the signature line and several statements (buffer declarations,
// tensor setup, `dims`, the timed loop) are absent from this listing. This is
// presumably `contractionABT`, judging by the call order in the driver; verify
// against the complete file.
257 template<
typename T,
typename Device,
typename TensorIndex>
// Obtain storage through device_.allocate: m_*k_ elements for a_, k_*n_ for
// b_ and m_*n_ for c_, each multiplied by sizeof(T) to give a byte count.
262 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
263 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
264 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Fill the three buffers via device_.memset with the constants 12, 23 and 31.
268 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
269 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
270 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// When EIGEN_USE_SYCL is defined the contraction is evaluated once up front as
// a warm-up.
289 #ifdef EIGEN_USE_SYCL // warmup for sycl 291 C.
device(device_) = A.contract(B, dims);
// Evaluate A.contract(B, dims) into C on device_ (presumably the timed
// statement; the surrounding loop is not visible here).
296 C.
device(device_) = A.contract(B, dims);
// Release the three buffers, then synchronize the device.
// NOTE(review): deallocate is called before synchronize; confirm the device
// tolerates releasing buffers while work may still be queued.
302 device_.deallocate(a_);
303 device_.deallocate(b_);
304 device_.deallocate(c_);
305 device_.synchronize();
// Benchmark driver body.
// NOTE(review): the enclosing function header and the definitions of
// num_iters, m and n are not visible in this listing; presumably this is
// main() with further loops over m and n. Verify against the complete file.
// Construct an Eigen::SyclDevice on a QueueInterface built from a
// cl::sycl::gpu_selector.
309 cl::sycl::gpu_selector selector;
310 Eigen::QueueInterface queue(selector);
311 Eigen::SyclDevice device(&queue);
// Sweep k from 32 up to and including 4096, doubling each step.
314 for(
int64_t k = 32; k <= 4096; k *= 2)
// Run each of the five contraction benchmarks with T = float for the current
// (m, k, n) and num_iters.
316 (contraction<float>(device, num_iters,
m, k,
n));
317 (contractionRowMajor<float>(device, num_iters,
m, k,
n));
318 (contractionAT<float>(device, num_iters,
m, k,
n));
319 (contractionBT<float>(device, num_iters,
m, k,
n));
320 (contractionABT<float>(device, num_iters,
m, k,
n));
325 #endif // EIGEN_BENCH_CONTRACT_SYCL Matrix< SCALARB, Dynamic, Dynamic, opt_B > B
std::chrono::time_point< std::chrono::system_clock > get_time()
std::ofstream out("Result.txt")
void contractionBT(const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
void contractionAT(const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
iterator iter(handle obj)
void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_, TensorIndex num_iters, std::string name)
Matrix< SCALARA, Dynamic, Dynamic, opt_A > A
A tensor expression mapping an existing array of data.
Tensor< float, 1 >::DimensionPair DimPair
Array< double, 1, 3 > e(1./3., 0.5, 2.)
void contractionRowMajor(const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
TensorDevice< TensorMap< PlainObjectType, Options_, MakePointer_ >, DeviceType > device(const DeviceType &dev)
Matrix< Scalar, Dynamic, Dynamic > C
static EIGEN_DEPRECATED const end_t end
void contraction(const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
Annotation for function names.
void contractionABT(const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)