14 #define EIGEN_TEST_NO_LONGDOUBLE 
   15 #define EIGEN_TEST_NO_COMPLEX 
   17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t 
   18 #define EIGEN_USE_SYCL 
   27 #include <unsupported/Eigen/CXX11/Tensor> 
   30 using Eigen::SyclDevice;
 
   34 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
   37                                   IndexType k_size, IndexType n_size) {
 
   61       static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
 
   63       static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
 
   64   DataType *d_t_result =
 
   65       static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
 
   68       gpu_t_left(d_t_left, left_dims);
 
   70       gpu_t_right(d_t_right, right_dims);
 
   72       gpu_t_result(d_t_result, result_dims);
 
   74   sycl_device.memcpyHostToDevice(d_t_left, t_left.
data(), t_left_bytes);
 
   75   sycl_device.memcpyHostToDevice(d_t_right, t_right.
data(), t_right_bytes);
 
   77   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
 
   78   sycl_device.memcpyDeviceToHost(t_result_gpu.
data(), d_t_result,
 
   81   t_result = t_left.contract(t_right, dims);
 
   83   for (IndexType 
i = 0; 
i < t_result.
size(); 
i++) {
 
   84     if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
 
   93     std::cout << 
"M : " << m_size << 
", N : " << n_size << 
", K : " << k_size
 
   94               << 
", mismatch detected at IndexType " << 
i << 
": " << t_result(
i)
 
   95               << 
" vs " << t_result_gpu(
i) << std::endl;
 
   98   sycl_device.deallocate(d_t_left);
 
   99   sycl_device.deallocate(d_t_right);
 
  100   sycl_device.deallocate(d_t_result);
 
  103 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
  106   for (IndexType 
k = 32; 
k < 256; 
k++) {
 
  107     test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 
k, 128,
 
  112 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
  115   for (IndexType 
k = 32; 
k < 256; 
k++) {
 
  116     test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, 
k,
 
  121 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
  124   for (IndexType 
k = 32; 
k < 256; 
k++) {
 
  125     test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128,
 
  130 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
  133   IndexType m_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
 
  134                          257, 511, 512, 513, 1023, 1024, 1025};
 
  136   IndexType n_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
 
  137                          257, 511, 512, 513, 1023, 1024, 1025};
 
  139   IndexType k_sizes[] = {31,  39,  63,  64,  65,  95,   96,   127, 129,
 
  140                          255, 257, 511, 512, 513, 1023, 1024, 1025};
 
  142   for (IndexType 
i = 0; 
i < 15; 
i++) {
 
  143     for (IndexType 
j = 0; 
j < 15; 
j++) {
 
  144       for (IndexType 
k = 0; 
k < 17; 
k++) {
 
  145         test_sycl_contraction<DataLayout, DataType, IndexType>(
 
  146             sycl_device, m_sizes[
i], n_sizes[
j], k_sizes[
k]);
 
  152 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
  155                                   IndexType k_size, IndexType n_size) {
 
  172   auto padded_left_size = 2 * t_left.
size();
 
  173   auto padded_right_size = 2 * t_right.
size();
 
  174   auto padded_result_size = 2 * t_result.
size();
 
  176   std::size_t t_left_bytes = padded_left_size * 
sizeof(DataType);
 
  177   std::size_t t_right_bytes = padded_right_size * 
sizeof(DataType);
 
  178   std::size_t t_result_bytes = padded_result_size * 
sizeof(DataType);
 
  181       static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
 
  182   DataType *d_t_right =
 
  183       static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
 
  184   DataType *d_t_result =
 
  185       static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
 
  189       gpu_t_left(d_t_left, left_dims);
 
  191       gpu_t_right(d_t_right, right_dims);
 
  193       gpu_t_result(d_t_result, result_dims);
 
  197   DataType nan = std::numeric_limits<DataType>::quiet_NaN();
 
  198   auto host_left_data = 
new DataType[padded_left_size];
 
  199   std::copy_n(t_left.
data(), t_left.
size(), host_left_data);
 
  200   std::fill_n(host_left_data + t_left.
size(), t_left.
size(), nan);
 
  201   auto host_right_data = 
new DataType[padded_right_size];
 
  202   std::copy_n(t_right.
data(), t_right.
size(), host_right_data);
 
  203   std::fill_n(host_right_data + t_right.
size(), t_right.
size(), nan);
 
  204   auto host_result_data = 
new DataType[padded_result_size];
 
  205   std::fill_n(host_result_data, padded_result_size, nan);
 
  207   sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
 
  208   sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
 
  209   sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);
 
  211   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
 
  212   sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);
 
  214   t_result = t_left.contract(t_right, dims);
 
  216   for (IndexType 
i = 0; 
i < t_result.
size(); 
i++) {
 
  217     if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
 
  226       std::cout << 
"M : " << m_size << 
", N : " << n_size << 
", K : " << k_size
 
  227                 << 
", invalid read detected at IndexType " << 
i << 
": " 
  228                 << t_result(
i) << 
" vs " << host_result_data[
i] << std::endl;
 
  230       std::cout << 
"M : " << m_size << 
", N : " << n_size << 
", K : " << k_size
 
  231                 << 
", mismatch detected at IndexType " << 
i << 
": " 
  232                 << t_result(
i) << 
" vs " << host_result_data[
i] << std::endl;
 
  237   for (IndexType 
i = t_result.
size(); 
i < padded_result_size; 
i++) {
 
  241     std::cout << 
"M : " << m_size << 
", N : " << n_size << 
", K : " << k_size
 
  242               << 
", invalid write detected at IndexType " << 
i << 
": " 
  243               << host_result_data[
i] << std::endl;
 
  246   sycl_device.deallocate(d_t_left);
 
  247   sycl_device.deallocate(d_t_right);
 
  248   sycl_device.deallocate(d_t_result);
 
  250   delete[] host_left_data;
 
  251   delete[] host_right_data;
 
  252   delete[] host_result_data;
 
  255 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
  257 void test_scalar(
const Device &sycl_device, IndexType m_size, IndexType k_size,
 
  282       static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
 
  283   DataType *d_t_right =
 
  284       static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
 
  285   DataType *d_t_result =
 
  286       static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
 
  289       gpu_t_left(d_t_left, left_dims);
 
  291       gpu_t_right(d_t_right, right_dims);
 
  293       gpu_t_result(d_t_result);
 
  295   sycl_device.memcpyHostToDevice(d_t_left, t_left.
data(), t_left_bytes);
 
  296   sycl_device.memcpyHostToDevice(d_t_right, t_right.
data(), t_right_bytes);
 
  298   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
 
  299   sycl_device.memcpyDeviceToHost(t_result_gpu.
data(), d_t_result,
 
  302   t_result = t_left.contract(t_right, dims);
 
  304   if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
 
  307     std::cout << 
"K: " << k_size << 
", N: " << n_size << 
", M: " << m_size
 
  308               << 
" : mismatch detected: " << t_result() << 
" vs " 
  309               << t_result_gpu() << std::endl;
 
  313   sycl_device.deallocate(d_t_left);
 
  314   sycl_device.deallocate(d_t_right);
 
  315   sycl_device.deallocate(d_t_result);
 
  318 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
  321                        IndexType k_size, IndexType n_size, IndexType m_batch,
 
  322                        IndexType start, IndexType limit) {
 
  328   TensorDim left_dims = {{m_batch, k_size, m_size}};
 
  329   TensorDim right_dims = {{m_batch, n_size, k_size}};
 
  330   TensorDim res_dims = {{m_batch, m_size, n_size}};
 
  333   TensorType t_left(left_dims);
 
  334   TensorType t_right(right_dims);
 
  335   TensorType t_result_gpu(res_dims);
 
  336   TensorType t_result(res_dims);
 
  341   std::size_t t_left_bytes = t_left.size() * 
sizeof(DataType);
 
  342   std::size_t t_right_bytes = t_right.size() * 
sizeof(DataType);
 
  343   std::size_t t_result_bytes = t_result.size() * 
sizeof(DataType);
 
  346       static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
 
  347   DataType *d_t_right =
 
  348       static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
 
  349   DataType *d_t_result =
 
  350       static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
 
  356   sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
 
  357   sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
 
  358   for (
int i = start; 
i < limit; ++
i) {
 
  359     auto x = gpu_t_left.template chip<0>(
i);
 
  360     auto y = gpu_t_right.template chip<0>(
i);
 
  361     auto z = gpu_t_result.template chip<0>(
i);
 
  362     z.device(sycl_device) = 
x.contract(
y, contract_pairs);
 
  364   sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
 
  367   for (
int i = start; 
i < limit; ++
i) {
 
  368     auto x = t_left.template chip<0>(
i);
 
  369     auto y = t_right.template chip<0>(
i);
 
  370     auto z = t_result.template chip<0>(
i);
 
  371     z = 
x.contract(
y, contract_pairs);
 
  374   for (IndexType 
i = 0; 
i < t_result.size(); 
i++) {
 
  375     if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
 
  383     std::cout << 
"mismatch detected at IndexType " << 
i << 
": " << t_result(
i)
 
  384               << 
" vs " << t_result_gpu(
i) << std::endl;
 
  387   sycl_device.deallocate(d_t_left);
 
  388   sycl_device.deallocate(d_t_right);
 
  389   sycl_device.deallocate(d_t_result);
 
  392 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
  395                                 IndexType k_size, IndexType n_size) {
 
  417       static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
 
  418   DataType *d_t_right =
 
  419       static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
 
  420   DataType *d_t_result =
 
  421       static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
 
  424       gpu_t_left(d_t_left, left_dims);
 
  426       gpu_t_right(d_t_right, right_dims);
 
  428       gpu_t_result(d_t_result, res_dims);
 
  430   sycl_device.memcpyHostToDevice(d_t_left, t_left.
data(), t_left_bytes);
 
  431   sycl_device.memcpyHostToDevice(d_t_right, t_right.
data(), t_right_bytes);
 
  433   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
 
  434   sycl_device.memcpyDeviceToHost(t_result_gpu.
data(), d_t_result,
 
  437   t_result = t_left.contract(t_right, dims);
 
  439   for (IndexType 
j = 0; 
j < m_size; 
j++) {
 
  440     for (IndexType 
i = 0; 
i < n_size; 
i++) {
 
  441       if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
 
  449       std::cout << 
"M : " << m_size << 
", N : " << n_size << 
", K : " << k_size
 
  450                 << 
", mismatch detected at IndexType m: " << 
j << 
" n: " << 
i 
  451                 << 
" CPU : " << t_result(
j, 
i)
 
  452                 << 
" vs SYCL:" << t_result_gpu(
j, 
i) << std::endl;
 
  456   sycl_device.deallocate(d_t_left);
 
  457   sycl_device.deallocate(d_t_right);
 
  458   sycl_device.deallocate(d_t_result);
 
  461 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
  464                                 IndexType k_size, IndexType n_size) {
 
  486       static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
 
  487   DataType *d_t_right =
 
  488       static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
 
  489   DataType *d_t_result =
 
  490       static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
 
  493       gpu_t_left(d_t_left, left_dims);
 
  495       gpu_t_right(d_t_right, right_dims);
 
  497       gpu_t_result(d_t_result, res_dims);
 
  499   sycl_device.memcpyHostToDevice(d_t_left, t_left.
data(), t_left_bytes);
 
  500   sycl_device.memcpyHostToDevice(d_t_right, t_right.
data(), t_right_bytes);
 
  502   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
 
  503   sycl_device.memcpyDeviceToHost(t_result_gpu.
data(), d_t_result,
 
  506   t_result = t_left.contract(t_right, dims);
 
  508   for (IndexType 
i = 0; 
i < t_result.
size(); 
i++) {
 
  509     if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
 
  517     std::cout << 
"M : " << m_size << 
", N : " << n_size << 
", K : " << k_size
 
  518               << 
", mismatch detected at IndexType " << 
i << 
": " << t_result(
i)
 
  519               << 
" vs " << t_result_gpu(
i) << std::endl;
 
  522   sycl_device.deallocate(d_t_left);
 
  523   sycl_device.deallocate(d_t_right);
 
  524   sycl_device.deallocate(d_t_result);
 
  527 template <
int DataLayout, 
typename DataType, 
typename IndexType,
 
  530                                  IndexType k_size, IndexType n_size) {
 
  552       static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
 
  553   DataType *d_t_right =
 
  554       static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
 
  555   DataType *d_t_result =
 
  556       static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
 
  559       gpu_t_left(d_t_left, left_dims);
 
  561       gpu_t_right(d_t_right, right_dims);
 
  563       gpu_t_result(d_t_result, res_dims);
 
  565   sycl_device.memcpyHostToDevice(d_t_left, t_left.
data(), t_left_bytes);
 
  566   sycl_device.memcpyHostToDevice(d_t_right, t_right.
data(), t_right_bytes);
 
  568   gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
 
  569   sycl_device.memcpyDeviceToHost(t_result_gpu.
data(), d_t_result,
 
  572   t_result = t_left.contract(t_right, dims);
 
  574   for (IndexType 
i = 0; 
i < t_result.
size(); 
i++) {
 
  575     if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
 
  583     std::cout << 
"M : " << m_size << 
", N : " << n_size << 
", K : " << k_size
 
  584               << 
", mismatch detected at IndexType " << 
i << 
": " << t_result(
i)
 
  585               << 
" vs " << t_result_gpu(
i) << std::endl;
 
  589   sycl_device.deallocate(d_t_left);
 
  590   sycl_device.deallocate(d_t_right);
 
  591   sycl_device.deallocate(d_t_result);
 
  594 template <
typename Dev>
 
  596   typedef float DataType;
 
  598   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  599   start = std::chrono::system_clock::now();
 
  601   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
 
  603   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
 
  605   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024,
 
  607   test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
 
  609   test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024,
 
  611   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
 
  613   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096,
 
  615   test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024,
 
  617   test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
 
  619   test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024,
 
  621   end = std::chrono::system_clock::now();
 
  622   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  623   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  624   std::cout << 
"tensor out of bound tests finished computation at " 
  625             << std::ctime(&end_time)
 
  626             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  629 template <
typename Dev>
 
  631   typedef float DataType;
 
  633   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  634   start = std::chrono::system_clock::now();
 
  636   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128,
 
  638   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128,
 
  640   end = std::chrono::system_clock::now();
 
  641   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  642   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  643   std::cout << 
"tensor tensor tests finished computation at " 
  644             << std::ctime(&end_time)
 
  645             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  648 template <
typename Dev>
 
  650   typedef float DataType;
 
  652   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  653   start = std::chrono::system_clock::now();
 
  655   test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device);
 
  656   test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device);
 
  658   end = std::chrono::system_clock::now();
 
  659   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  660   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  661   std::cout << 
"tensor tensor tests finished computation at " 
  662             << std::ctime(&end_time)
 
  663             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  666 template <
typename Dev>
 
  668   typedef float DataType;
 
  670   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  671   start = std::chrono::system_clock::now();
 
  673   test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device);
 
  674   test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device);
 
  676   end = std::chrono::system_clock::now();
 
  677   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  678   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  679   std::cout << 
"tensor tensor tests finished computation at " 
  680             << std::ctime(&end_time)
 
  681             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  684 template <
typename Dev>
 
  686   typedef float DataType;
 
  688   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  689   start = std::chrono::system_clock::now();
 
  690   test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
 
  691   test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);
 
  693   end = std::chrono::system_clock::now();
 
  694   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  695   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  696   std::cout << 
"tensor tensor tests finished computation at " 
  697             << std::ctime(&end_time)
 
  698             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  701 template <
typename Dev>
 
  703   typedef float DataType;
 
  705   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  706   start = std::chrono::system_clock::now();
 
  708   test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
 
  709   test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);
 
  711   end = std::chrono::system_clock::now();
 
  712   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  713   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  714   std::cout << 
"tensor tensor tests finished computation at " 
  715             << std::ctime(&end_time)
 
  716             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  718 template <
typename Dev>
 
  720   typedef float DataType;
 
  722   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  723   start = std::chrono::system_clock::now();
 
  725   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
 
  727   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
 
  729   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
 
  731   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
 
  733   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
 
  735   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
 
  738   end = std::chrono::system_clock::now();
 
  739   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  740   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  741   std::cout << 
"contracted tensor tests finished computation at " 
  742             << std::ctime(&end_time)
 
  743             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  746 template <
typename Dev>
 
  748   typedef float DataType;
 
  750   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  751   start = std::chrono::system_clock::now();
 
  753   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
 
  755   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
 
  757   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
 
  759   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
 
  761   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
 
  763   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
 
  766   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
 
  768   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
 
  770   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
 
  772   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
 
  774   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
 
  776   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
 
  778   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
 
  781   end = std::chrono::system_clock::now();
 
  782   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  783   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  784   std::cout << 
"finished computation at " << std::ctime(&end_time)
 
  785             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  788 template <
typename Dev>
 
  790   typedef float DataType;
 
  792   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  793   start = std::chrono::system_clock::now();
 
  795   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
 
  797   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
 
  799   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
 
  801   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
 
  803   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
 
  805   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
 
  807   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
 
  809   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
 
  811   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
 
  813   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
 
  815   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
 
  817   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
 
  824 #ifndef EIGEN_SYCL_DISABLE_GEMV 
  825   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032,
 
  829   end = std::chrono::system_clock::now();
 
  830   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  831   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  832   std::cout << 
"finished computation at " << std::ctime(&end_time)
 
  833             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  836 template <
typename Dev>
 
  838   typedef float DataType;
 
  840   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  841   start = std::chrono::system_clock::now();
 
  843   test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
 
  844   test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
 
  845   test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
 
  846   test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
 
  847   test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
 
  848   test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
 
  850   end = std::chrono::system_clock::now();
 
  851   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  852   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  853   std::cout << 
"finished computation at " << std::ctime(&end_time)
 
  854             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  857 template <
typename Dev>
 
  859   typedef float DataType;
 
  861   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  862   start = std::chrono::system_clock::now();
 
  864   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
 
  865   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073,
 
  867   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072,
 
  869   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073,
 
  871   test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072,
 
  873   end = std::chrono::system_clock::now();
 
  874   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  875   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  876   std::cout << 
"finished computation at " << std::ctime(&end_time)
 
  877             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  880 template <
typename Dev>
 
  882   typedef float DataType;
 
  884   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  885   start = std::chrono::system_clock::now();
 
  887   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
 
  888   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073,
 
  890   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072,
 
  892   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073,
 
  894   test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072,
 
  896   end = std::chrono::system_clock::now();
 
  897   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  898   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  899   std::cout << 
"finished computation at " << std::ctime(&end_time)
 
  900             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  903 template <
typename Dev>
 
  905   typedef float DataType;
 
  907   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  908   start = std::chrono::system_clock::now();
 
  910   contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
 
  912   contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
 
  914   end = std::chrono::system_clock::now();
 
  915   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  916   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  917   std::cout << 
"finished computation at " << std::ctime(&end_time)
 
  918             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  921 template <
typename Dev>
 
  923     const Dev &sycl_device) {
 
  924   typedef float DataType;
 
  926   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  927   start = std::chrono::system_clock::now();
 
  929   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4,
 
  931   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
 
  933   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
 
  935   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784,
 
  937   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
 
  939   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
 
  941   contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
 
  943   end = std::chrono::system_clock::now();
 
  944   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  945   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  946   std::cout << 
"finished computation at " << std::ctime(&end_time)
 
  947             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  950 template <
typename Dev>
 
  952     const Dev &sycl_device) {
 
  953   typedef float DataType;
 
  955   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  956   start = std::chrono::system_clock::now();
 
  958   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4,
 
  960   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
 
  962   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
 
  964   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
 
  966   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10,
 
  968   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
 
  970   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
 
  972   contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
 
  974   end = std::chrono::system_clock::now();
 
  975   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  976   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  977   std::cout << 
"finished computation at " << std::ctime(&end_time)
 
  978             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
  981 template <
typename Dev>
 
  983     const Dev &sycl_device) {
 
  984   typedef float DataType;
 
  986   std::chrono::time_point<std::chrono::system_clock> start, 
end;
 
  987   start = std::chrono::system_clock::now();
 
  989   contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
 
  991   contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
 
  993   contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
 
  995   end = std::chrono::system_clock::now();
 
  996   std::chrono::duration<double> elapsed_seconds = 
end - start;
 
  997   std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
 
  998   std::cout << 
"finished computation at " << std::ctime(&end_time)
 
  999             << 
"elapsed time: " << elapsed_seconds.count() << 
"s\n";
 
 1003   for (
const auto &device : Eigen::get_sycl_supported_devices()) {
 
 1004     std::cout << 
"Running on " 
 1005               << device.template get_info<cl::sycl::info::device::name>()
 
 1007     QueueInterface queueInterface(device);
 
 1008     auto sycl_device = Eigen::SyclDevice(&queueInterface);