14 #define EIGEN_TEST_NO_LONGDOUBLE
15 #define EIGEN_TEST_NO_COMPLEX
17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
18 #define EIGEN_USE_SYCL
27 #include <unsupported/Eigen/CXX11/Tensor>
30 using Eigen::SyclDevice;
// Fragment of a templated test helper. Its name and leading parameters are on
// lines missing from this listing; the calls further down suggest it is
// test_sycl_contraction -- TODO confirm. The visible part allocates three
// device buffers, uploads both operands, evaluates the contraction through
// sycl_device, recomputes it with the host tensors and prints a diagnostic
// from inside a per-element check whose condition is truncated here.
34 template <
int DataLayout,
typename DataType,
typename IndexType,
37 IndexType k_size, IndexType n_size) {
61 static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
63 static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
64 DataType *d_t_result =
65 static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
68 gpu_t_left(d_t_left, left_dims);
70 gpu_t_right(d_t_right, right_dims);
72 gpu_t_result(d_t_result, result_dims);
// Upload both operands into the buffers allocated above.
74 sycl_device.memcpyHostToDevice(d_t_left, t_left.
data(), t_left_bytes);
75 sycl_device.memcpyHostToDevice(d_t_right, t_right.
data(), t_right_bytes);
// Contraction evaluated through sycl_device, then copied back into t_result_gpu.
77 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
78 sycl_device.memcpyDeviceToHost(t_result_gpu.
data(), d_t_result,
// Reference result computed from the host tensors with the same dims.
81 t_result = t_left.contract(t_right, dims);
// Element-by-element comparison; the tolerance logic is on missing lines.
83 for (IndexType
i = 0;
i < t_result.
size();
i++) {
84 if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
93 std::cout <<
"M : " << m_size <<
", N : " << n_size <<
", K : " << k_size
94 <<
", mismatch detected at IndexType " <<
i <<
": " << t_result(
i)
95 <<
" vs " << t_result_gpu(
i) << std::endl;
// Release the three device buffers.
98 sycl_device.deallocate(d_t_left);
99 sycl_device.deallocate(d_t_right);
100 sycl_device.deallocate(d_t_result);
// Fragment of a sweep helper (signature missing from this listing). It calls
// test_sycl_contraction once per k in [32, 256), passing k as the first size
// argument and 128 as the second; the remaining arguments are cut off.
103 template <
int DataLayout,
typename DataType,
typename IndexType,
106 for (IndexType k = 32; k < 256; k++) {
107 test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128,
// Fragment of a sweep helper (signature missing from this listing). It calls
// test_sycl_contraction once per k in [32, 256), passing 128 as the first size
// argument and k as the second; the remaining arguments are cut off.
112 template <
int DataLayout,
typename DataType,
typename IndexType,
115 for (IndexType k = 32; k < 256; k++) {
116 test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k,
// Fragment of a sweep helper (signature missing from this listing). It calls
// test_sycl_contraction once per k in [32, 256) with 128 as the first size
// argument; where k is passed cannot be seen because the call is cut off.
121 template <
int DataLayout,
typename DataType,
typename IndexType,
124 for (IndexType k = 32; k < 256; k++) {
125 test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128,
// Fragment of a helper (signature missing from this listing) that runs
// test_sycl_contraction over every combination of the size tables below.
130 template <
int DataLayout,
typename DataType,
typename IndexType,
// 15 candidate values for the first size argument.
133 IndexType m_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255,
134 257, 511, 512, 513, 1023, 1024, 1025};
// 15 candidate values for the second size argument.
136 IndexType n_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255,
137 257, 511, 512, 513, 1023, 1024, 1025};
// 17 candidate values for the third size argument (adds 95 and 96).
139 IndexType k_sizes[] = {31, 39, 63, 64, 65, 95, 96, 127, 129,
140 255, 257, 511, 512, 513, 1023, 1024, 1025};
// The loop bounds 15, 15 and 17 are hard-coded to match the table lengths
// above and must be updated by hand if a table changes.
142 for (IndexType
i = 0;
i < 15;
i++) {
143 for (IndexType
j = 0;
j < 15;
j++) {
144 for (IndexType k = 0; k < 17; k++) {
145 test_sycl_contraction<DataLayout, DataType, IndexType>(
146 sycl_device, m_sizes[
i], n_sizes[
j], k_sizes[k]);
// Fragment of a templated test helper (name and leading parameters are on
// lines missing from this listing; presumably test_no_out_of_bounds, which is
// called further down -- TODO confirm). Every buffer is sized at twice its
// tensor's element count and the extra half is filled with quiet NaN. Judging
// by the diagnostics, the intent is that a stray read drags NaN into the
// result and a stray write disturbs the NaN padding; the actual checks are on
// missing lines.
152 template <
int DataLayout,
typename DataType,
typename IndexType,
155 IndexType k_size, IndexType n_size) {
// Element counts doubled to leave room for the padding half.
172 auto padded_left_size = 2 * t_left.
size();
173 auto padded_right_size = 2 * t_right.
size();
174 auto padded_result_size = 2 * t_result.
size();
// Byte sizes of the padded buffers.
176 std::size_t t_left_bytes = padded_left_size *
sizeof(DataType);
177 std::size_t t_right_bytes = padded_right_size *
sizeof(DataType);
178 std::size_t t_result_bytes = padded_result_size *
sizeof(DataType);
181 static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
182 DataType *d_t_right =
183 static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
184 DataType *d_t_result =
185 static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
189 gpu_t_left(d_t_left, left_dims);
191 gpu_t_right(d_t_right, right_dims);
193 gpu_t_result(d_t_result, result_dims);
// Host staging buffers: real data in the first half, NaN in the second half.
197 DataType nan = std::numeric_limits<DataType>::quiet_NaN();
198 auto host_left_data =
new DataType[padded_left_size];
199 std::copy_n(t_left.
data(), t_left.
size(), host_left_data);
200 std::fill_n(host_left_data + t_left.
size(), t_left.
size(), nan);
201 auto host_right_data =
new DataType[padded_right_size];
202 std::copy_n(t_right.
data(), t_right.
size(), host_right_data);
203 std::fill_n(host_right_data + t_right.
size(), t_right.
size(), nan);
// The result staging buffer starts out entirely NaN.
204 auto host_result_data =
new DataType[padded_result_size];
205 std::fill_n(host_result_data, padded_result_size, nan);
// Upload all three padded buffers, including the NaN-initialised result.
207 sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
208 sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
209 sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);
// Contraction through sycl_device, then the whole padded result is read back.
211 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
212 sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);
// Reference result computed from the host tensors.
214 t_result = t_left.contract(t_right, dims);
// First pass covers the real result elements; the conditions that choose
// between the "invalid read" and "mismatch" messages are on missing lines.
216 for (IndexType
i = 0;
i < t_result.
size();
i++) {
217 if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
226 std::cout <<
"M : " << m_size <<
", N : " << n_size <<
", K : " << k_size
227 <<
", invalid read detected at IndexType " <<
i <<
": "
228 << t_result(
i) <<
" vs " << host_result_data[
i] << std::endl;
230 std::cout <<
"M : " << m_size <<
", N : " << n_size <<
", K : " << k_size
231 <<
", mismatch detected at IndexType " <<
i <<
": "
232 << t_result(
i) <<
" vs " << host_result_data[
i] << std::endl;
// Second pass walks the padding half of the result buffer; the test applied
// to each element is on missing lines.
237 for (IndexType
i = t_result.
size();
i < padded_result_size;
i++) {
241 std::cout <<
"M : " << m_size <<
", N : " << n_size <<
", K : " << k_size
242 <<
", invalid write detected at IndexType " <<
i <<
": "
243 << host_result_data[
i] << std::endl;
// Release device buffers, then the host staging arrays obtained with new[].
246 sycl_device.deallocate(d_t_left);
247 sycl_device.deallocate(d_t_right);
248 sycl_device.deallocate(d_t_result);
250 delete[] host_left_data;
251 delete[] host_right_data;
252 delete[] host_result_data;
// test_scalar: fragment of a templated helper taking the device followed by
// m_size and k_size (the rest of the signature is cut off). The result map is
// built from the pointer alone, with no dimensions, and the result is read
// with an empty call operator, i.e. it is accessed as a single value.
255 template <
int DataLayout,
typename DataType,
typename IndexType,
257 void test_scalar(
const Device &sycl_device, IndexType m_size, IndexType k_size,
282 static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
283 DataType *d_t_right =
284 static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
285 DataType *d_t_result =
286 static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
289 gpu_t_left(d_t_left, left_dims);
291 gpu_t_right(d_t_right, right_dims);
293 gpu_t_result(d_t_result);
// Upload both operands.
295 sycl_device.memcpyHostToDevice(d_t_left, t_left.
data(), t_left_bytes);
296 sycl_device.memcpyHostToDevice(d_t_right, t_right.
data(), t_right_bytes);
// Contraction through sycl_device, then read back into t_result_gpu.
298 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
299 sycl_device.memcpyDeviceToHost(t_result_gpu.
data(), d_t_result,
// Reference result from the host tensors.
302 t_result = t_left.contract(t_right, dims);
// Single comparison (no loop); the condition itself is truncated here.
304 if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
307 std::cout <<
"K: " << k_size <<
", N: " << n_size <<
", M: " << m_size
308 <<
" : mismatch detected: " << t_result() <<
" vs "
309 << t_result_gpu() << std::endl;
// Release the three device buffers.
313 sycl_device.deallocate(d_t_left);
314 sycl_device.deallocate(d_t_right);
315 sycl_device.deallocate(d_t_result);
// Fragment of a templated batch test helper (name and leading parameters are
// on lines missing from this listing; presumably contraction_batch, which is
// called further down -- TODO confirm). For each index in [start, limit) it
// contracts the dimension-0 chips of the left and right tensors into the
// matching chip of the result, once through sycl_device and once on the host,
// then scans the full result for mismatches.
318 template <
int DataLayout,
typename DataType,
typename IndexType,
321 IndexType k_size, IndexType n_size, IndexType m_batch,
322 IndexType start, IndexType limit) {
// Dimension 0 of every tensor is m_batch.
328 TensorDim left_dims = {{m_batch, k_size, m_size}};
329 TensorDim right_dims = {{m_batch, n_size, k_size}};
330 TensorDim res_dims = {{m_batch, m_size, n_size}};
333 TensorType t_left(left_dims);
334 TensorType t_right(right_dims);
335 TensorType t_result_gpu(res_dims);
336 TensorType t_result(res_dims);
341 std::size_t t_left_bytes = t_left.size() *
sizeof(DataType);
342 std::size_t t_right_bytes = t_right.size() *
sizeof(DataType);
343 std::size_t t_result_bytes = t_result.size() *
sizeof(DataType);
346 static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
347 DataType *d_t_right =
348 static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
349 DataType *d_t_result =
350 static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
// Upload both operands.
356 sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
357 sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
// Device pass: one contraction per chip index in [start, limit).
// NOTE(review): the counter is int while start and limit are IndexType;
// confirm that the narrowing is intended.
358 for (
int i = start;
i < limit; ++
i) {
359 auto x = gpu_t_left.template chip<0>(
i);
360 auto y = gpu_t_right.template chip<0>(
i);
361 auto z = gpu_t_result.template chip<0>(
i);
362 z.device(sycl_device) =
x.contract(
y, contract_pairs);
364 sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
// Host pass: the same per-chip contraction on the host tensors.
367 for (
int i = start;
i < limit; ++
i) {
368 auto x = t_left.template chip<0>(
i);
369 auto y = t_right.template chip<0>(
i);
370 auto z = t_result.template chip<0>(
i);
371 z =
x.contract(
y, contract_pairs);
// Comparison runs over every element of t_result, not only the chips in
// [start, limit); the condition itself is truncated here.
374 for (IndexType
i = 0;
i < t_result.size();
i++) {
375 if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
383 std::cout <<
"mismatch detected at IndexType " <<
i <<
": " << t_result(
i)
384 <<
" vs " << t_result_gpu(
i) << std::endl;
// Release the three device buffers.
387 sycl_device.deallocate(d_t_left);
388 sycl_device.deallocate(d_t_right);
389 sycl_device.deallocate(d_t_result);
// Fragment of a templated test helper; its name, its leading parameters and
// the definition of dims are on lines missing from this listing. It follows
// the upload / device contraction / host contraction pattern, but compares
// the results with two-index access over an m_size by n_size grid.
392 template <
int DataLayout,
typename DataType,
typename IndexType,
395 IndexType k_size, IndexType n_size) {
417 static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
418 DataType *d_t_right =
419 static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
420 DataType *d_t_result =
421 static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
424 gpu_t_left(d_t_left, left_dims);
426 gpu_t_right(d_t_right, right_dims);
428 gpu_t_result(d_t_result, res_dims);
// Upload both operands.
430 sycl_device.memcpyHostToDevice(d_t_left, t_left.
data(), t_left_bytes);
431 sycl_device.memcpyHostToDevice(d_t_right, t_right.
data(), t_right_bytes);
// Contraction through sycl_device, then read back into t_result_gpu.
433 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
434 sycl_device.memcpyDeviceToHost(t_result_gpu.
data(), d_t_result,
// Reference result from the host tensors.
437 t_result = t_left.contract(t_right, dims);
// j ranges over m_size and i over n_size; the result is indexed as (j, i).
// The comparison condition is truncated here.
439 for (IndexType
j = 0;
j < m_size;
j++) {
440 for (IndexType
i = 0;
i < n_size;
i++) {
441 if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
449 std::cout <<
"M : " << m_size <<
", N : " << n_size <<
", K : " << k_size
450 <<
", mismatch detected at IndexType m: " <<
j <<
" n: " <<
i
451 <<
" CPU : " << t_result(
j,
i)
452 <<
" vs SYCL:" << t_result_gpu(
j,
i) << std::endl;
// Release the three device buffers.
456 sycl_device.deallocate(d_t_left);
457 sycl_device.deallocate(d_t_right);
458 sycl_device.deallocate(d_t_result);
// Fragment of a templated test helper; its name, its leading parameters and
// the definition of dims are on lines missing from this listing. The visible
// part uploads both operands, contracts through sycl_device and on the host,
// and scans the flat result for mismatches.
461 template <
int DataLayout,
typename DataType,
typename IndexType,
464 IndexType k_size, IndexType n_size) {
486 static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
487 DataType *d_t_right =
488 static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
489 DataType *d_t_result =
490 static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
493 gpu_t_left(d_t_left, left_dims);
495 gpu_t_right(d_t_right, right_dims);
497 gpu_t_result(d_t_result, res_dims);
// Upload both operands.
499 sycl_device.memcpyHostToDevice(d_t_left, t_left.
data(), t_left_bytes);
500 sycl_device.memcpyHostToDevice(d_t_right, t_right.
data(), t_right_bytes);
// Contraction through sycl_device, then read back into t_result_gpu.
502 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
503 sycl_device.memcpyDeviceToHost(t_result_gpu.
data(), d_t_result,
// Reference result from the host tensors.
506 t_result = t_left.contract(t_right, dims);
// Flat scan over all result elements; the condition is truncated here.
508 for (IndexType
i = 0;
i < t_result.
size();
i++) {
509 if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
517 std::cout <<
"M : " << m_size <<
", N : " << n_size <<
", K : " << k_size
518 <<
", mismatch detected at IndexType " <<
i <<
": " << t_result(
i)
519 <<
" vs " << t_result_gpu(
i) << std::endl;
// Release the three device buffers.
522 sycl_device.deallocate(d_t_left);
523 sycl_device.deallocate(d_t_right);
524 sycl_device.deallocate(d_t_result);
// Fragment of a templated test helper; its name, its leading parameters and
// the definition of dims are on lines missing from this listing. The visible
// statements match the preceding helper line for line; whatever sets the two
// apart is on lines not shown.
527 template <
int DataLayout,
typename DataType,
typename IndexType,
530 IndexType k_size, IndexType n_size) {
552 static_cast<DataType *
>(sycl_device.allocate(t_left_bytes));
553 DataType *d_t_right =
554 static_cast<DataType *
>(sycl_device.allocate(t_right_bytes));
555 DataType *d_t_result =
556 static_cast<DataType *
>(sycl_device.allocate(t_result_bytes));
559 gpu_t_left(d_t_left, left_dims);
561 gpu_t_right(d_t_right, right_dims);
563 gpu_t_result(d_t_result, res_dims);
// Upload both operands.
565 sycl_device.memcpyHostToDevice(d_t_left, t_left.
data(), t_left_bytes);
566 sycl_device.memcpyHostToDevice(d_t_right, t_right.
data(), t_right_bytes);
// Contraction through sycl_device, then read back into t_result_gpu.
568 gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
569 sycl_device.memcpyDeviceToHost(t_result_gpu.
data(), d_t_result,
// Reference result from the host tensors.
572 t_result = t_left.contract(t_right, dims);
// Flat scan over all result elements; the condition is truncated here.
574 for (IndexType
i = 0;
i < t_result.
size();
i++) {
575 if (
static_cast<DataType
>(
std::fabs(
static_cast<DataType
>(
583 std::cout <<
"M : " << m_size <<
", N : " << n_size <<
", K : " << k_size
584 <<
", mismatch detected at IndexType " <<
i <<
": " << t_result(
i)
585 <<
" vs " << t_result_gpu(
i) << std::endl;
// Release the three device buffers.
589 sycl_device.deallocate(d_t_left);
590 sycl_device.deallocate(d_t_right);
591 sycl_device.deallocate(d_t_result);
// Fragment of a timed driver (signature missing from this listing). It runs
// test_no_out_of_bounds with float data over a series of shapes in both
// layouts; the trailing arguments of each call are on missing lines.
594 template <
typename Dev>
596 typedef float DataType;
// Wall-clock timing around the whole batch of calls.
598 std::chrono::time_point<std::chrono::system_clock> start,
end;
599 start = std::chrono::system_clock::now();
601 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
603 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
605 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024,
607 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
609 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024,
611 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
613 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096,
615 test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024,
617 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
619 test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024,
// Report the finish timestamp and the elapsed time in seconds.
621 end = std::chrono::system_clock::now();
622 std::chrono::duration<double> elapsed_seconds =
end - start;
623 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
624 std::cout <<
"tensor out of bound tests finished computation at "
625 << std::ctime(&end_time)
626 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_sycl_contraction with float data once per layout, starting from sizes
// 128, 128; the trailing arguments are on missing lines.
629 template <
typename Dev>
631 typedef float DataType;
// Wall-clock timing around both calls.
633 std::chrono::time_point<std::chrono::system_clock> start,
end;
634 start = std::chrono::system_clock::now();
636 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128,
638 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128,
// Report the finish timestamp and the elapsed time in seconds.
640 end = std::chrono::system_clock::now();
641 std::chrono::duration<double> elapsed_seconds =
end - start;
642 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
643 std::cout <<
"tensor tensor tests finished computation at "
644 << std::ctime(&end_time)
645 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_sycl_contraction_m with float data, once for ColMajor and once for
// RowMajor.
648 template <
typename Dev>
650 typedef float DataType;
// Wall-clock timing around both calls.
652 std::chrono::time_point<std::chrono::system_clock> start,
end;
653 start = std::chrono::system_clock::now();
655 test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device);
656 test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device);
// Report the finish timestamp and the elapsed time in seconds.
658 end = std::chrono::system_clock::now();
659 std::chrono::duration<double> elapsed_seconds =
end - start;
660 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
661 std::cout <<
"tensor tensor tests finished computation at "
662 << std::ctime(&end_time)
663 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_sycl_contraction_n with float data, once for ColMajor and once for
// RowMajor.
666 template <
typename Dev>
668 typedef float DataType;
// Wall-clock timing around both calls.
670 std::chrono::time_point<std::chrono::system_clock> start,
end;
671 start = std::chrono::system_clock::now();
673 test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device);
674 test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device);
// Report the finish timestamp and the elapsed time in seconds.
676 end = std::chrono::system_clock::now();
677 std::chrono::duration<double> elapsed_seconds =
end - start;
678 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
679 std::cout <<
"tensor tensor tests finished computation at "
680 << std::ctime(&end_time)
681 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_sycl_contraction_k with float data, once for ColMajor and once for
// RowMajor.
684 template <
typename Dev>
686 typedef float DataType;
// Wall-clock timing around both calls.
688 std::chrono::time_point<std::chrono::system_clock> start,
end;
689 start = std::chrono::system_clock::now();
690 test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
691 test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);
// Report the finish timestamp and the elapsed time in seconds.
693 end = std::chrono::system_clock::now();
694 std::chrono::duration<double> elapsed_seconds =
end - start;
695 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
696 std::cout <<
"tensor tensor tests finished computation at "
697 << std::ctime(&end_time)
698 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_sycl_contraction_sizes with float data, once for ColMajor and once for
// RowMajor.
701 template <
typename Dev>
703 typedef float DataType;
// Wall-clock timing around both calls.
705 std::chrono::time_point<std::chrono::system_clock> start,
end;
706 start = std::chrono::system_clock::now();
708 test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
709 test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);
// Report the finish timestamp and the elapsed time in seconds.
711 end = std::chrono::system_clock::now();
712 std::chrono::duration<double> elapsed_seconds =
end - start;
713 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
714 std::cout <<
"tensor tensor tests finished computation at "
715 << std::ctime(&end_time)
716 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_sycl_contraction with float data in both layouts, with a first size of
// 1025, 1024 or 1023 and a second size of 1; the last argument of each call is
// on a missing line.
718 template <
typename Dev>
720 typedef float DataType;
// Wall-clock timing around the whole batch of calls.
722 std::chrono::time_point<std::chrono::system_clock> start,
end;
723 start = std::chrono::system_clock::now();
725 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
727 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
729 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
731 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
733 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
735 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
// Report the finish timestamp and the elapsed time in seconds.
738 end = std::chrono::system_clock::now();
739 std::chrono::duration<double> elapsed_seconds =
end - start;
740 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
741 std::cout <<
"contracted tensor tests finished computation at "
742 << std::ctime(&end_time)
743 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_sycl_contraction with float data and a first size of 1, varying the
// second size around 1024 and 4096 in both layouts, plus one ColMajor call
// with 802816; the last argument of each call is on a missing line.
746 template <
typename Dev>
748 typedef float DataType;
// Wall-clock timing around the whole batch of calls.
750 std::chrono::time_point<std::chrono::system_clock> start,
end;
751 start = std::chrono::system_clock::now();
753 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
755 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
757 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
759 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
761 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
763 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
766 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
768 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
770 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
772 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
774 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
776 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
778 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
// Report the finish timestamp and the elapsed time in seconds.
781 end = std::chrono::system_clock::now();
782 std::chrono::duration<double> elapsed_seconds =
end - start;
783 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
784 std::cout <<
"finished computation at " << std::ctime(&end_time)
785 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_sycl_contraction with float data over pairs of large leading sizes in
// both layouts; the last argument of each call is on a missing line. One extra
// call appears after an EIGEN_SYCL_DISABLE_GEMV guard whose matching end is
// not visible here.
788 template <
typename Dev>
790 typedef float DataType;
// Wall-clock timing around the whole batch of calls.
792 std::chrono::time_point<std::chrono::system_clock> start,
end;
793 start = std::chrono::system_clock::now();
795 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
797 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
799 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
801 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
803 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
805 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
807 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
809 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
811 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
813 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
815 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
817 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
824 #ifndef EIGEN_SYCL_DISABLE_GEMV
825 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032,
// Report the finish timestamp and the elapsed time in seconds.
829 end = std::chrono::system_clock::now();
830 std::chrono::duration<double> elapsed_seconds =
end - start;
831 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
832 std::cout <<
"finished computation at " << std::ctime(&end_time)
833 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_scalar with float data in both layouts, using 127, 128 and 129 for all
// three size arguments.
836 template <
typename Dev>
838 typedef float DataType;
// Wall-clock timing around the whole batch of calls.
840 std::chrono::time_point<std::chrono::system_clock> start,
end;
841 start = std::chrono::system_clock::now();
843 test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
844 test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
845 test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
846 test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
847 test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
848 test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
// Report the finish timestamp and the elapsed time in seconds.
850 end = std::chrono::system_clock::now();
851 std::chrono::duration<double> elapsed_seconds =
end - start;
852 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
853 std::cout <<
"finished computation at " << std::ctime(&end_time)
854 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_sycl_contraction with float data in RowMajor layout only, pairing a
// small first size with a second size near 131072; the last argument of most
// calls is on a missing line.
857 template <
typename Dev>
859 typedef float DataType;
// Wall-clock timing around the whole batch of calls.
861 std::chrono::time_point<std::chrono::system_clock> start,
end;
862 start = std::chrono::system_clock::now();
864 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
865 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073,
867 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072,
869 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073,
871 test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072,
// Report the finish timestamp and the elapsed time in seconds.
873 end = std::chrono::system_clock::now();
874 std::chrono::duration<double> elapsed_seconds =
end - start;
875 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
876 std::cout <<
"finished computation at " << std::ctime(&end_time)
877 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// test_sycl_contraction with float data in ColMajor layout only, using the
// same visible size arguments as the RowMajor driver that precedes it.
880 template <
typename Dev>
882 typedef float DataType;
// Wall-clock timing around the whole batch of calls.
884 std::chrono::time_point<std::chrono::system_clock> start,
end;
885 start = std::chrono::system_clock::now();
887 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
888 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073,
890 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072,
892 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073,
894 test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072,
// Report the finish timestamp and the elapsed time in seconds.
896 end = std::chrono::system_clock::now();
897 std::chrono::duration<double> elapsed_seconds =
end - start;
898 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
899 std::cout <<
"finished computation at " << std::ctime(&end_time)
900 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver (signature missing from this listing). It runs
// contraction_batch with float data once per layout, with leading arguments
// 64, 75, 30, 4; the remaining arguments are on missing lines.
903 template <
typename Dev>
905 typedef float DataType;
// Wall-clock timing around both calls.
907 std::chrono::time_point<std::chrono::system_clock> start,
end;
908 start = std::chrono::system_clock::now();
910 contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
912 contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
// Report the finish timestamp and the elapsed time in seconds.
914 end = std::chrono::system_clock::now();
915 std::chrono::duration<double> elapsed_seconds =
end - start;
916 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
917 std::cout <<
"finished computation at " << std::ctime(&end_time)
918 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver taking the device by const reference (its name is
// on a line missing from this listing). It runs contraction_lhs_transposed
// with float data in RowMajor layout over a series of shapes; the trailing
// arguments of each call are on missing lines.
921 template <
typename Dev>
923 const Dev &sycl_device) {
924 typedef float DataType;
// Wall-clock timing around the whole batch of calls.
926 std::chrono::time_point<std::chrono::system_clock> start,
end;
927 start = std::chrono::system_clock::now();
929 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4,
931 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
933 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
935 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784,
937 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
939 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
941 contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
// Report the finish timestamp and the elapsed time in seconds.
943 end = std::chrono::system_clock::now();
944 std::chrono::duration<double> elapsed_seconds =
end - start;
945 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
946 std::cout <<
"finished computation at " << std::ctime(&end_time)
947 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver taking the device by const reference (its name is
// on a line missing from this listing). It runs contraction_rhs_transposed
// with float data in RowMajor layout over a series of shapes; the trailing
// arguments of each call are on missing lines.
950 template <
typename Dev>
952 const Dev &sycl_device) {
953 typedef float DataType;
// Wall-clock timing around the whole batch of calls.
955 std::chrono::time_point<std::chrono::system_clock> start,
end;
956 start = std::chrono::system_clock::now();
958 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4,
960 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
962 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
964 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
966 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10,
968 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
970 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
972 contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
// Report the finish timestamp and the elapsed time in seconds.
974 end = std::chrono::system_clock::now();
975 std::chrono::duration<double> elapsed_seconds =
end - start;
976 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
977 std::cout <<
"finished computation at " << std::ctime(&end_time)
978 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of a timed driver taking the device by const reference (its name is
// on a line missing from this listing). It runs contraction_both_transposed
// with float data in RowMajor layout for three shapes; the trailing arguments
// of each call are on missing lines.
981 template <
typename Dev>
983 const Dev &sycl_device) {
984 typedef float DataType;
// Wall-clock timing around the three calls.
986 std::chrono::time_point<std::chrono::system_clock> start,
end;
987 start = std::chrono::system_clock::now();
989 contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
991 contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
993 contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
// Report the finish timestamp and the elapsed time in seconds.
995 end = std::chrono::system_clock::now();
996 std::chrono::duration<double> elapsed_seconds =
end - start;
997 std::time_t end_time = std::chrono::system_clock::to_time_t(
end);
998 std::cout <<
"finished computation at " << std::ctime(&end_time)
999 <<
"elapsed time: " << elapsed_seconds.count() <<
"s\n";
// Fragment of the per-device loop (the enclosing function and the loop body
// after the device setup are on lines missing from this listing). For every
// device returned by Eigen::get_sycl_supported_devices() it prints the device
// name, builds a QueueInterface for it and wraps that in an Eigen::SyclDevice.
1003 for (
const auto &device : Eigen::get_sycl_supported_devices()) {
1004 std::cout <<
"Running on "
1005 << device.template get_info<cl::sycl::info::device::name>()
// sycl_device is constructed from the address of queueInterface, a local
// declared on the line just above it.
1007 QueueInterface queueInterface(device);
1008 auto sycl_device = Eigen::SyclDevice(&queueInterface);