14 #define EIGEN_TEST_NO_LONGDOUBLE
15 #define EIGEN_TEST_NO_COMPLEX
17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
18 #define EIGEN_USE_SYCL
25 #include <unsupported/Eigen/CXX11/Tensor>
29 using Eigen::SyclDevice;
// Fragment of a templated SYCL convolution test. The function signature is not
// visible in this excerpt (presumably test_larg_expr1D, judging by the order of
// the calls in the driver -- TODO confirm).
// Visible flow: allocate device buffers, upload input and kernel, run the
// convolution on the SYCL device, download the result, compute the same
// convolution on the host, then walk all output coordinates.
35 template <
typename DataType,
int DataLayout,
typename IndexType>
// Raw device allocations for the input, kernel and result data; each is
// released with sycl_device.deallocate() at the end of the fragment.
64 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
65 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
66 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload the host tensors' contents to the device buffers.
71 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
72 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Device-side convolution along the dimensions listed in dims3.
// gpu_input/gpu_kernel/gpu_result are presumably views over the d_* buffers
// (their construction is not visible here -- TODO confirm).
74 gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
// Copy the device result back into the host tensor `result`.
75 sycl_device.memcpyDeviceToHost(
result.data(), d_result, result_bytes);
// Host-side reference: the same convolution evaluated without the SYCL device.
77 result_host=input.convolve(kernel, dims3);
// Visit every output coordinate (i, j, k) bounded by outdim0/outdim1/outdim2.
79 for(IndexType
i=0;
i< outdim0;
i++ ){
80 for(IndexType
j=0;
j< outdim1;
j++ ){
81 for(IndexType k=0; k< outdim2; k++ ){
// Diagnostic output: prints the index together with the device value (result)
// and the host value (result_host) using 16 digits of precision. The condition
// guarding this statement is not visible in this excerpt.
83 std::cout <<std::setprecision(16)<<
"mismatch detected at index ( "<<
i <<
" , " <<
j <<
", " << k <<
" ) " <<
" \t " <<
result(
i,
j,k) <<
" vs "<< result_host(
i,
j,k) << std::endl;
// Release the three device buffers allocated above.
89 sycl_device.deallocate(d_input);
90 sycl_device.deallocate(d_kernel);
91 sycl_device.deallocate(d_result);
// Fragment of a second templated SYCL convolution test. The function signature
// is not visible in this excerpt (presumably test_larg_expr2D, judging by the
// order of the calls in the driver -- TODO confirm).
// Visible flow mirrors the other large-expression tests: allocate, upload,
// convolve on the device, download, convolve on the host, walk the output.
96 template <
typename DataType,
int DataLayout,
typename IndexType>
// Input and output extents. The outdim* values bound the verification loops
// below; the declaration of indim0 is not visible in this excerpt.
100 IndexType indim1= 55;
101 IndexType indim2= 51;
102 IndexType outdim0=50;
103 IndexType outdim1=51;
104 IndexType outdim2=51;
// Raw device allocations for the input, kernel and result data; each is
// released with sycl_device.deallocate() at the end of the fragment.
125 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
126 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
127 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload the host tensors' contents to the device buffers.
132 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
133 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Device-side convolution along the dimensions listed in dims3.
// gpu_* are presumably views over the d_* buffers -- TODO confirm.
135 gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
// Copy the device result back into the host tensor `result`.
136 sycl_device.memcpyDeviceToHost(
result.data(), d_result, result_bytes);
// Host-side reference: the same convolution evaluated without the SYCL device.
138 result_host=input.convolve(kernel, dims3);
// Visit every output coordinate (i, j, k) bounded by outdim0/outdim1/outdim2.
140 for(IndexType
i=0;
i< outdim0;
i++ ){
141 for(IndexType
j=0;
j< outdim1;
j++ ){
142 for(IndexType k=0; k< outdim2; k++ ){
// Diagnostic output: prints the index together with the device value (result)
// and the host value (result_host) using 16 digits of precision. The condition
// guarding this statement is not visible in this excerpt.
144 std::cout <<std::setprecision(16)<<
"mismatch detected at index ( "<<
i <<
" , " <<
j <<
", " << k <<
" ) " <<
" \t " <<
result(
i,
j,k) <<
" vs "<< result_host(
i,
j,k) << std::endl;
// Release the three device buffers allocated above.
150 sycl_device.deallocate(d_input);
151 sycl_device.deallocate(d_kernel);
152 sycl_device.deallocate(d_result);
// Fragment of a third templated SYCL convolution test. The function signature
// is not visible in this excerpt (presumably test_larg_expr3D, judging by the
// order of the calls in the driver -- TODO confirm).
// Visible flow mirrors the other large-expression tests: allocate, upload,
// convolve on the device, download, convolve on the host, walk the output.
157 template <
typename DataType,
int DataLayout,
typename IndexType>
// Input extents (53 x 55 x 51) and output extents (50 x 51 x 49). The outdim*
// values bound the verification loops below.
160 IndexType indim0 =53;
161 IndexType indim1= 55;
162 IndexType indim2= 51;
163 IndexType outdim0=50;
164 IndexType outdim1=51;
165 IndexType outdim2=49;
// Raw device allocations for the input, kernel and result data; each is
// released with sycl_device.deallocate() at the end of the fragment.
186 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
187 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
188 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload the host tensors' contents to the device buffers.
193 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
194 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Device-side convolution along the dimensions listed in dims3.
// gpu_* are presumably views over the d_* buffers -- TODO confirm.
196 gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
// Copy the device result back into the host tensor `result`.
197 sycl_device.memcpyDeviceToHost(
result.data(), d_result, result_bytes);
// Host-side reference: the same convolution evaluated without the SYCL device.
199 result_host=input.convolve(kernel, dims3);
// Visit every output coordinate (i, j, k) bounded by outdim0/outdim1/outdim2.
201 for(IndexType
i=0;
i< outdim0;
i++ ){
202 for(IndexType
j=0;
j< outdim1;
j++ ){
203 for(IndexType k=0; k< outdim2; k++ ){
// Diagnostic output: prints the index together with the device value (result)
// and the host value (result_host) using 16 digits of precision. The condition
// guarding this statement is not visible in this excerpt.
205 std::cout <<std::setprecision(16)<<
"mismatch detected at index ( "<<
i <<
" , " <<
j <<
", " << k <<
" ) " <<
" \t " <<
result(
i,
j,k) <<
" vs "<< result_host(
i,
j,k) << std::endl;
// Release the three device buffers allocated above.
211 sycl_device.deallocate(d_input);
212 sycl_device.deallocate(d_kernel);
213 sycl_device.deallocate(d_result);
// Fragment of a templated SYCL convolution test. The function signature is not
// visible in this excerpt (presumably test_evals, judging by the order of the
// calls in the driver -- TODO confirm).
// Visible flow: allocate device buffers, upload input and kernel, convolve on
// the device, download the result, free the buffers. The checks performed on
// `result` between the download and the deallocation are not visible here.
218 template <
typename DataType,
int DataLayout,
typename IndexType>
// Raw device allocations for the input, kernel and result data.
239 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
240 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
241 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload the host tensors' contents to the device buffers.
246 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
247 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Device-side convolution along the dimensions listed in dims3.
// gpu_* are presumably views over the d_* buffers -- TODO confirm.
249 gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
// Copy the device result back into the host tensor `result`.
250 sycl_device.memcpyDeviceToHost(
result.data(), d_result, result_bytes);
// Release the three device buffers allocated above.
259 sycl_device.deallocate(d_input);
260 sycl_device.deallocate(d_kernel);
261 sycl_device.deallocate(d_result);
// test_expr: runs a convolution of `input` with `kernel` on the given SYCL
// device and compares the downloaded result against sums of input*kernel
// products written out by hand. Only part of the body is visible in this
// excerpt (tensor declarations and the heads of the check statements are
// missing).
//
// sycl_device: device used for buffer allocation, host<->device copies and
//              for evaluating the convolution expression.
264 template <
typename DataType,
int DataLayout,
typename IndexType>
265 static void test_expr(
const Eigen::SyclDevice& sycl_device)
// Raw device allocations for the input, kernel and result data; each is
// released with sycl_device.deallocate() at the end of the function.
285 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
286 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
287 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload the host tensors' contents to the device buffers.
292 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
293 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Device-side convolution along the dimensions listed in dims.
// gpu_* are presumably views over the d_* buffers -- TODO confirm.
295 gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
// Copy the device result back into the host tensor `result`.
296 sycl_device.memcpyDeviceToHost(
result.data(), d_result, result_bytes);
// The next four lines are the trailing halves of expected-value expressions
// (products of neighbouring input elements with kernel row 1); the beginning
// of each statement is not visible in this excerpt.
299 input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
301 input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
303 input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
305 input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
// Release the three device buffers allocated above.
307 sycl_device.deallocate(d_input);
308 sycl_device.deallocate(d_kernel);
309 sycl_device.deallocate(d_result);
// test_modes: evaluates the same input/kernel convolution three times on the
// SYCL device, each time padding the input differently before convolving, and
// stores the outcomes in the host tensors `valid`, `same` and `full`.
// Only part of the body is visible in this excerpt (tensor declarations and
// the checks on each result are missing).
//
// sycl_device: device used for buffer allocation, host<->device copies and
//              for evaluating the padded convolution expressions.
313 template <
typename DataType,
int DataLayout,
typename IndexType>
314 static void test_modes(
const Eigen::SyclDevice& sycl_device){
// First pass: padding pair (0, 0) on dimension 0 (presumably the
// (before, after) padding amounts expected by pad() -- TODO confirm).
338 padding[0] = std::make_pair(0, 0);
// Raw device allocations for the input, the kernel and the first result.
345 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
346 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
347 DataType * d_valid =
static_cast<DataType*
>(sycl_device.allocate(valid_bytes));
// Upload input and kernel once; they are reused by all three passes below.
352 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
353 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Pad the input, convolve with the kernel on the device, download into `valid`.
355 gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
356 sycl_device.memcpyDeviceToHost(valid.
data(), d_valid, valid_bytes);
// Second pass: padding pair (1, 1); result goes to `same`.
363 padding[0] = std::make_pair(1, 1);
366 DataType * d_same =
static_cast<DataType*
>(sycl_device.allocate(same_bytes));
368 gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
369 sycl_device.memcpyDeviceToHost(same.
data(), d_same, same_bytes);
// Third pass: padding pair (2, 2); result goes to `full`.
378 padding[0] = std::make_pair(2, 2);
382 DataType * d_full =
static_cast<DataType*
>(sycl_device.allocate(full_bytes));
384 gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
385 sycl_device.memcpyDeviceToHost(full.
data(), d_full, full_bytes);
// Release all five device buffers allocated in this function.
394 sycl_device.deallocate(d_input);
395 sycl_device.deallocate(d_kernel);
396 sycl_device.deallocate(d_valid);
397 sycl_device.deallocate(d_same);
398 sycl_device.deallocate(d_full);
// Fragment of a templated SYCL convolution test that combines striding with
// convolution. The function signature is not visible in this excerpt
// (presumably test_strides, judging by the order of the calls in the driver
// -- TODO confirm).
// NOTE(review): no sycl_device.deallocate() calls for d_input, d_kernel or
// d_result are visible in this excerpt, unlike the sibling tests -- verify
// that these device buffers are released before the function returns.
402 template <
typename DataType,
int DataLayout,
typename IndexType>
// Raw device allocations for the input, kernel and result data.
426 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
427 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
428 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload the host tensors' contents to the device buffers.
433 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
434 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Device expression: stride the input by stride_of_3, convolve with the
// kernel along dims, then stride the convolution output by stride_of_2.
436 gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
// Copy the device result back into the host tensor `result`.
437 sycl_device.memcpyDeviceToHost(
result.data(), d_result, result_bytes);
// The next two lines are the trailing parts of expected-value expressions;
// the beginning of each statement is not visible in this excerpt.
441 input(6)*kernel(2)));
443 input(12)*kernel(2)));
// Body fragment of the per-device driver (its header is not visible in this
// excerpt). Builds a queue interface from `s`, wraps it in a SyclDevice, and
// runs every convolution test on that device.
447 QueueInterface queueInterface(
s);
// The SyclDevice is handed the address of queueInterface, which lives in the
// same scope as the test calls below.
448 auto sycl_device=Eigen::SyclDevice(&queueInterface);
// Each test is instantiated for float data with int64_t indices, once per
// storage layout (RowMajor and ColMajor).
449 test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
450 test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
451 test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
452 test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
453 test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
454 test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
455 test_evals<float, ColMajor, int64_t>(sycl_device);
456 test_evals<float, RowMajor, int64_t>(sycl_device);
457 test_expr<float, ColMajor, int64_t>(sycl_device);
458 test_expr<float, RowMajor, int64_t>(sycl_device);
459 test_modes<float, ColMajor, int64_t>(sycl_device);
460 test_modes<float, RowMajor, int64_t>(sycl_device);
461 test_strides<float, ColMajor, int64_t>(sycl_device);
462 test_strides<float, RowMajor, int64_t>(sycl_device);
466 for (
const auto& device :Eigen::get_sycl_supported_devices()) {