14 #define EIGEN_TEST_NO_LONGDOUBLE 15 #define EIGEN_TEST_NO_COMPLEX 17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t 18 #define EIGEN_USE_SYCL 25 #include <unsupported/Eigen/CXX11/Tensor> 29 using Eigen::SyclDevice;
// ---------------------------------------------------------------------------
// NOTE(review): garbled extraction fragment of (presumably) test_larg_expr1D
// from the Eigen SYCL tensor-convolution test.  The leading numerals
// (35, 64, ...) are embedded original line numbers from the extraction, not
// code.  The function signature body, tensor/dimension declarations, the
// TensorMap setup and the VERIFY call inside the triple loop are missing
// from this view -- restore from upstream Eigen before compiling.
// ---------------------------------------------------------------------------
35 template <
typename DataType,
int DataLayout,
typename IndexType>
// Allocate raw device buffers for the input tensor, the convolution kernel
// and the result (sizes in bytes computed in the missing setup code).
64 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
65 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
66 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Copy host-side input and kernel data onto the SYCL device.
71 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
72 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Run the convolution on the device; dims3 (declared in the missing span)
// selects the convolution dimension(s).
74 gpu_result.
device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
// Bring the device result back to the host for comparison.
75 sycl_device.memcpyDeviceToHost(result.
data(), d_result, result_bytes);
// Compute the reference result on the host with the same convolve call.
77 result_host=input.convolve(kernel, dims3);
// Element-wise comparison over the output volume; on a mismatch the index
// and both values are printed.  (The comparison/VERIFY statement itself is
// missing from this fragment -- only the diagnostic print survives.)
79 for(IndexType
i=0;
i< outdim0;
i++ ){
80 for(IndexType
j=0;
j< outdim1;
j++ ){
81 for(IndexType k=0; k< outdim2; k++ ){
83 std::cout <<std::setprecision(16)<<
"mismatch detected at index ( "<<
i <<
" , " <<
j <<
", " << k <<
" ) " <<
" \t " <<
result(
i,
j,k) <<
" vs "<< result_host(
i,
j,k) << std::endl;
// Release all device allocations made above.
89 sycl_device.deallocate(d_input);
90 sycl_device.deallocate(d_kernel);
91 sycl_device.deallocate(d_result);
// ---------------------------------------------------------------------------
// NOTE(review): garbled extraction fragment of (presumably) test_larg_expr2D.
// Leading numerals are embedded original line numbers.  The signature body,
// the remaining dimension declarations (e.g. indim0), tensor construction
// and the VERIFY call are missing from this view.
// ---------------------------------------------------------------------------
96 template <
typename DataType,
int DataLayout,
typename IndexType>
// Visible input/output extents for this large-expression test case.
100 IndexType indim1= 55;
101 IndexType indim2= 51;
102 IndexType outdim0=50;
103 IndexType outdim1=51;
104 IndexType outdim2=51;
// Allocate device buffers for input, kernel and result.
125 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
126 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
127 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload host data to the device.
132 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
133 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Device-side convolution over the dimensions listed in dims3 (declared in
// the missing span), then copy the result back to the host.
135 gpu_result.
device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
136 sycl_device.memcpyDeviceToHost(result.
data(), d_result, result_bytes);
// Host-side reference convolution.
138 result_host=input.convolve(kernel, dims3);
// Compare device vs host output element-wise; mismatch diagnostics below.
// (The actual comparison/VERIFY statement is missing from this fragment.)
140 for(IndexType
i=0;
i< outdim0;
i++ ){
141 for(IndexType
j=0;
j< outdim1;
j++ ){
142 for(IndexType k=0; k< outdim2; k++ ){
144 std::cout <<std::setprecision(16)<<
"mismatch detected at index ( "<<
i <<
" , " <<
j <<
", " << k <<
" ) " <<
" \t " <<
result(
i,
j,k) <<
" vs "<< result_host(
i,
j,k) << std::endl;
// Free the device buffers.
150 sycl_device.deallocate(d_input);
151 sycl_device.deallocate(d_kernel);
152 sycl_device.deallocate(d_result);
// ---------------------------------------------------------------------------
// NOTE(review): garbled extraction fragment of (presumably) test_larg_expr3D.
// Leading numerals are embedded original line numbers.  The signature body,
// tensor construction, dims3 declaration and the VERIFY call are missing
// from this view.
// ---------------------------------------------------------------------------
157 template <
typename DataType,
int DataLayout,
typename IndexType>
// Input extents 53x55x51 and output extents 50x51x49 for this test case.
160 IndexType indim0 =53;
161 IndexType indim1= 55;
162 IndexType indim2= 51;
163 IndexType outdim0=50;
164 IndexType outdim1=51;
165 IndexType outdim2=49;
// Allocate device buffers for input, kernel and result.
186 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
187 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
188 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload host data to the device.
193 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
194 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Device-side convolution, then copy the result back to the host.
196 gpu_result.
device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
197 sycl_device.memcpyDeviceToHost(result.
data(), d_result, result_bytes);
// Host-side reference convolution.
199 result_host=input.convolve(kernel, dims3);
// Element-wise device-vs-host comparison with mismatch diagnostics.
// (The actual comparison/VERIFY statement is missing from this fragment.)
201 for(IndexType
i=0;
i< outdim0;
i++ ){
202 for(IndexType
j=0;
j< outdim1;
j++ ){
203 for(IndexType k=0; k< outdim2; k++ ){
205 std::cout <<std::setprecision(16)<<
"mismatch detected at index ( "<<
i <<
" , " <<
j <<
", " << k <<
" ) " <<
" \t " <<
result(
i,
j,k) <<
" vs "<< result_host(
i,
j,k) << std::endl;
// Free the device buffers.
211 sycl_device.deallocate(d_input);
212 sycl_device.deallocate(d_kernel);
213 sycl_device.deallocate(d_result);
// ---------------------------------------------------------------------------
// NOTE(review): garbled extraction fragment of (presumably) test_evals.
// Leading numerals are embedded original line numbers.  Tensor setup, dims3
// declaration and the verification code (original lines ~251-258) are
// missing from this view.
// ---------------------------------------------------------------------------
218 template <
typename DataType,
int DataLayout,
typename IndexType>
// Allocate device buffers for input, kernel and result.
239 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
240 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
241 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload host data to the device.
246 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
247 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Device-side convolution, then copy the result back to the host.
// (The result checks that follow in the original are missing here.)
249 gpu_result.
device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
250 sycl_device.memcpyDeviceToHost(result.
data(), d_result, result_bytes);
// Free the device buffers.
259 sycl_device.deallocate(d_input);
260 sycl_device.deallocate(d_kernel);
261 sycl_device.deallocate(d_result);
// ---------------------------------------------------------------------------
// NOTE(review): garbled extraction fragment of test_expr (signature visible
// at original line 265).  Leading numerals are embedded original line
// numbers.  Tensor construction and the first half of each VERIFY_IS_APPROX
// expression (original even lines 298-304) are missing from this view; only
// the second operand row of each dot product survives below.
// ---------------------------------------------------------------------------
264 template <
typename DataType,
int DataLayout,
typename IndexType>
265 static void test_expr(
const Eigen::SyclDevice& sycl_device)
// Allocate device buffers for input, kernel and result.
285 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
286 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
287 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload host data to the device.
292 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
293 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Device-side convolution over dims, then copy the result back.
295 gpu_result.
device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
296 sycl_device.memcpyDeviceToHost(result.
data(), d_result, result_bytes);
// Trailing rows of the expected hand-computed dot products; each line is
// the tail of a VERIFY_IS_APPROX whose head is missing from this fragment.
299 input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
301 input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
303 input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
305 input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
// Free the device buffers.
307 sycl_device.deallocate(d_input);
308 sycl_device.deallocate(d_kernel);
309 sycl_device.deallocate(d_result);
// ---------------------------------------------------------------------------
// NOTE(review): garbled extraction fragment of test_modes (signature visible
// at original line 314).  Leading numerals are embedded original line
// numbers.  Tensor/padding declarations, dims, byte-size computations and
// the VERIFY blocks are missing from this view.  The three padding settings
// below correspond -- judging by the buffer names d_valid / d_same / d_full
// -- to valid, same and full convolution modes; confirm against upstream.
// ---------------------------------------------------------------------------
313 template <
typename DataType,
int DataLayout,
typename IndexType>
314 static void test_modes(
const Eigen::SyclDevice& sycl_device){
// No padding: "valid" mode.
338 padding[0] = std::make_pair(0, 0);
345 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
346 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
347 DataType * d_valid =
static_cast<DataType*
>(sycl_device.allocate(valid_bytes));
// Upload host data to the device.
352 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
353 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Pad then convolve on the device; copy the "valid" result back.
355 gpu_valid.
device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
356 sycl_device.memcpyDeviceToHost(valid.
data(), d_valid, valid_bytes);
// One element of padding on each side: "same" mode.
363 padding[0] = std::make_pair(1, 1);
366 DataType * d_same =
static_cast<DataType*
>(sycl_device.allocate(same_bytes));
368 gpu_same.
device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
369 sycl_device.memcpyDeviceToHost(same.
data(), d_same, same_bytes);
// Two elements of padding on each side: "full" mode.
378 padding[0] = std::make_pair(2, 2);
382 DataType * d_full =
static_cast<DataType*
>(sycl_device.allocate(full_bytes));
384 gpu_full.
device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
385 sycl_device.memcpyDeviceToHost(full.
data(), d_full, full_bytes);
// Free all five device buffers.
394 sycl_device.deallocate(d_input);
395 sycl_device.deallocate(d_kernel);
396 sycl_device.deallocate(d_valid);
397 sycl_device.deallocate(d_same);
398 sycl_device.deallocate(d_full);
// ---------------------------------------------------------------------------
// NOTE(review): garbled extraction fragment of (presumably) test_strides.
// Leading numerals are embedded original line numbers.  Tensor setup, the
// stride_of_3 / stride_of_2 declarations, dims, and the heads of the
// VERIFY expressions are missing from this view.
// ---------------------------------------------------------------------------
402 template <
typename DataType,
int DataLayout,
typename IndexType>
// Allocate device buffers for input, kernel and result.
426 DataType * d_input =
static_cast<DataType*
>(sycl_device.allocate(input_bytes));
427 DataType * d_kernel =
static_cast<DataType*
>(sycl_device.allocate(kernel_bytes));
428 DataType * d_result =
static_cast<DataType*
>(sycl_device.allocate(result_bytes));
// Upload host data to the device.
433 sycl_device.memcpyHostToDevice(d_input, input.
data(), input_bytes);
434 sycl_device.memcpyHostToDevice(d_kernel, kernel.
data(), kernel_bytes);
// Stride the input, convolve, then stride the output -- all on the device.
436 gpu_result.
device(sycl_device)=gpu_input.
stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
437 sycl_device.memcpyDeviceToHost(result.
data(), d_result, result_bytes);
// Tails of the expected-value checks; the VERIFY heads (original lines
// ~439-442) are missing from this fragment.
441 input(6)*kernel(2)));
443 input(12)*kernel(2)));
// ---------------------------------------------------------------------------
// NOTE(review): fragment of tensorConvolutionPerDevice -- builds a SyclDevice
// from the given device selector and runs every convolution sub-test in both
// RowMajor and ColMajor layouts with float data and int64_t indices.
// Leading numerals are embedded original line numbers; the function
// signature and closing brace are missing from this view.
// ---------------------------------------------------------------------------
447 QueueInterface queueInterface(s);
448 auto sycl_device=Eigen::SyclDevice(&queueInterface);
// Large-expression tests over 1D/2D/3D convolution dimensions.
449 test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
450 test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
451 test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
452 test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
453 test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
454 test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
// Evaluation, expression, padding-mode and stride tests.
455 test_evals<float, ColMajor, int64_t>(sycl_device);
456 test_evals<float, RowMajor, int64_t>(sycl_device);
457 test_expr<float, ColMajor, int64_t>(sycl_device);
458 test_expr<float, RowMajor, int64_t>(sycl_device);
459 test_modes<float, ColMajor, int64_t>(sycl_device);
460 test_modes<float, RowMajor, int64_t>(sycl_device);
461 test_strides<float, ColMajor, int64_t>(sycl_device);
462 test_strides<float, RowMajor, int64_t>(sycl_device);
// NOTE(review): fragment of the test entry point -- iterates over every
// SYCL device Eigen reports as supported; the loop body (presumably the
// CALL_SUBTEST dispatch to tensorConvolutionPerDevice) and the enclosing
// EIGEN_DECLARE_TEST braces are missing from this view.
466 for (
const auto& device :Eigen::get_sycl_supported_devices()) {
EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const
void tensorConvolutionPerDevice(Dev_selector &s)
static void test_strides(const Eigen::SyclDevice &sycl_device)
static void test_larg_expr3D(const Eigen::SyclDevice &sycl_device)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor< Scalar_, NumIndices_, Options_, IndexType_ > & setRandom()
static void test_larg_expr2D(const Eigen::SyclDevice &sycl_device)
#define VERIFY_IS_APPROX(a, b)
#define VERIFY_IS_EQUAL(a, b)
A tensor expression mapping an existing array of data.
static void test_larg_expr1D(const Eigen::SyclDevice &sycl_device)
static void test_modes(const Eigen::SyclDevice &sycl_device)
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
Array< double, 1, 3 > e(1./3., 0.5, 2.)
TensorDevice< TensorMap< PlainObjectType, Options_, MakePointer_ >, DeviceType > device(const DeviceType &dev)
static const float error_threshold
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorStridingOp< const Strides, const TensorMap< PlainObjectType, Options_, MakePointer_ > > stride(const Strides &strides) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar * data()
static void test_expr(const Eigen::SyclDevice &sycl_device)
#define CALL_SUBTEST(FUNC)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const
static void test_evals(const Eigen::SyclDevice &sycl_device)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions & dimensions() const
EIGEN_DEVICE_FUNC bool isApprox(const Scalar &x, const Scalar &y, const typename NumTraits< Scalar >::Real &precision=NumTraits< Scalar >::dummy_precision())
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor< Scalar_, NumIndices_, Options_, IndexType_ > & setZero()