#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::SyclDevice;
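
// Morphing (reshape / slice) tests for Eigen tensors on a SYCL device.
// The first test reshapes a source tensor into several other shapes on the
// device and checks the copied-back results against the host data.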
template <typename DataType, int DataLayout, typename IndexType>
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
  DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType)));
  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(), (tensor1.size())*sizeof(DataType));
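  // Reshape the device copy of tensor1 into each target shape and copy the results back to the host.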
  gpu2.device(sycl_device)=gpu1.reshape(dim2);
  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2, (tensor1.size())*sizeof(DataType));
  gpu3.device(sycl_device)=gpu1.reshape(dim3);
  sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3, (tensor3.size())*sizeof(DataType));
  gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4);
  sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4, (tensor4.size())*sizeof(DataType));
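  // Check every element of the reshaped tensors against the original data.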
  for (IndexType i = 0; i < 2; ++i){
    for (IndexType j = 0; j < 3; ++j){
      for (IndexType k = 0; k < 7; ++k){
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
  sycl_device.deallocate(gpu_data4);
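
// Use reshape() on the left-hand side of a device assignment: write a source
// tensor into 2-D and 5-D views of the destination buffers.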
template<typename DataType, int DataLayout, typename IndexType>
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType)));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType)));
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), (tensor.size())*sizeof(DataType));
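  // reshape() as an lvalue: assign gpu1 into reshaped views of gpu2 and gpu3.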
  gpu2.reshape(dim1).device(sycl_device)=gpu1;
  sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2, (tensor2d.size())*sizeof(DataType));
  gpu3.reshape(dim1).device(sycl_device)=gpu1;
  sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3, (tensor5d.size())*sizeof(DataType));
  for (IndexType i = 0; i < 2; ++i){
    for (IndexType j = 0; j < 3; ++j){
      for (IndexType k = 0; k < 7; ++k){
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
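
// Extract sub-tensors with slice() on the device and compare them with the
// host tensor.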
template <typename DataType, int DataLayout, typename IndexType>
  IndexType sizeDim1 = 2;
  IndexType sizeDim2 = 3;
  IndexType sizeDim3 = 5;
  IndexType sizeDim4 = 7;
  IndexType sizeDim5 = 11;
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), (tensor.size())*sizeof(DataType));
  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2, (slice1.size())*sizeof(DataType));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
  gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3, (slice2.size())*sizeof(DataType));
  for (IndexType i = 0; i < 2; ++i) {
    for (IndexType j = 0; j < 2; ++j) {
      for (IndexType k = 0; k < 3; ++k) {
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
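
// Strided slicing used as an rvalue: take stridedSlice() views of a rank-5
// device tensor and compare them with the corresponding plain slice() results.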
template <typename DataType, int DataLayout, typename IndexType>
  IndexType sizeDim1 = 2;
  IndexType sizeDim2 = 3;
  IndexType sizeDim3 = 5;
  IndexType sizeDim4 = 7;
  IndexType sizeDim5 = 11;
  Index5 indicesStart(1L,2L,3L,4L,5L);
  Index5 indicesStop(2L,3L,4L,5L,6L);
  Index5 lengths(1L,1L,1L,1L,1L);
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
  DataType* gpu_data_stride2 = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size()*sizeof(DataType)));
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), (tensor.size())*sizeof(DataType));
  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2, (slice1.size())*sizeof(DataType));
  gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides);
  sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2, (slice_stride1.size())*sizeof(DataType));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
  DataType* gpu_data_stride3 = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size()*sizeof(DataType)));
  Index5 strides2(1L,1L,1L,1L,1L);
  Index5 indicesStart2(1L,1L,3L,4L,5L);
  Index5 indicesStop2(2L,2L,5L,6L,8L);
  gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3, (slice2.size())*sizeof(DataType));
  gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2);
  sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3, (strideSlice2.size())*sizeof(DataType));
  for (IndexType i = 0; i < 2; ++i) {
    for (IndexType j = 0; j < 2; ++j) {
      for (IndexType k = 0; k < 3; ++k) {
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
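
// Strided slicing used as an lvalue: write a small tensor into a slice() of
// gpu1 and into the equivalent stridedSlice() of gpu2, then compare the
// results on the host.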
template<typename DataType, int DataLayout, typename IndexType>
  IndexType sizeDim1 = 7L;
  IndexType sizeDim2 = 11L;
  IndexType sliceDim1 = 2;
  IndexType sliceDim2 = 3;
  Tensor2f slice(sliceRange);
  Index2 indicesStart(3L,4L);
  Index2 indicesStop(5L,7L);
  Index2 lengths(2L,3L);
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType)));
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), (tensor.size())*sizeof(DataType));
  gpu2.device(sycl_device)=gpu1;
  sycl_device.memcpyHostToDevice(gpu_data3, slice.data(), (slice.size())*sizeof(DataType));
  gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3;
  gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3;
  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1, (tensor.size())*sizeof(DataType));
  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2, (tensor2.size())*sizeof(DataType));
  for(IndexType i=0; i<sizeDim1; i++)
    for(IndexType j=0; j<sizeDim2; j++){
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
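
// Helper that copies the dimensions of a DSizes object into a DSizes with a
// narrower index type (used below to build 32-bit index views of the tensors).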
template <typename OutIndex, typename DSizes>
  for (int i = 0; i < DSizes::count; ++i) {
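
// Perform the same slice assignment on the host and on the SYCL device, going
// through maps with a converted (narrower) index type, and compare the results.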
template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType>
  TensorI64 out_tensor_gpu(tensor_range);
  TensorI64 out_tensor_cpu(tensor_range);
  out_tensor_cpu.setRandom();

  TensorI64 sub_tensor(slice_range);
  sub_tensor.setRandom();
  DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType)));
  DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType)));
  TensorMI64 out_gpu(out_gpu_data, tensor_range);
  TensorMI64 sub_gpu(sub_gpu_data, slice_range);
  sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType));
  sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType));
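  // Reinterpret the same host and device buffers through maps that use the
  // converted index type.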
  TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions()));
  TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions()));
  TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions()));
  TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions()));

  out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32;

  out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32;
  sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType));
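  // Compare the device result with the host reference element by element.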
  for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) {
    auto exp = out_tensor_cpu(i);
    auto val = out_tensor_gpu(i);
      std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl;
  sycl_device.deallocate(out_gpu_data);
  sycl_device.deallocate(sub_gpu_data);
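
// Run every morphing test on the given device, in both row-major and
// column-major layouts.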
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
  test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
  test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
  test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
  test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
  test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device);
  run_eigen<float, RowMajor, long, int>(sycl_device);
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));