#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::SyclDevice;
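// test_simple_reshape: on the SYCL device, a rank-5 tensor is reshaped into rank-3 and
// rank-2 shapes, copied back to the host, and compared element by element against the
// original tensor.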
template <typename DataType, int DataLayout, typename IndexType>
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
  DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType)));
  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(), (tensor1.size())*sizeof(DataType));
  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2, (tensor1.size())*sizeof(DataType));
  sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3, (tensor3.size())*sizeof(DataType));
  sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4, (tensor4.size())*sizeof(DataType));
  for (IndexType i = 0; i < 2; ++i){
    for (IndexType j = 0; j < 3; ++j){
      for (IndexType k = 0; k < 7; ++k){
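        // VERIFY_IS_EQUAL checks comparing tensor1 against the reshaped tensor2,
        // tensor3 and tensor4 at each (i, j, k).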
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
  sycl_device.deallocate(gpu_data4);
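// test_reshape_as_lvalue: the reshape expression appears on the left-hand side of the
// device assignment, writing a rank-3 source tensor into rank-2 and rank-5 destinations.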
template<typename DataType, int DataLayout, typename IndexType>
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType)));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType)));
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), (tensor.size())*sizeof(DataType));
  gpu2.reshape(dim1).device(sycl_device)=gpu1;
  sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2, (tensor2d.size())*sizeof(DataType));
  gpu3.reshape(dim1).device(sycl_device)=gpu1;
  sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3, (tensor5d.size())*sizeof(DataType));
  for (IndexType i = 0; i < 2; ++i){
    for (IndexType j = 0; j < 3; ++j){
      for (IndexType k = 0; k < 7; ++k){
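        // VERIFY_IS_EQUAL checks that tensor2d and tensor5d hold the values of the
        // source tensor at each (i, j, k).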
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
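// test_simple_slice: extracts two slices from a rank-5 tensor on the device and checks
// them against the corresponding elements of the host tensor.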
template <typename DataType, int DataLayout, typename IndexType>
  IndexType sizeDim1 = 2;
  IndexType sizeDim2 = 3;
  IndexType sizeDim3 = 5;
  IndexType sizeDim4 = 7;
  IndexType sizeDim5 = 11;
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), (tensor.size())*sizeof(DataType));
  gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2, (slice1.size())*sizeof(DataType));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
  gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3, (slice2.size())*sizeof(DataType));
  for (IndexType i = 0; i < 2; ++i) {
    for (IndexType j = 0; j < 2; ++j) {
      for (IndexType k = 0; k < 3; ++k) {
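        // VERIFY_IS_EQUAL checks comparing slice2 against the matching region of tensor.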
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
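// test_strided_slice_as_rhs_sycl: compares stridedSlice() against an equivalent plain
// slice() when both appear on the right-hand side of a device assignment.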
template <typename DataType, int DataLayout, typename IndexType>
  IndexType sizeDim1 = 2;
  IndexType sizeDim2 = 3;
  IndexType sizeDim3 = 5;
  IndexType sizeDim4 = 7;
  IndexType sizeDim5 = 11;
  Index5 indicesStart(1L,2L,3L,4L,5L);
  Index5 indicesStop(2L,3L,4L,5L,6L);
  Index5 lengths(1L,1L,1L,1L,1L);
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
  DataType* gpu_data_stride2 = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size()*sizeof(DataType)));
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), (tensor.size())*sizeof(DataType));
  gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2, (slice1.size())*sizeof(DataType));
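  // gpu_data_stride2 is filled on the device by a stridedSlice(indicesStart, indicesStop, ...)
  // expression before this copy-back.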
  sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2, (slice_stride1.size())*sizeof(DataType));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
  DataType* gpu_data_stride3 = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size()*sizeof(DataType)));
  Index5 strides2(1L,1L,1L,1L,1L);
  Index5 indicesStart2(1L,1L,3L,4L,5L);
  Index5 indicesStop2(2L,2L,5L,6L,8L);
  gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3, (slice2.size())*sizeof(DataType));
  gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2);
  sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3, (strideSlice2.size())*sizeof(DataType));
  for (IndexType i = 0; i < 2; ++i) {
    for (IndexType j = 0; j < 2; ++j) {
      for (IndexType k = 0; k < 3; ++k) {
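        // VERIFY_IS_EQUAL checks that slice2 and strideSlice2 agree with the source tensor.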
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
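// test_strided_slice_write_sycl: writes a small block into a 2-D tensor through both
// slice() and stridedSlice() used as lvalues, then checks that the two results match.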
template<typename DataType, int DataLayout, typename IndexType>
  IndexType sizeDim1 = 7L;
  IndexType sizeDim2 = 11L;
  IndexType sliceDim1 = 2;
  IndexType sliceDim2 = 3;
  Tensor2f slice(sliceRange);
  Index2 indicesStart(3L,4L);
  Index2 indicesStop(5L,7L);
  Index2 lengths(2L,3L);
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType)));
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), (tensor.size())*sizeof(DataType));
  gpu2.device(sycl_device)=gpu1;
  sycl_device.memcpyHostToDevice(gpu_data3, slice.data(), (slice.size())*sizeof(DataType));
  gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3;
  gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3;
  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1, (tensor.size())*sizeof(DataType));
  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2, (tensor2.size())*sizeof(DataType));
  for(IndexType i = 0; i < sizeDim1; i++)
    for(IndexType j = 0; j < sizeDim2; j++){
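      // VERIFY_IS_EQUAL check that tensor and tensor2 agree at (i, j): the slice() and
      // stridedSlice() writes must produce identical results.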
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
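// To32BitDims: converts a DSizes dimension object into an Eigen::array of narrower
// indices, so the same tensor data can be addressed through TensorMaps with a 32-bit
// index type.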
template <typename OutIndex, typename DSizes>
Eigen::array<OutIndex, DSizes::count> To32BitDims(const DSizes& in) {
  Eigen::array<OutIndex, DSizes::count> out;
  for (int i = 0; i < DSizes::count; ++i) {
    out[i] = in[i];
  }
  return out;
}
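// run_eigen: performs the same slice assignment on the device and on the host through
// TensorMaps that reinterpret the data with a narrower (32-bit) index type, then
// verifies that both sides produce identical results.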
template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType>
  TensorI64 out_tensor_gpu(tensor_range);
  TensorI64 out_tensor_cpu(tensor_range);
  out_tensor_cpu.setRandom();

  TensorI64 sub_tensor(slice_range);
  sub_tensor.setRandom();
  DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType)));
  DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType)));
  TensorMI64 out_gpu(out_gpu_data, tensor_range);
  TensorMI64 sub_gpu(sub_gpu_data, slice_range);
  sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType));
  sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType));
  TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions()));
  TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions()));
  TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions()));
  TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions()));
  out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32;

  out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32;
  sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType));
  for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) {
    auto exp = out_tensor_cpu(i);
    auto val = out_tensor_gpu(i);
    if (val != exp) {
      std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl;
    }
    VERIFY_IS_EQUAL(val, exp);
  }
  sycl_device.deallocate(out_gpu_data);
  sycl_device.deallocate(sub_gpu_data);
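// sycl_morphing_test_per_device: builds a SyclDevice from the given device selector and
// runs every morphing test in both row-major and column-major layouts.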
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
  test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
  test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
  test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
  test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
  test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device);
  run_eigen<float, RowMajor, long, int>(sycl_device);
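// EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl): registers the test and runs the
// per-device driver on every supported SYCL device.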
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
  }