cxx11_tensor_sycl.cpp
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;

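// Round-trips a random tensor through two device buffers: each buffer is
// scaled on the device (by 3.14f and 2.7f) and copied back to the host to
// check that host<->device transfers and device-side expressions agree.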
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
  IndexType sizeDim1 = 5;
  IndexType sizeDim2 = 5;
  IndexType sizeDim3 = 1;
  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
  Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange);

  in1 = in1.random();

  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);

  sycl_device.memcpyHostToDevice(gpu_data1, in1.data(), (in1.size())*sizeof(DataType));
  sycl_device.memcpyHostToDevice(gpu_data2, in1.data(), (in1.size())*sizeof(DataType));
  gpu1.device(sycl_device) = gpu1 * 3.14f;
  gpu2.device(sycl_device) = gpu2 * 2.7f;
  sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1, (out1.size())*sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1, (out2.size())*sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2, (out3.size())*sizeof(DataType));
  sycl_device.synchronize();

  for (IndexType i = 0; i < in1.size(); ++i) {
    // std::cout << "SYCL DATA : " << out1(i) << " vs CPU DATA : " << in1(i) * 3.14f << "\n";
    VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
    VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
    VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
  }

  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
}

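// Copies host data to the device, zeroes the host source, then copies the
// device buffer back and compares against a saved copy, checking that the
// device memory is unaffected by later host writes.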
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
  IndexType size = 20;
  array<IndexType, 1> tensorRange = {{size}};
  Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange);

  in1 = in1.random();
  in2 = in1;

  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));

  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange);
  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), (in1.size())*sizeof(DataType));
  sycl_device.synchronize();
  in1.setZero();

  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType));
  sycl_device.synchronize();

  for (IndexType i = 0; i < in1.size(); ++i) {
    VERIFY_IS_APPROX(out(i), in2(i));
  }

  sycl_device.deallocate(gpu_data);
}

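// Exercises host<->device and device-to-device copies at non-zero offsets by
// swapping the two halves of a buffer in three different ways.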
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) {
  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
  IndexType full_size = 32;
  IndexType half_size = full_size / 2;
  array<IndexType, 1> tensorRange = {{full_size}};
  tensor_type in1(tensorRange);
  tensor_type out(tensorRange);

  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);

  in1 = in1.random();
  // Copy all data to device, then permute on copy back to host
  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType));

  for (IndexType i = 0; i < half_size; ++i) {
    VERIFY_IS_APPROX(out(i), in1(i + half_size));
    VERIFY_IS_APPROX(out(i + half_size), in1(i));
  }

  in1 = in1.random();
  out.setZero();
  // Permute copies to device, then copy all back to host
  sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType));
  sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));

  for (IndexType i = 0; i < half_size; ++i) {
    VERIFY_IS_APPROX(out(i), in1(i + half_size));
    VERIFY_IS_APPROX(out(i + half_size), in1(i));
  }

  in1 = in1.random();
  out.setZero();
  DataType* gpu_data_out = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
  TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange);
  // Copy all to device, permute copies on device, then copy all back to host
  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
  sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType));
  sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType));

  for (IndexType i = 0; i < half_size; ++i) {
    VERIFY_IS_APPROX(out(i), in1(i + half_size));
    VERIFY_IS_APPROX(out(i + half_size), in1(i));
  }

  sycl_device.deallocate(gpu_data_out);
  sycl_device.deallocate(gpu_data);
}

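// Fills the two halves of a device buffer with different byte patterns using
// offset memset calls and compares against the same std::memset pattern
// applied on the host.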
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) {
  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
  IndexType full_size = 32;
  IndexType half_size = full_size / 2;
  array<IndexType, 1> tensorRange = {{full_size}};
  tensor_type cpu_out(tensorRange);
  tensor_type out(tensorRange);

  cpu_out.setZero();

  std::memset(cpu_out.data(), 0, half_size * sizeof(DataType));
  std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType));

  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);

  sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType));
  sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));

  for (IndexType i = 0; i < full_size; ++i) {
    VERIFY_IS_APPROX(out(i), cpu_out(i));
  }

  sycl_device.deallocate(gpu_data);
}

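// Evaluates a set of element-wise expressions (constant fill, scalar scaling,
// products, sums and a select) on the device and verifies each result against
// the equivalent host computation.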
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {

  IndexType sizeDim1 = 100;
  IndexType sizeDim2 = 10;
  IndexType sizeDim3 = 20;
  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
  Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> in3(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);

  in2 = in2.random();
  in3 = in3.random();

  DataType* gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
  DataType* gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType)));
  DataType* gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange);
  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);

  // a=1.2f
  gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
  sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data, (in1.size())*sizeof(DataType));
  sycl_device.synchronize();

  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
      }
    }
  }
  printf("a=1.2f Test passed\n");

  // a=b*1.2f
  gpu_out.device(sycl_device) = gpu_in1 * 1.2f;
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size())*sizeof(DataType));
  sycl_device.synchronize();

  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 1.2f);
      }
    }
  }
  printf("a=b*1.2f Test Passed\n");

  // c=a*b
  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(), (in2.size())*sizeof(DataType));
  gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size())*sizeof(DataType));
  sycl_device.synchronize();

  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in2(i,j,k));
      }
    }
  }
  printf("c=a*b Test Passed\n");

  // c=a+b
  gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size())*sizeof(DataType));
  sycl_device.synchronize();
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k));
      }
    }
  }
  printf("c=a+b Test Passed\n");

  // c=a*a
  gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size())*sizeof(DataType));
  sycl_device.synchronize();
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in1(i,j,k));
      }
    }
  }
  printf("c= a*a Test Passed\n");

  // a*3.14f + b*2.7f
  gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size())*sizeof(DataType));
  sycl_device.synchronize();
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 3.14f + in2(i,j,k) * 2.7f);
      }
    }
  }
  printf("a*3.14f + b*2.7f Test Passed\n");

  // d = (a>0.5f ? b : c)
  sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(), (in3.size())*sizeof(DataType));
  gpu_out.device(sycl_device) = (gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size())*sizeof(DataType));
  sycl_device.synchronize();
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f) ? in2(i, j, k) : in3(i, j, k));
      }
    }
  }
  printf("d= (a>0.5? b:c) Test Passed\n");
  sycl_device.deallocate(gpu_in1_data);
  sycl_device.deallocate(gpu_in2_data);
  sycl_device.deallocate(gpu_in3_data);
  sycl_device.deallocate(gpu_out_data);
}
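
// Casts a tensor from Scalar1 to Scalar2 on the device and compares the
// result with the same cast performed on the host.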
template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType>
static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
  IndexType size = 20;
  array<IndexType, 1> tensorRange = {{size}};
  Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange);
  Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange);
  Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange);

  in = in.random();

  Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
  Scalar2* gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));

  TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange);
  TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.size())*sizeof(Scalar1));
  gpu_out.device(sycl_device) = gpu_in.template cast<Scalar2>();
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
  out_host = in.template cast<Scalar2>();
  for (IndexType i = 0; i < size; i++) {
    VERIFY_IS_APPROX(out(i), out_host(i));
  }
  printf("cast Test Passed\n");
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}
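
// Runs every sub-test on one device, covering both row-major and column-major
// layouts.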
template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
  test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
  test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
  test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device);
  test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
}

EIGEN_DECLARE_TEST(cxx11_tensor_sycl) {
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
  }
}