#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include <Eigen/CXX11/Tensor>
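
// SYCL tests for the tensor chip() operation: static (compile-time) and dynamic
// chip dimensions, chip used inside a larger expression, and chip as an lvalue.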
template <typename DataType, int DataLayout, typename IndexType>
  IndexType sizeDim1 = 2;
  IndexType sizeDim2 = 3;
  IndexType sizeDim3 = 5;
  IndexType sizeDim4 = 7;
  IndexType sizeDim5 = 11;
  const size_t tensorBuffSize = tensor.size() * sizeof(DataType);
  const size_t chip1TensorBuffSize = chip1.size() * sizeof(DataType);
  DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
  DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
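
  // Copy the input tensor to the device, take a static chip along dimension 0
  // at index 1, and copy the result back to the host.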
  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
  gpu_chip1.device(sycl_device) = gpu_tensor.template chip<0l>(1l);
  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
  for (IndexType i = 0; i < sizeDim2; ++i) {
    for (IndexType j = 0; j < sizeDim3; ++j) {
      for (IndexType k = 0; k < sizeDim4; ++k) {
        for (IndexType l = 0; l < sizeDim5; ++l) {
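
  // Static chip along dimension 1 at index 1.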
  const size_t chip2TensorBuffSize = chip2.size() * sizeof(DataType);
  DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
  gpu_chip2.device(sycl_device) = gpu_tensor.template chip<1l>(1l);
  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim3; ++j) {
      for (IndexType k = 0; k < sizeDim4; ++k) {
        for (IndexType l = 0; l < sizeDim5; ++l) {
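
  // Static chip along dimension 2 at index 2.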
  const size_t chip3TensorBuffSize = chip3.size() * sizeof(DataType);
  DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
  gpu_chip3.device(sycl_device) = gpu_tensor.template chip<2l>(2l);
  sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim4; ++k) {
        for (IndexType l = 0; l < sizeDim5; ++l) {
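
  // Static chip along dimension 3 at index 5.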
  const size_t chip4TensorBuffSize = chip4.size() * sizeof(DataType);
  DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
  gpu_chip4.device(sycl_device) = gpu_tensor.template chip<3l>(5l);
  sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        for (IndexType l = 0; l < sizeDim5; ++l) {
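
  // Static chip along dimension 4 at index 7.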
  const size_t chip5TensorBuffSize = chip5.size() * sizeof(DataType);
  DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
  gpu_chip5.device(sycl_device) = gpu_tensor.template chip<4l>(7l);
  sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        for (IndexType l = 0; l < sizeDim4; ++l) {
  sycl_device.deallocate(gpu_data_tensor);
  sycl_device.deallocate(gpu_data_chip1);
  sycl_device.deallocate(gpu_data_chip2);
  sycl_device.deallocate(gpu_data_chip3);
  sycl_device.deallocate(gpu_data_chip4);
  sycl_device.deallocate(gpu_data_chip5);
template <typename DataType, int DataLayout, typename IndexType>
  IndexType sizeDim1 = 2;
  IndexType sizeDim2 = 3;
  IndexType sizeDim3 = 5;
  IndexType sizeDim4 = 7;
  IndexType sizeDim5 = 11;
  const size_t tensorBuffSize = tensor.size() * sizeof(DataType);
  const size_t chip1TensorBuffSize = chip1.size() * sizeof(DataType);
  DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
  DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
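
  // Dynamic variant: chip(offset, dim) selects the dimension at run time.
  // Chip at offset 1 along dimension 0.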
  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
  gpu_chip1.device(sycl_device) = gpu_tensor.chip(1l, 0l);
  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
  for (IndexType i = 0; i < sizeDim2; ++i) {
    for (IndexType j = 0; j < sizeDim3; ++j) {
      for (IndexType k = 0; k < sizeDim4; ++k) {
        for (IndexType l = 0; l < sizeDim5; ++l) {
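
  // Dynamic chip at offset 1 along dimension 1.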
  const size_t chip2TensorBuffSize = chip2.size() * sizeof(DataType);
  DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
  gpu_chip2.device(sycl_device) = gpu_tensor.chip(1l, 1l);
  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim3; ++j) {
      for (IndexType k = 0; k < sizeDim4; ++k) {
        for (IndexType l = 0; l < sizeDim5; ++l) {
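
  // Dynamic chip at offset 2 along dimension 2.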
  const size_t chip3TensorBuffSize = chip3.size() * sizeof(DataType);
  DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
  gpu_chip3.device(sycl_device) = gpu_tensor.chip(2l, 2l);
  sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim4; ++k) {
        for (IndexType l = 0; l < sizeDim5; ++l) {
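
  // Dynamic chip at offset 5 along dimension 3.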
  const size_t chip4TensorBuffSize = chip4.size() * sizeof(DataType);
  DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
  gpu_chip4.device(sycl_device) = gpu_tensor.chip(5l, 3l);
  sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        for (IndexType l = 0; l < sizeDim5; ++l) {
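
  // Dynamic chip at offset 7 along dimension 4.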
  const size_t chip5TensorBuffSize = chip5.size() * sizeof(DataType);
  DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
  gpu_chip5.device(sycl_device) = gpu_tensor.chip(7l, 4l);
  sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        for (IndexType l = 0; l < sizeDim4; ++l) {
  sycl_device.deallocate(gpu_data_tensor);
  sycl_device.deallocate(gpu_data_chip1);
  sycl_device.deallocate(gpu_data_chip2);
  sycl_device.deallocate(gpu_data_chip3);
  sycl_device.deallocate(gpu_data_chip4);
  sycl_device.deallocate(gpu_data_chip5);
template <typename DataType, int DataLayout, typename IndexType>
  IndexType sizeDim1 = 2;
  IndexType sizeDim2 = 3;
  IndexType sizeDim3 = 5;
  IndexType sizeDim4 = 7;
  IndexType sizeDim5 = 11;
  const size_t tensorBuffSize = tensor.size() * sizeof(DataType);
  const size_t chip1TensorBuffSize = chip1.size() * sizeof(DataType);
  DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
  DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
  DataType* gpu_data_tensor1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
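
  // Use a chip inside a larger expression: add gpu_tensor1 to the chip of
  // gpu_tensor along dimension 0 at index 0.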
  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
  sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize);
  gpu_chip1.device(sycl_device) = gpu_tensor.template chip<0l>(0l) + gpu_tensor1;
  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
  for (int i = 0; i < sizeDim2; ++i) {
    for (int j = 0; j < sizeDim3; ++j) {
      for (int k = 0; k < sizeDim4; ++k) {
        for (int l = 0; l < sizeDim5; ++l) {
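
  // Chain two chips (dimension 0 then dimension 1) and add gpu_tensor2.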
  const size_t chip2TensorBuffSize = tensor2.size() * sizeof(DataType);
  DataType* gpu_data_tensor2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
  DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
  sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize);
  gpu_chip2.device(sycl_device) = gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2;
  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
  for (int i = 0; i < sizeDim2; ++i) {
    for (int j = 0; j < sizeDim4; ++j) {
      for (int k = 0; k < sizeDim5; ++k) {
  sycl_device.deallocate(gpu_data_tensor);
  sycl_device.deallocate(gpu_data_tensor1);
  sycl_device.deallocate(gpu_data_chip1);
  sycl_device.deallocate(gpu_data_tensor2);
  sycl_device.deallocate(gpu_data_chip2);
template <typename DataType, int DataLayout, typename IndexType>
  IndexType sizeDim1 = 2;
  IndexType sizeDim2 = 3;
  IndexType sizeDim3 = 5;
  IndexType sizeDim4 = 7;
  IndexType sizeDim5 = 11;
  const size_t tensorBuffSize = tensor.size() * sizeof(DataType);
  const size_t input2TensorBuffSize = input2.size() * sizeof(DataType);
  std::cout << tensorBuffSize << " , " << input2TensorBuffSize << std::endl;
  DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
  DataType* gpu_data_input1 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
  DataType* gpu_data_input2 = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize));
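
  // Chip as lvalue: initialise gpu_tensor from gpu_input1, then overwrite the
  // slice at index 1 along dimension 0 with gpu_input2.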
  sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize);
  gpu_tensor.device(sycl_device) = gpu_input1;
  sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize);
  gpu_tensor.template chip<0l>(1l).device(sycl_device) = gpu_input2;
  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
  for (int i = 0; i < sizeDim1; ++i) {
    for (int j = 0; j < sizeDim2; ++j) {
      for (int k = 0; k < sizeDim3; ++k) {
        for (int l = 0; l < sizeDim4; ++l) {
          for (int m = 0; m < sizeDim5; ++m) {
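
  // Reset gpu_tensor and overwrite the slice at index 1 along dimension 1
  // with gpu_input3.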
  gpu_tensor.device(sycl_device) = gpu_input1;
  const size_t input3TensorBuffSize = input3.size() * sizeof(DataType);
  DataType* gpu_data_input3 = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize));
  sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize);
  gpu_tensor.template chip<1l>(1l).device(sycl_device) = gpu_input3;
  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
  for (int i = 0; i < sizeDim1; ++i) {
    for (int j = 0; j < sizeDim2; ++j) {
      for (int k = 0; k < sizeDim3; ++k) {
        for (int l = 0; l < sizeDim4; ++l) {
          for (int m = 0; m < sizeDim5; ++m) {
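
  // Reset gpu_tensor and overwrite the slice at index 3 along dimension 2
  // with gpu_input4.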
  gpu_tensor.device(sycl_device) = gpu_input1;
  const size_t input4TensorBuffSize = input4.size() * sizeof(DataType);
  DataType* gpu_data_input4 = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize));
  sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize);
  gpu_tensor.template chip<2l>(3l).device(sycl_device) = gpu_input4;
  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
  for (int i = 0; i < sizeDim1; ++i) {
    for (int j = 0; j < sizeDim2; ++j) {
      for (int k = 0; k < sizeDim3; ++k) {
        for (int l = 0; l < sizeDim4; ++l) {
          for (int m = 0; m < sizeDim5; ++m) {
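
  // Reset gpu_tensor and overwrite the slice at index 4 along dimension 3
  // with gpu_input5.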
  gpu_tensor.device(sycl_device) = gpu_input1;
  const size_t input5TensorBuffSize = input5.size() * sizeof(DataType);
  DataType* gpu_data_input5 = static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize));
  sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize);
  gpu_tensor.template chip<3l>(4l).device(sycl_device) = gpu_input5;
  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
  for (int i = 0; i < sizeDim1; ++i) {
    for (int j = 0; j < sizeDim2; ++j) {
      for (int k = 0; k < sizeDim3; ++k) {
        for (int l = 0; l < sizeDim4; ++l) {
          for (int m = 0; m < sizeDim5; ++m) {
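
  // Reset gpu_tensor and overwrite the slice at index 5 along dimension 4
  // with gpu_input6.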
  gpu_tensor.device(sycl_device) = gpu_input1;
  const size_t input6TensorBuffSize = input6.size() * sizeof(DataType);
  DataType* gpu_data_input6 = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize));
  sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize);
  gpu_tensor.template chip<4l>(5l).device(sycl_device) = gpu_input6;
  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
  for (int i = 0; i < sizeDim1; ++i) {
    for (int j = 0; j < sizeDim2; ++j) {
      for (int k = 0; k < sizeDim3; ++k) {
        for (int l = 0; l < sizeDim4; ++l) {
          for (int m = 0; m < sizeDim5; ++m) {
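
  // Reset gpu_tensor and assign one dynamic chip to another:
  // tensor.chip(0, 0) = input7.chip(0, 0).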
  gpu_tensor.device(sycl_device) = gpu_input1;
  DataType* gpu_data_input7 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
  sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize);
  gpu_tensor.chip(0l, 0l).device(sycl_device) = gpu_input7.chip(0l, 0l);
  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
  for (int i = 0; i < sizeDim1; ++i) {
    for (int j = 0; j < sizeDim2; ++j) {
      for (int k = 0; k < sizeDim3; ++k) {
        for (int l = 0; l < sizeDim4; ++l) {
          for (int m = 0; m < sizeDim5; ++m) {
  sycl_device.deallocate(gpu_data_tensor);
  sycl_device.deallocate(gpu_data_input1);
  sycl_device.deallocate(gpu_data_input2);
  sycl_device.deallocate(gpu_data_input3);
  sycl_device.deallocate(gpu_data_input4);
  sycl_device.deallocate(gpu_data_input5);
  sycl_device.deallocate(gpu_data_input6);
  sycl_device.deallocate(gpu_data_input7);
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device);
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));