#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::SyclDevice;
 
template <typename DataType, int DataLayout, typename IndexType>
 
  // Allocate one device buffer per host tensor.
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size() * sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size() * sizeof(DataType)));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size() * sizeof(DataType)));
  DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size() * sizeof(DataType)));
 
  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(), tensor1.size() * sizeof(DataType));
  gpu2.device(sycl_device) = gpu1.reshape(dim2);  // reshape as rvalue, evaluated on the device
 
  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2, tensor1.size() * sizeof(DataType));
 
  gpu3.device(sycl_device) = gpu1.reshape(dim3);
 
  sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3, tensor3.size() * sizeof(DataType));
 
  gpu4.device(sycl_device) = gpu1.reshape(dim2).reshape(dim4);  // chained reshapes compose into one view
 
  sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4, tensor4.size() * sizeof(DataType));
 
  for (IndexType i = 0; i < 2; ++i) {
    for (IndexType j = 0; j < 3; ++j) {
      for (IndexType k = 0; k < 7; ++k) {
 
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
  sycl_device.deallocate(gpu_data4);
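
// A minimal host-side sketch of the property exercised above, assuming only
// the public Eigen::Tensor API (hypothetical helper, not called by the test
// driver): reshape() never moves data, it only reinterprets the linear
// storage, so source and reshaped view agree element-wise in linear order.
static void reshape_linear_order_sketch() {
  Eigen::Tensor<float, 3> t(2, 3, 7);
  t.setRandom();
  Eigen::DSizes<Eigen::DenseIndex, 2> dim2(6, 7);
  Eigen::Tensor<float, 2> r = t.reshape(dim2);
  for (Eigen::DenseIndex i = 0; i < t.size(); ++i) {
    VERIFY_IS_EQUAL(r.data()[i], t.data()[i]);  // same linear layout
  }
}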
 
template <typename DataType, int DataLayout, typename IndexType>
 
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size() * sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size() * sizeof(DataType)));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size() * sizeof(DataType)));
 
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), tensor.size() * sizeof(DataType));
 
  gpu2.reshape(dim1).device(sycl_device) = gpu1;  // reshape as lvalue: write through the view
 
  sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2, tensor2d.size() * sizeof(DataType));
 
  gpu3.reshape(dim1).device(sycl_device) = gpu1;
 
  sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3, tensor5d.size() * sizeof(DataType));
 
  for (IndexType i = 0; i < 2; ++i) {
    for (IndexType j = 0; j < 3; ++j) {
      for (IndexType k = 0; k < 7; ++k) {
 
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
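
// A minimal host-side sketch of the lvalue form used above, assuming only
// the public Eigen::Tensor API (hypothetical helper, not called by the test
// driver): reshape() on the left-hand side writes through the view into the
// destination tensor's own storage.
static void reshape_as_lvalue_sketch() {
  Eigen::Tensor<float, 1> src(42);
  src.setRandom();
  Eigen::Tensor<float, 2> dst(6, 7);
  Eigen::DSizes<Eigen::DenseIndex, 1> dim1(42);
  dst.reshape(dim1) = src;  // writes through the 1-D view of dst
  for (Eigen::DenseIndex i = 0; i < src.size(); ++i) {
    VERIFY_IS_EQUAL(dst.data()[i], src.data()[i]);
  }
}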
 
template <typename DataType, int DataLayout, typename IndexType>
 
  IndexType sizeDim1 = 2;
  IndexType sizeDim2 = 3;
  IndexType sizeDim3 = 5;
  IndexType sizeDim4 = 7;
  IndexType sizeDim5 = 11;
 
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size() * sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size() * sizeof(DataType)));
 
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), tensor.size() * sizeof(DataType));
 
  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2, slice1.size() * sizeof(DataType));
 
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size() * sizeof(DataType)));
 
  gpu3.device(sycl_device) = gpu1.slice(indices2, sizes2);
 
  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3, slice2.size() * sizeof(DataType));
 
  for (IndexType i = 0; i < 2; ++i) {
    for (IndexType j = 0; j < 2; ++j) {
      for (IndexType k = 0; k < 3; ++k) {
 
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
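
// A minimal sketch of the slice semantics checked above, assuming only the
// public Eigen::Tensor API (hypothetical helper, not called by the test
// driver): slice(offsets, extents) selects the half-open block
// [offsets[d], offsets[d] + extents[d]) in each dimension d.
static void slice_semantics_sketch() {
  Eigen::Tensor<float, 2> t(7, 11);
  t.setRandom();
  Eigen::DSizes<Eigen::DenseIndex, 2> offsets(3, 4);
  Eigen::DSizes<Eigen::DenseIndex, 2> extents(2, 3);
  Eigen::Tensor<float, 2> s = t.slice(offsets, extents);
  for (Eigen::DenseIndex i = 0; i < 2; ++i) {
    for (Eigen::DenseIndex j = 0; j < 3; ++j) {
      VERIFY_IS_EQUAL(s(i, j), t(i + 3, j + 4));
    }
  }
}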
 
template <typename DataType, int DataLayout, typename IndexType>
 
  IndexType sizeDim1 = 2;
  IndexType sizeDim2 = 3;
  IndexType sizeDim3 = 5;
  IndexType sizeDim4 = 7;
  IndexType sizeDim5 = 11;
 
  Index5 indicesStart(1L, 2L, 3L, 4L, 5L);
  Index5 indicesStop(2L, 3L, 4L, 5L, 6L);
  Index5 lengths(1L, 1L, 1L, 1L, 1L);
 
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size() * sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size() * sizeof(DataType)));
  DataType* gpu_data_stride2 = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size() * sizeof(DataType)));
 
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), tensor.size() * sizeof(DataType));
 
  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2, slice1.size() * sizeof(DataType));
 
  gpu_stride2.device(sycl_device) = gpu1.stridedSlice(indicesStart, indicesStop, strides);
 
  sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2, slice_stride1.size() * sizeof(DataType));
 
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size() * sizeof(DataType)));
  DataType* gpu_data_stride3 = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size() * sizeof(DataType)));
 
  Index5 strides2(1L, 1L, 1L, 1L, 1L);
  Index5 indicesStart2(1L, 1L, 3L, 4L, 5L);
  Index5 indicesStop2(2L, 2L, 5L, 6L, 8L);
 
  gpu3.device(sycl_device) = gpu1.slice(indices2, sizes2);
 
  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3, slice2.size() * sizeof(DataType));
 
  gpu_stride3.device(sycl_device) = gpu1.stridedSlice(indicesStart2, indicesStop2, strides2);
 
  sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3, strideSlice2.size() * sizeof(DataType));
 
  for (IndexType i = 0; i < 2; ++i) {
    for (IndexType j = 0; j < 2; ++j) {
      for (IndexType k = 0; k < 3; ++k) {
 
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
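
// A minimal sketch of stridedSlice(start, stop, strides), assuming only the
// public Eigen::Tensor API (hypothetical helper, not called by the test
// driver): dimension d keeps the indices start[d], start[d] + strides[d], ...
// that stay strictly below stop[d], so with unit strides it degenerates to
// slice(start, stop - start).
static void strided_slice_semantics_sketch() {
  Eigen::Tensor<float, 1> t(11);
  t.setRandom();
  Eigen::DSizes<Eigen::DenseIndex, 1> start(2), stop(9), stride(3);
  Eigen::Tensor<float, 1> s = t.stridedSlice(start, stop, stride);
  VERIFY_IS_EQUAL(s.dimension(0), static_cast<Eigen::DenseIndex>(3));  // keeps 2, 5, 8
  for (Eigen::DenseIndex k = 0; k < s.dimension(0); ++k) {
    VERIFY_IS_EQUAL(s(k), t(2 + 3 * k));
  }
}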
 
template <typename DataType, int DataLayout, typename IndexType>
 
  IndexType sizeDim1 = 7L;
  IndexType sizeDim2 = 11L;
 
  IndexType sliceDim1 = 2;
  IndexType sliceDim2 = 3;
 
  Tensor2f slice(sliceRange);
 
  Index2 indicesStart(3L, 4L);
  Index2 indicesStop(5L, 7L);
  Index2 lengths(2L, 3L);
 
  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size() * sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size() * sizeof(DataType)));
  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice.size() * sizeof(DataType)));
 
  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), tensor.size() * sizeof(DataType));
 
  gpu2.device(sycl_device) = gpu1;
 
  sycl_device.memcpyHostToDevice(gpu_data3, slice.data(), slice.size() * sizeof(DataType));
 
  gpu1.slice(indicesStart, lengths).device(sycl_device) = gpu3;  // slice as lvalue
 
  gpu2.stridedSlice(indicesStart, indicesStop, strides).device(sycl_device) = gpu3;  // strided slice as lvalue
 
  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1, tensor.size() * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2, tensor2.size() * sizeof(DataType));
 
  for (IndexType i = 0; i < sizeDim1; i++)
    for (IndexType j = 0; j < sizeDim2; j++) {
 
  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
  sycl_device.deallocate(gpu_data3);
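
// A minimal sketch of the write path tested above, assuming only the public
// Eigen::Tensor API (hypothetical helper, not called by the test driver): a
// slice used as an lvalue overwrites just the selected block, and a plain
// slice write matches the equivalent unit-stride stridedSlice write.
static void slice_write_sketch() {
  Eigen::Tensor<float, 2> a(7, 11), b(7, 11);
  a.setZero();
  b.setZero();
  Eigen::Tensor<float, 2> block(2, 3);
  block.setRandom();
  Eigen::DSizes<Eigen::DenseIndex, 2> start(3, 4), stop(5, 7), stride(1, 1), lengths(2, 3);
  a.slice(start, lengths) = block;              // plain slice as lvalue
  b.stridedSlice(start, stop, stride) = block;  // unit-stride equivalent
  for (Eigen::DenseIndex i = 0; i < 7; ++i) {
    for (Eigen::DenseIndex j = 0; j < 11; ++j) {
      VERIFY_IS_EQUAL(a(i, j), b(i, j));
    }
  }
}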
 
template <typename OutIndex, typename DSizes>
 
  for (int i = 0; i < DSizes::count; ++i) {
 
template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType>
 
  TensorI64 out_tensor_gpu(tensor_range);
  TensorI64 out_tensor_cpu(tensor_range);
  out_tensor_cpu.setRandom();

  TensorI64 sub_tensor(slice_range);
  sub_tensor.setRandom();
 
  DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType)));
  DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType)));
 
  TensorMI64 out_gpu(out_gpu_data, tensor_range);
  TensorMI64 sub_gpu(sub_gpu_data, slice_range);
 
  sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType));
  sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType));
 
  // Reinterpret the same buffers through maps with narrowed 32-bit indices.
  TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions()));
  TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions()));
  TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions()));
  TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions()));
 
  out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32;

  out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32;
 
  sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType));
 
  for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) {
    auto exp = out_tensor_cpu(i);
    auto val = out_tensor_gpu(i);
    if (val != exp) {
      std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl;
 
  sycl_device.deallocate(out_gpu_data);
  sycl_device.deallocate(sub_gpu_data);
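
// A minimal sketch of the index-narrowing trick used above, assuming the
// To32BitDims helper from this file (hypothetical sketch function, not called
// by the test driver): a TensorMap with a smaller index type reinterprets the
// same buffer; only the indexing arithmetic changes width, the data does not.
static void narrow_index_map_sketch() {
  Eigen::Tensor<float, 2, Eigen::RowMajor, int64_t> t(4, 5);
  t.setRandom();
  const Eigen::DSizes<int, 2> dims32 = To32BitDims<int>(t.dimensions());
  Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, int>> view32(t.data(), dims32);
  VERIFY_IS_EQUAL(view32(1, 2), t(1, 2));
}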
 
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
 
  test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
  test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
  test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
  test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
  test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
  test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device);
  run_eigen<float, RowMajor, long, int>(sycl_device);
 
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));