cxx11_tensor_contract_sycl.cpp
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016
5 // Mehdi Goli Codeplay Software Ltd.
6 // Ralph Potter Codeplay Software Ltd.
7 // Luke Iwanski Codeplay Software Ltd.
8 // Contact: <eigen@codeplay.com>
9 //
10 // This Source Code Form is subject to the terms of the Mozilla
11 // Public License v. 2.0. If a copy of the MPL was not distributed
12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
13 
14 #define EIGEN_TEST_NO_LONGDOUBLE
15 #define EIGEN_TEST_NO_COMPLEX
16 
17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
18 #define EIGEN_USE_SYCL
19 
20 #include <algorithm>
21 #include <chrono>
22 #include <ctime>
23 #include <iostream>
24 
25 #include "main.h"
26 
27 #include <unsupported/Eigen/CXX11/Tensor>
28 
29 using Eigen::array;
30 using Eigen::SyclDevice;
31 using Eigen::Tensor;
32 using Eigen::TensorMap;
33 
34 template <int DataLayout, typename DataType, typename IndexType,
35  typename Device>
36 void static test_sycl_contraction(const Device &sycl_device, IndexType m_size,
37  IndexType k_size, IndexType n_size) {
39  DimPair;
40  static const DataType error_threshold = DataType(1e-4);
41  // with these dimensions, the output has 300 * 140 elements, which is
42  // more than 30 * 1024, which is the number of threads in blocks on
43  // a 15 SM GK110 GPU
44  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
45  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
46  Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
47  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
48  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
49  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
50  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
51  Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
52 
53  t_left.setRandom();
54  t_right.setRandom();
55 
56  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
57  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
58  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
59 
60  DataType *d_t_left =
61  static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
62  DataType *d_t_right =
63  static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
64  DataType *d_t_result =
65  static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
66 
68  gpu_t_left(d_t_left, left_dims);
70  gpu_t_right(d_t_right, right_dims);
72  gpu_t_result(d_t_result, result_dims);
73 
74  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
75  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
76 
77  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
78  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
79  t_result_bytes);
80 
81  t_result = t_left.contract(t_right, dims);
82 
83  for (IndexType i = 0; i < t_result.size(); i++) {
84  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
85  t_result(i) - t_result_gpu(i)))) < error_threshold) {
86  continue;
87  }
88  if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
89  error_threshold)) {
90  continue;
91  }
92 
93  std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
94  << ", mismatch detected at IndexType " << i << ": " << t_result(i)
95  << " vs " << t_result_gpu(i) << std::endl;
96  VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
97  }
98  sycl_device.deallocate(d_t_left);
99  sycl_device.deallocate(d_t_right);
100  sycl_device.deallocate(d_t_result);
101 }
102 
103 template <int DataLayout, typename DataType, typename IndexType,
104  typename Device>
105 void test_sycl_contraction_m(const Device &sycl_device) {
106  for (IndexType k = 32; k < 256; k++) {
107  test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128,
108  128);
109  }
110 }
111 
112 template <int DataLayout, typename DataType, typename IndexType,
113  typename Device>
114 void test_sycl_contraction_k(const Device &sycl_device) {
115  for (IndexType k = 32; k < 256; k++) {
116  test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k,
117  128);
118  }
119 }
120 
121 template <int DataLayout, typename DataType, typename IndexType,
122  typename Device>
123 void test_sycl_contraction_n(const Device &sycl_device) {
124  for (IndexType k = 32; k < 256; k++) {
125  test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128,
126  128, k);
127  }
128 }
129 
130 template <int DataLayout, typename DataType, typename IndexType,
131  typename Device>
132 void test_sycl_contraction_sizes(const Device &sycl_device) {
133  IndexType m_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255,
134  257, 511, 512, 513, 1023, 1024, 1025};
135 
136  IndexType n_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255,
137  257, 511, 512, 513, 1023, 1024, 1025};
138 
139  IndexType k_sizes[] = {31, 39, 63, 64, 65, 95, 96, 127, 129,
140  255, 257, 511, 512, 513, 1023, 1024, 1025};
141 
142  for (IndexType i = 0; i < 15; i++) {
143  for (IndexType j = 0; j < 15; j++) {
144  for (IndexType k = 0; k < 17; k++) {
145  test_sycl_contraction<DataLayout, DataType, IndexType>(
146  sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
147  }
148  }
149  }
150 }
151 
152 template <int DataLayout, typename DataType, typename IndexType,
153  typename Device>
154 void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size,
155  IndexType k_size, IndexType n_size) {
157  DimPair;
158  static const DataType error_threshold = DataType(1e-4);
159  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
160  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
161  Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
162 
163  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
164  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
165  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
166  Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
167 
168  t_left.setRandom();
169  t_right.setRandom();
170 
171  // Allocate buffers twice as big to check for invalid read and write
172  auto padded_left_size = 2 * t_left.size();
173  auto padded_right_size = 2 * t_right.size();
174  auto padded_result_size = 2 * t_result.size();
175 
176  std::size_t t_left_bytes = padded_left_size * sizeof(DataType);
177  std::size_t t_right_bytes = padded_right_size * sizeof(DataType);
178  std::size_t t_result_bytes = padded_result_size * sizeof(DataType);
179 
180  DataType *d_t_left =
181  static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
182  DataType *d_t_right =
183  static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
184  DataType *d_t_result =
185  static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
186 
187  // TensorMaps are still of the same size than the Tensors
189  gpu_t_left(d_t_left, left_dims);
191  gpu_t_right(d_t_right, right_dims);
193  gpu_t_result(d_t_result, result_dims);
194 
195  // Write nan after the actual buffer to propagate nans everywhere in case of
196  // invalid reads
197  DataType nan = std::numeric_limits<DataType>::quiet_NaN();
198  auto host_left_data = new DataType[padded_left_size];
199  std::copy_n(t_left.data(), t_left.size(), host_left_data);
200  std::fill_n(host_left_data + t_left.size(), t_left.size(), nan);
201  auto host_right_data = new DataType[padded_right_size];
202  std::copy_n(t_right.data(), t_right.size(), host_right_data);
203  std::fill_n(host_right_data + t_right.size(), t_right.size(), nan);
204  auto host_result_data = new DataType[padded_result_size];
205  std::fill_n(host_result_data, padded_result_size, nan);
206 
207  sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
208  sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
209  sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);
210 
211  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
212  sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);
213 
214  t_result = t_left.contract(t_right, dims);
215 
216  for (IndexType i = 0; i < t_result.size(); i++) {
217  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
218  t_result(i) - host_result_data[i]))) < error_threshold) {
219  continue;
220  }
221  if (Eigen::internal::isApprox(t_result(i), host_result_data[i],
222  error_threshold)) {
223  continue;
224  }
225  if (std::isnan(host_result_data[i])) {
226  std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
227  << ", invalid read detected at IndexType " << i << ": "
228  << t_result(i) << " vs " << host_result_data[i] << std::endl;
229  } else {
230  std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
231  << ", mismatch detected at IndexType " << i << ": "
232  << t_result(i) << " vs " << host_result_data[i] << std::endl;
233  }
234  VERIFY_IS_APPROX(host_result_data[i], t_result(i));
235  }
236  // Make sure that the rest of the result is still nans
237  for (IndexType i = t_result.size(); i < padded_result_size; i++) {
238  if (std::isnan(host_result_data[i])) {
239  continue;
240  }
241  std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
242  << ", invalid write detected at IndexType " << i << ": "
243  << host_result_data[i] << std::endl;
244  VERIFY_IS_APPROX(host_result_data[i], t_result(i));
245  }
246  sycl_device.deallocate(d_t_left);
247  sycl_device.deallocate(d_t_right);
248  sycl_device.deallocate(d_t_result);
249 
250  delete[] host_left_data;
251  delete[] host_right_data;
252  delete[] host_result_data;
253 }
254 
255 template <int DataLayout, typename DataType, typename IndexType,
256  typename Device>
257 void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size,
258  IndexType n_size) {
259  // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size <<
260  // ")" << std::endl;
261  // with these dimensions, the output has 300 * 140 elements, which is
262  // more than 30 * 1024, which is the number of threads in blocks on
263  // a 15 SM GK110 GPU
265  DimPair;
266  static const DataType error_threshold = DataType(1e-4);
267  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
268  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
271  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
272  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
273  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
274  t_left.setRandom();
275  t_right.setRandom();
276 
277  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
278  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
279  std::size_t t_result_bytes = sizeof(DataType);
280 
281  DataType *d_t_left =
282  static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
283  DataType *d_t_right =
284  static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
285  DataType *d_t_result =
286  static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
287 
289  gpu_t_left(d_t_left, left_dims);
291  gpu_t_right(d_t_right, right_dims);
293  gpu_t_result(d_t_result);
294 
295  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
296  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
297 
298  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
299  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
300  t_result_bytes);
301 
302  t_result = t_left.contract(t_right, dims);
303 
304  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
305  t_result() - t_result_gpu()))) > error_threshold &&
306  !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
307  std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size
308  << " : mismatch detected: " << t_result() << " vs "
309  << t_result_gpu() << std::endl;
310  VERIFY_IS_APPROX(t_result_gpu(), t_result());
311  }
312 
313  sycl_device.deallocate(d_t_left);
314  sycl_device.deallocate(d_t_right);
315  sycl_device.deallocate(d_t_result);
316 }
317 
318 template <int DataLayout, typename DataType, typename IndexType,
319  typename Device>
320 void contraction_batch(const Device &sycl_device, IndexType m_size,
321  IndexType k_size, IndexType n_size, IndexType m_batch,
322  IndexType start, IndexType limit) {
324  DimPair;
325  static const DataType error_threshold = DataType(1e-4);
326  typedef Eigen::array<IndexType, 3> TensorDim;
328  TensorDim left_dims = {{m_batch, k_size, m_size}};
329  TensorDim right_dims = {{m_batch, n_size, k_size}};
330  TensorDim res_dims = {{m_batch, m_size, n_size}};
331  Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}};
332 
333  TensorType t_left(left_dims);
334  TensorType t_right(right_dims);
335  TensorType t_result_gpu(res_dims);
336  TensorType t_result(res_dims);
337 
338  t_left.setRandom();
339  t_right.setRandom();
340 
341  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
342  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
343  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
344 
345  DataType *d_t_left =
346  static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
347  DataType *d_t_right =
348  static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
349  DataType *d_t_result =
350  static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
351 
352  Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims);
353  Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims);
354  Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims);
355 
356  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
357  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
358  for (int i = start; i < limit; ++i) {
359  auto x = gpu_t_left.template chip<0>(i);
360  auto y = gpu_t_right.template chip<0>(i);
361  auto z = gpu_t_result.template chip<0>(i);
362  z.device(sycl_device) = x.contract(y, contract_pairs);
363  }
364  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
365  t_result_bytes);
366 
367  for (int i = start; i < limit; ++i) {
368  auto x = t_left.template chip<0>(i);
369  auto y = t_right.template chip<0>(i);
370  auto z = t_result.template chip<0>(i);
371  z = x.contract(y, contract_pairs);
372  }
373 
374  for (IndexType i = 0; i < t_result.size(); i++) {
375  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
376  t_result(i) - t_result_gpu(i)))) < error_threshold) {
377  continue;
378  }
379  if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
380  error_threshold)) {
381  continue;
382  }
383  std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
384  << " vs " << t_result_gpu(i) << std::endl;
385  VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
386  }
387  sycl_device.deallocate(d_t_left);
388  sycl_device.deallocate(d_t_right);
389  sycl_device.deallocate(d_t_result);
390 }
391 
392 template <int DataLayout, typename DataType, typename IndexType,
393  typename Device>
394 void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size,
395  IndexType k_size, IndexType n_size) {
397  DimPair;
398  static const DataType error_threshold = DataType(1e-4);
399  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
400  Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
401  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
402  Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}};
403 
405  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
406  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
408 
409  t_left.setRandom();
410  t_right.setRandom();
411 
412  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
413  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
414  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
415 
416  DataType *d_t_left =
417  static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
418  DataType *d_t_right =
419  static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
420  DataType *d_t_result =
421  static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
422 
424  gpu_t_left(d_t_left, left_dims);
426  gpu_t_right(d_t_right, right_dims);
428  gpu_t_result(d_t_result, res_dims);
429 
430  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
431  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
432 
433  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
434  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
435  t_result_bytes);
436 
437  t_result = t_left.contract(t_right, dims);
438 
439  for (IndexType j = 0; j < m_size; j++) {
440  for (IndexType i = 0; i < n_size; i++) {
441  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
442  t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) {
443  continue;
444  }
445  if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i),
446  error_threshold)) {
447  continue;
448  }
449  std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
450  << ", mismatch detected at IndexType m: " << j << " n: " << i
451  << " CPU : " << t_result(j, i)
452  << " vs SYCL:" << t_result_gpu(j, i) << std::endl;
453  VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i));
454  }
455  }
456  sycl_device.deallocate(d_t_left);
457  sycl_device.deallocate(d_t_right);
458  sycl_device.deallocate(d_t_result);
459 }
460 
461 template <int DataLayout, typename DataType, typename IndexType,
462  typename Device>
463 void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size,
464  IndexType k_size, IndexType n_size) {
466  DimPair;
467  static const DataType error_threshold = DataType(1e-4);
468  Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
469  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
470  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
471  Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
472 
474  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
475  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
477 
478  t_left.setRandom();
479  t_right.setRandom();
480 
481  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
482  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
483  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
484 
485  DataType *d_t_left =
486  static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
487  DataType *d_t_right =
488  static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
489  DataType *d_t_result =
490  static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
491 
493  gpu_t_left(d_t_left, left_dims);
495  gpu_t_right(d_t_right, right_dims);
497  gpu_t_result(d_t_result, res_dims);
498 
499  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
500  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
501 
502  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
503  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
504  t_result_bytes);
505 
506  t_result = t_left.contract(t_right, dims);
507 
508  for (IndexType i = 0; i < t_result.size(); i++) {
509  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
510  t_result(i) - t_result_gpu(i)))) < error_threshold) {
511  continue;
512  }
513  if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
514  error_threshold)) {
515  continue;
516  }
517  std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
518  << ", mismatch detected at IndexType " << i << ": " << t_result(i)
519  << " vs " << t_result_gpu(i) << std::endl;
520  VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
521  }
522  sycl_device.deallocate(d_t_left);
523  sycl_device.deallocate(d_t_right);
524  sycl_device.deallocate(d_t_result);
525 }
526 
527 template <int DataLayout, typename DataType, typename IndexType,
528  typename Device>
529 void contraction_both_transposed(const Device &sycl_device, IndexType m_size,
530  IndexType k_size, IndexType n_size) {
532  DimPair;
533  static const DataType error_threshold = DataType(1e-4);
534  Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
535  Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
536  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
537  Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}};
538 
540  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
541  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
543 
544  t_left.setRandom();
545  t_right.setRandom();
546 
547  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
548  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
549  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
550 
551  DataType *d_t_left =
552  static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
553  DataType *d_t_right =
554  static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
555  DataType *d_t_result =
556  static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
557 
559  gpu_t_left(d_t_left, left_dims);
561  gpu_t_right(d_t_right, right_dims);
563  gpu_t_result(d_t_result, res_dims);
564 
565  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
566  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
567 
568  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
569  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
570  t_result_bytes);
571 
572  t_result = t_left.contract(t_right, dims);
573 
574  for (IndexType i = 0; i < t_result.size(); i++) {
575  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
576  t_result(i) - t_result_gpu(i)))) < error_threshold) {
577  continue;
578  }
579  if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
580  error_threshold)) {
581  continue;
582  }
583  std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
584  << ", mismatch detected at IndexType " << i << ": " << t_result(i)
585  << " vs " << t_result_gpu(i) << std::endl;
586 
587  VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
588  }
589  sycl_device.deallocate(d_t_left);
590  sycl_device.deallocate(d_t_right);
591  sycl_device.deallocate(d_t_result);
592 }
593 
594 template <typename Dev>
595 void inline tensorOutofBound(const Dev &sycl_device) {
596  typedef float DataType;
597  typedef int64_t IndexType;
598  std::chrono::time_point<std::chrono::system_clock> start, end;
599  start = std::chrono::system_clock::now();
600  // Test out of bound for Tensor-Tensor
601  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
602  1024);
603  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
604  4096);
605  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024,
606  2048);
607  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
608  1024);
609  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024,
610  784);
611  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
612  10);
613  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096,
614  513);
615  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024,
616  783);
617  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
618  784);
619  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024,
620  11);
621  end = std::chrono::system_clock::now();
622  std::chrono::duration<double> elapsed_seconds = end - start;
623  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
624  std::cout << "tensor out of bound tests finished computation at "
625  << std::ctime(&end_time)
626  << "elapsed time: " << elapsed_seconds.count() << "s\n";
627 }
628 
629 template <typename Dev>
630 void inline tensorTensor(const Dev &sycl_device) {
631  typedef float DataType;
632  typedef int64_t IndexType;
633  std::chrono::time_point<std::chrono::system_clock> start, end;
634  start = std::chrono::system_clock::now();
635  // Tensor Tensor Contraction
636  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128,
637  128);
638  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128,
639  128);
640  end = std::chrono::system_clock::now();
641  std::chrono::duration<double> elapsed_seconds = end - start;
642  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
643  std::cout << "tensor tensor tests finished computation at "
644  << std::ctime(&end_time)
645  << "elapsed time: " << elapsed_seconds.count() << "s\n";
646 }
647 
648 template <typename Dev>
649 void inline tensorTensor_m(const Dev &sycl_device) {
650  typedef float DataType;
651  typedef int64_t IndexType;
652  std::chrono::time_point<std::chrono::system_clock> start, end;
653  start = std::chrono::system_clock::now();
654  // Tensor Tensor Contraction
655  test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device);
656  test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device);
657 
658  end = std::chrono::system_clock::now();
659  std::chrono::duration<double> elapsed_seconds = end - start;
660  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
661  std::cout << "tensor tensor tests finished computation at "
662  << std::ctime(&end_time)
663  << "elapsed time: " << elapsed_seconds.count() << "s\n";
664 }
665 
666 template <typename Dev>
667 void inline tensorTensor_n(const Dev &sycl_device) {
668  typedef float DataType;
669  typedef int64_t IndexType;
670  std::chrono::time_point<std::chrono::system_clock> start, end;
671  start = std::chrono::system_clock::now();
672  // Tensor Tensor Contraction
673  test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device);
674  test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device);
675 
676  end = std::chrono::system_clock::now();
677  std::chrono::duration<double> elapsed_seconds = end - start;
678  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
679  std::cout << "tensor tensor tests finished computation at "
680  << std::ctime(&end_time)
681  << "elapsed time: " << elapsed_seconds.count() << "s\n";
682 }
683 
684 template <typename Dev>
685 void inline tensorTensor_k(const Dev &sycl_device) {
686  typedef float DataType;
687  typedef int64_t IndexType;
688  std::chrono::time_point<std::chrono::system_clock> start, end;
689  start = std::chrono::system_clock::now();
690  test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
691  test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);
692 
693  end = std::chrono::system_clock::now();
694  std::chrono::duration<double> elapsed_seconds = end - start;
695  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
696  std::cout << "tensor tensor tests finished computation at "
697  << std::ctime(&end_time)
698  << "elapsed time: " << elapsed_seconds.count() << "s\n";
699 }
700 
701 template <typename Dev>
702 void inline tensorTensor_sizes(const Dev &sycl_device) {
703  typedef float DataType;
704  typedef int64_t IndexType;
705  std::chrono::time_point<std::chrono::system_clock> start, end;
706  start = std::chrono::system_clock::now();
707  // Tensor Tensor Contraction
708  test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
709  test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);
710 
711  end = std::chrono::system_clock::now();
712  std::chrono::duration<double> elapsed_seconds = end - start;
713  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
714  std::cout << "tensor tensor tests finished computation at "
715  << std::ctime(&end_time)
716  << "elapsed time: " << elapsed_seconds.count() << "s\n";
717 }
718 template <typename Dev>
719 void inline vectorVector(const Dev &sycl_device) {
720  typedef float DataType;
721  typedef int64_t IndexType;
722  std::chrono::time_point<std::chrono::system_clock> start, end;
723  start = std::chrono::system_clock::now();
724  // VECTOR-VECTOR
725  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
726  1025);
727  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
728  1025);
729  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
730  1024);
731  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
732  1024);
733  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
734  1023);
735  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
736  1023);
737 
738  end = std::chrono::system_clock::now();
739  std::chrono::duration<double> elapsed_seconds = end - start;
740  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
741  std::cout << "contracted tensor tests finished computation at "
742  << std::ctime(&end_time)
743  << "elapsed time: " << elapsed_seconds.count() << "s\n";
744 }
745 
746 template <typename Dev>
747 void inline vectorTensor(const Dev &sycl_device) {
748  typedef float DataType;
749  typedef int64_t IndexType;
750  std::chrono::time_point<std::chrono::system_clock> start, end;
751  start = std::chrono::system_clock::now();
752  // Vector-Tensor
753  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
754  1025);
755  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
756  1025);
757  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
758  1024);
759  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
760  1024);
761  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
762  1023);
763  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
764  1023);
765 
766  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
767  4097);
768  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
769  4097);
770  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
771  4096);
772  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
773  4096);
774  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
775  4095);
776  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
777  4095);
778  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
779  32);
780 
781  end = std::chrono::system_clock::now();
782  std::chrono::duration<double> elapsed_seconds = end - start;
783  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
784  std::cout << "finished computation at " << std::ctime(&end_time)
785  << "elapsed time: " << elapsed_seconds.count() << "s\n";
786 }
787 
788 template <typename Dev>
789 void inline tensorVector(const Dev &sycl_device) {
790  typedef float DataType;
791  typedef int64_t IndexType;
792  std::chrono::time_point<std::chrono::system_clock> start, end;
793  start = std::chrono::system_clock::now();
794  // Matrix-Vector
795  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
796  1);
797  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
798  1);
799  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
800  1);
801  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
802  1);
803  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
804  1);
805  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
806  1);
807  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
808  1);
809  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
810  1);
811  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
812  1);
813  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
814  1);
815  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
816  1);
817  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
818  1);
819 // If the GEMV disabled it will creates one kernel to calculate the contraction.
820 // Therefore the acumuation of float number will overflow the precision
821 // threshold for float and cause the test to fail. While it the GMV multiple
822 // kernel will be created and each one run the overflow of accumutation breaks
823 // among the kernels.
824 #ifndef EIGEN_SYCL_DISABLE_GEMV
825  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032,
826  1);
827 #endif
828 
829  end = std::chrono::system_clock::now();
830  std::chrono::duration<double> elapsed_seconds = end - start;
831  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
832  std::cout << "finished computation at " << std::ctime(&end_time)
833  << "elapsed time: " << elapsed_seconds.count() << "s\n";
834 }
835 
836 template <typename Dev>
837 void inline tensorScalar(const Dev &sycl_device) {
838  typedef float DataType;
839  typedef int64_t IndexType;
840  std::chrono::time_point<std::chrono::system_clock> start, end;
841  start = std::chrono::system_clock::now();
842  // SCALAR Contraction
843  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
844  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
845  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
846  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
847  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
848  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
849 
850  end = std::chrono::system_clock::now();
851  std::chrono::duration<double> elapsed_seconds = end - start;
852  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
853  std::cout << "finished computation at " << std::ctime(&end_time)
854  << "elapsed time: " << elapsed_seconds.count() << "s\n";
855 }
856 
857 template <typename Dev>
858 void inline skinnyTensor_row(const Dev &sycl_device) {
859  typedef float DataType;
860  typedef int64_t IndexType;
861  std::chrono::time_point<std::chrono::system_clock> start, end;
862  start = std::chrono::system_clock::now();
863  // Tensor Tensor Contraction
864  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
865  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073,
866  257);
867  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072,
868  256);
869  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073,
870  16);
871  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072,
872  17);
873  end = std::chrono::system_clock::now();
874  std::chrono::duration<double> elapsed_seconds = end - start;
875  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
876  std::cout << "finished computation at " << std::ctime(&end_time)
877  << "elapsed time: " << elapsed_seconds.count() << "s\n";
878 }
879 
880 template <typename Dev>
881 void inline skinnyTensor_col(const Dev &sycl_device) {
882  typedef float DataType;
883  typedef int64_t IndexType;
884  std::chrono::time_point<std::chrono::system_clock> start, end;
885  start = std::chrono::system_clock::now();
886  // Tensor Tensor Contraction
887  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
888  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073,
889  257);
890  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072,
891  256);
892  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073,
893  16);
894  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072,
895  17);
896  end = std::chrono::system_clock::now();
897  std::chrono::duration<double> elapsed_seconds = end - start;
898  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
899  std::cout << "finished computation at " << std::ctime(&end_time)
900  << "elapsed time: " << elapsed_seconds.count() << "s\n";
901 }
902 
903 template <typename Dev>
904 void inline tensor_contraction_batch_per_device(const Dev &sycl_device) {
905  typedef float DataType;
906  typedef int64_t IndexType;
907  std::chrono::time_point<std::chrono::system_clock> start, end;
908  start = std::chrono::system_clock::now();
909 
910  contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
911  0, 4);
912  contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
913  0, 4);
914  end = std::chrono::system_clock::now();
915  std::chrono::duration<double> elapsed_seconds = end - start;
916  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
917  std::cout << "finished computation at " << std::ctime(&end_time)
918  << "elapsed time: " << elapsed_seconds.count() << "s\n";
919 }
920 
921 template <typename Dev>
923  const Dev &sycl_device) {
924  typedef float DataType;
925  typedef int64_t IndexType;
926  std::chrono::time_point<std::chrono::system_clock> start, end;
927  start = std::chrono::system_clock::now();
928 
929  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4,
930  8);
931  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
932  32);
933  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
934  64);
935  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784,
936  2048, 1024);
937  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
938  10, 1024);
939  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
940  1024, 1024);
941  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
942  4096, 1024);
943  end = std::chrono::system_clock::now();
944  std::chrono::duration<double> elapsed_seconds = end - start;
945  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
946  std::cout << "finished computation at " << std::ctime(&end_time)
947  << "elapsed time: " << elapsed_seconds.count() << "s\n";
948 }
949 
950 template <typename Dev>
952  const Dev &sycl_device) {
953  typedef float DataType;
954  typedef int64_t IndexType;
955  std::chrono::time_point<std::chrono::system_clock> start, end;
956  start = std::chrono::system_clock::now();
957 
958  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4,
959  16);
960  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
961  17);
962  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
963  32);
964  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
965  64);
966  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10,
967  1024, 1024);
968  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
969  1024, 4096);
970  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
971  1024, 2048);
972  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
973  1024, 784);
974  end = std::chrono::system_clock::now();
975  std::chrono::duration<double> elapsed_seconds = end - start;
976  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
977  std::cout << "finished computation at " << std::ctime(&end_time)
978  << "elapsed time: " << elapsed_seconds.count() << "s\n";
979 }
980 
981 template <typename Dev>
983  const Dev &sycl_device) {
984  typedef float DataType;
985  typedef int64_t IndexType;
986  std::chrono::time_point<std::chrono::system_clock> start, end;
987  start = std::chrono::system_clock::now();
988 
989  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
990  17);
991  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
992  32);
993  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
994  16, 64);
995  end = std::chrono::system_clock::now();
996  std::chrono::duration<double> elapsed_seconds = end - start;
997  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
998  std::cout << "finished computation at " << std::ctime(&end_time)
999  << "elapsed time: " << elapsed_seconds.count() << "s\n";
1000 }
1001 
1002 EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
1003  for (const auto &device : Eigen::get_sycl_supported_devices()) {
1004  std::cout << "Running on "
1005  << device.template get_info<cl::sycl::info::device::name>()
1006  << std::endl;
1007  QueueInterface queueInterface(device);
1008  auto sycl_device = Eigen::SyclDevice(&queueInterface);
1009  CALL_SUBTEST_1(tensorOutofBound(sycl_device));
1010  CALL_SUBTEST_2(tensorTensor(sycl_device));
1011  CALL_SUBTEST_2(tensorTensor_m(sycl_device));
1012  CALL_SUBTEST_2(tensorTensor_n(sycl_device));
1013  CALL_SUBTEST_2(tensorTensor_k(sycl_device));
1014  CALL_SUBTEST_2(tensorTensor_sizes(sycl_device));
1015  CALL_SUBTEST_3(vectorVector(sycl_device));
1016  CALL_SUBTEST_4(vectorTensor(sycl_device));
1017  CALL_SUBTEST_5(tensorVector(sycl_device));
1018  CALL_SUBTEST_6(tensorScalar(sycl_device));
1019  CALL_SUBTEST_7(skinnyTensor_row(sycl_device));
1020  CALL_SUBTEST_7(skinnyTensor_col(sycl_device));
1025  }
1026 }
skinnyTensor_row
void skinnyTensor_row(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:858
Eigen::Tensor
The tensor class.
Definition: Tensor.h:63
skinnyTensor_col
void skinnyTensor_col(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:881
tensorTensor_n
void tensorTensor_n(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:667
test_sycl_contraction_m
void test_sycl_contraction_m(const Device &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:105
array
int array[24]
Definition: Map_general_stride.cpp:1
e
Array< double, 1, 3 > e(1./3., 0.5, 2.)
tensor_contraction_lhs_transposed_per_device
void tensor_contraction_lhs_transposed_per_device(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:922
Eigen::array
Definition: EmulateArray.h:21
vectorVector
void vectorVector(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:719
test_sycl_contraction_sizes
void test_sycl_contraction_sizes(const Device &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:132
tensorTensor_k
void tensorTensor_k(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:685
DataLayout
static const int DataLayout
Definition: cxx11_tensor_image_patch_sycl.cpp:24
x
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy x
Definition: gnuplot_common_settings.hh:12
tensorVector
void tensorVector(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:789
Eigen::internal::isApprox
EIGEN_DEVICE_FUNC bool isApprox(const Scalar &x, const Scalar &y, const typename NumTraits< Scalar >::Real &precision=NumTraits< Scalar >::dummy_precision())
Definition: Eigen/src/Core/MathFunctions.h:1947
tensor_contraction_rhs_transposed_per_device
void tensor_contraction_rhs_transposed_per_device(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:951
isnan
#define isnan(X)
Definition: main.h:93
CALL_SUBTEST_11
#define CALL_SUBTEST_11(FUNC)
Definition: split_test_helper.h:64
tensor_contraction_both_transposed_per_device
void tensor_contraction_both_transposed_per_device(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:982
CALL_SUBTEST_9
#define CALL_SUBTEST_9(FUNC)
Definition: split_test_helper.h:52
tensorScalar
void tensorScalar(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:837
boost::multiprecision::fabs
Real fabs(const Real &a)
Definition: boostmultiprec.cpp:119
CALL_SUBTEST_4
#define CALL_SUBTEST_4(FUNC)
Definition: split_test_helper.h:22
CALL_SUBTEST_3
#define CALL_SUBTEST_3(FUNC)
Definition: split_test_helper.h:16
EIGEN_DECLARE_TEST
EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl)
Definition: cxx11_tensor_contract_sycl.cpp:1002
CALL_SUBTEST_1
#define CALL_SUBTEST_1(FUNC)
Definition: split_test_helper.h:4
test_scalar
void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
Definition: cxx11_tensor_contract_sycl.cpp:257
j
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2
int64_t
signed __int64 int64_t
Definition: ms_stdint.h:94
contraction_both_transposed
void contraction_both_transposed(const Device &sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
Definition: cxx11_tensor_contract_sycl.cpp:529
test_sycl_contraction
static void test_sycl_contraction(const Device &sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
Definition: cxx11_tensor_contract_sycl.cpp:36
tensorTensor
void tensorTensor(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:630
CALL_SUBTEST_10
#define CALL_SUBTEST_10(FUNC)
Definition: split_test_helper.h:58
Eigen::TensorMap
A tensor expression mapping an existing array of data.
Definition: TensorForwardDeclarations.h:52
CALL_SUBTEST_5
#define CALL_SUBTEST_5(FUNC)
Definition: split_test_helper.h:28
pybind_wrapper_test_script.z
z
Definition: pybind_wrapper_test_script.py:61
test_sycl_contraction_k
void test_sycl_contraction_k(const Device &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:114
tensorTensor_m
void tensorTensor_m(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:649
contraction_batch
void contraction_batch(const Device &sycl_device, IndexType m_size, IndexType k_size, IndexType n_size, IndexType m_batch, IndexType start, IndexType limit)
Definition: cxx11_tensor_contract_sycl.cpp:320
tensor_contraction_batch_per_device
void tensor_contraction_batch_per_device(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:904
CALL_SUBTEST_6
#define CALL_SUBTEST_6(FUNC)
Definition: split_test_helper.h:34
CALL_SUBTEST_2
#define CALL_SUBTEST_2(FUNC)
Definition: split_test_helper.h:10
size_t
std::size_t size_t
Definition: wrap/pybind11/include/pybind11/detail/common.h:490
test_no_out_of_bounds
static void test_no_out_of_bounds(const Device &sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
Definition: cxx11_tensor_contract_sycl.cpp:154
y
Scalar * y
Definition: level1_cplx_impl.h:124
VERIFY_IS_APPROX
#define VERIFY_IS_APPROX(a, b)
Definition: integer_types.cpp:15
Eigen::TensorBase< Tensor< Scalar_, NumIndices_, Options_, IndexType_ > >::setRandom
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor< Scalar_, NumIndices_, Options_, IndexType_ > & setRandom()
Definition: TensorBase.h:996
vectorTensor
void vectorTensor(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:747
tensorTensor_sizes
void tensorTensor_sizes(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:702
main.h
Eigen::Tensor::data
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar * data()
Definition: Tensor.h:104
error_threshold
static const float error_threshold
Definition: cxx11_tensor_convolution_sycl.cpp:32
Eigen::Tensor::size
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const
Definition: Tensor.h:103
contraction_rhs_transposed
void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
Definition: cxx11_tensor_contract_sycl.cpp:394
tensorOutofBound
void tensorOutofBound(const Dev &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:595
Eigen::placeholders::end
static const EIGEN_DEPRECATED end_t end
Definition: IndexedViewHelper.h:181
CALL_SUBTEST_7
#define CALL_SUBTEST_7(FUNC)
Definition: split_test_helper.h:40
CALL_SUBTEST_8
#define CALL_SUBTEST_8(FUNC)
Definition: split_test_helper.h:46
DimPair
Tensor< float, 1 >::DimensionPair DimPair
Definition: cxx11_tensor_contraction.cpp:17
test_sycl_contraction_n
void test_sycl_contraction_n(const Device &sycl_device)
Definition: cxx11_tensor_contract_sycl.cpp:123
i
int i
Definition: BiCGSTAB_step_by_step.cpp:9
contraction_lhs_transposed
void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
Definition: cxx11_tensor_contract_sycl.cpp:463


gtsam
Author(s):
autogenerated on Sat Nov 16 2024 04:02:08