cxx11_tensor_reduction_sycl.cpp
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2015
5 // Mehdi Goli Codeplay Software Ltd.
6 // Ralph Potter Codeplay Software Ltd.
7 // Luke Iwanski Codeplay Software Ltd.
8 // Contact: <eigen@codeplay.com>
9 //
10 // This Source Code Form is subject to the terms of the Mozilla
11 // Public License v. 2.0. If a copy of the MPL was not distributed
12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
13 
14 #define EIGEN_TEST_NO_LONGDOUBLE
15 #define EIGEN_TEST_NO_COMPLEX
16 
17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
18 #define EIGEN_USE_SYCL
19 #define EIGEN_HAS_CONSTEXPR 1
20 
21 #include "main.h"
22 
23 #include <unsupported/Eigen/CXX11/Tensor>
24 
25 template <typename DataType, int DataLayout, typename IndexType>
27  const Eigen::SyclDevice& sycl_device) {
28  const IndexType num_rows = 753;
29  const IndexType num_cols = 537;
30  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
31 
32  array<IndexType, 2> outRange = {{1, 1}};
33 
35  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
36  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);
37 
38  in.setRandom();
39  auto dim = DSizes<IndexType, 2>(1, 1);
40  full_redux = in.sum().reshape(dim);
41 
42  DataType* gpu_in_data = static_cast<DataType*>(
43  sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
44  DataType* gpu_out_data = (DataType*)sycl_device.allocate(
45  sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));
46 
48  tensorRange);
50  outRange);
51  sycl_device.memcpyHostToDevice(
52  gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
53  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
54  sycl_device.memcpyDeviceToHost(
55  full_redux_gpu.data(), gpu_out_data,
56  (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
57  // Check that the CPU and GPU reductions return the same result.
58  std::cout << "SYCL FULL :" << full_redux_gpu(0, 0)
59  << ", CPU FULL: " << full_redux(0, 0) << "\n";
60  VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
61  sycl_device.deallocate(gpu_in_data);
62  sycl_device.deallocate(gpu_out_data);
63 }
64 
65 template <typename DataType, int DataLayout, typename IndexType>
67  const Eigen::SyclDevice& sycl_device) {
69  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
70  const IndexType num_rows = 64;
71  const IndexType num_cols = 64;
72  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
73  const IndexType n_elems = internal::array_prod(tensor_range);
74 
75  data_tensor in(tensor_range);
76  scalar_tensor full_redux;
77  scalar_tensor full_redux_gpu;
78 
79  in.setRandom();
80  array<IndexType, 2> tensor_offset_range(tensor_range);
81  tensor_offset_range[0] -= 1;
82 
83  const IndexType offset = 64;
84  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
85  full_redux = in_offset.sum();
86 
87  DataType* gpu_in_data =
88  static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
89  DataType* gpu_out_data =
90  static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
91 
92  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
93  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
94  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
95  n_elems * sizeof(DataType));
96  out_gpu.device(sycl_device) = in_gpu.sum();
97  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
98  sizeof(DataType));
99 
100  // Check that the CPU and GPU reductions return the same result.
101  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
102 
103  sycl_device.deallocate(gpu_in_data);
104  sycl_device.deallocate(gpu_out_data);
105 }
106 
107 template <typename DataType, int DataLayout, typename IndexType>
109  const Eigen::SyclDevice& sycl_device) {
110  const IndexType num_rows = 4096;
111  const IndexType num_cols = 4096;
112  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
113 
117 
118  in.setRandom();
119 
120  full_redux = in.maximum();
121 
122  DataType* gpu_in_data = static_cast<DataType*>(
123  sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
124  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
125 
127  tensorRange);
129  sycl_device.memcpyHostToDevice(
130  gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
131  out_gpu.device(sycl_device) = in_gpu.maximum();
132  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
133  sizeof(DataType));
134  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
135  sycl_device.deallocate(gpu_in_data);
136  sycl_device.deallocate(gpu_out_data);
137 }
138 
139 template <typename DataType, int DataLayout, typename IndexType>
141  const Eigen::SyclDevice& sycl_device) {
142  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
143  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
144  const IndexType num_rows = 64;
145  const IndexType num_cols = 64;
146  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
147  const IndexType n_elems = internal::array_prod(tensor_range);
148 
149  data_tensor in(tensor_range);
150  scalar_tensor full_redux;
151  scalar_tensor full_redux_gpu;
152 
153  in.setRandom();
154  array<IndexType, 2> tensor_offset_range(tensor_range);
155  tensor_offset_range[0] -= 1;
156  // Set the initial value to be the max.
157  // As we don't include this in the reduction the result should not be 2.
158  in(0) = static_cast<DataType>(2);
159 
160  const IndexType offset = 64;
161  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
162  full_redux = in_offset.maximum();
163  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
164 
165  DataType* gpu_in_data =
166  static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
167  DataType* gpu_out_data =
168  static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
169 
170  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
171  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
172  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
173  n_elems * sizeof(DataType));
174  out_gpu.device(sycl_device) = in_gpu.maximum();
175  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
176  sizeof(DataType));
177 
178  // Check that the CPU and GPU reductions return the same result.
179  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
180 
181  sycl_device.deallocate(gpu_in_data);
182  sycl_device.deallocate(gpu_out_data);
183 }
184 
185 template <typename DataType, int DataLayout, typename IndexType>
187  const Eigen::SyclDevice& sycl_device) {
188  const IndexType num_rows = 4096;
189  const IndexType num_cols = 4096;
190  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
191  array<IndexType, 1> argRange = {{num_cols}};
193  red_axis[0] = 0;
194  // red_axis[1]=1;
196  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
197  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
198  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
199  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
200  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
203 
204  in.setRandom();
205  in_arg1.setRandom();
206  in_arg2.setRandom();
207 
208  DataType* gpu_in_data = static_cast<DataType*>(
209  sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
210  DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
211  in_arg1.dimensions().TotalSize() * sizeof(DataType)));
212  DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
213  in_arg2.dimensions().TotalSize() * sizeof(DataType)));
214  bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
215  out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
216  bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
217  out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
218 
219  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
220 
222  tensorRange);
224  gpu_in_arg1_data, tensorRange);
226  gpu_in_arg2_data, tensorRange);
228  gpu_out_arg_data, argRange);
230  gpu_out_arg__gpu_helper_data, argRange);
232 
233  // CPU VERSION
234  out_arg_cpu =
235  (in_arg1.argmax(1) == in_arg2.argmax(1))
236  .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
237  full_redux = (out_arg_cpu.template cast<float>())
238  .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
239 
240  // GPU VERSION
241  sycl_device.memcpyHostToDevice(
242  gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
243  sycl_device.memcpyHostToDevice(
244  gpu_in_arg1_data, in_arg1.data(),
245  (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
246  sycl_device.memcpyHostToDevice(
247  gpu_in_arg2_data, in_arg2.data(),
248  (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
249  out_Argout_gpu_helper.device(sycl_device) =
250  (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
251  out_Argout_gpu.device(sycl_device) =
252  (out_Argout_gpu_helper)
253  .select(out_Argout_gpu.constant(true),
254  out_Argout_gpu.constant(false));
255  out_gpu.device(sycl_device) =
256  (out_Argout_gpu.template cast<float>())
257  .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
258  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
259  sizeof(DataType));
260  // Check that the CPU and GPU reductions return the same result.
261  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
262  << '\n';
263  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
264  sycl_device.deallocate(gpu_in_data);
265  sycl_device.deallocate(gpu_in_arg1_data);
266  sycl_device.deallocate(gpu_in_arg2_data);
267  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
268  sycl_device.deallocate(gpu_out_arg_data);
269  sycl_device.deallocate(gpu_out_data);
270 }
271 
272 template <typename DataType, int DataLayout, typename IndexType>
274  const Eigen::SyclDevice& sycl_device) {
275  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
276  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
277  const IndexType num_rows = 64;
278  const IndexType num_cols = 64;
279  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
280  const IndexType n_elems = internal::array_prod(tensor_range);
281 
282  data_tensor in(tensor_range);
283  scalar_tensor full_redux;
284  scalar_tensor full_redux_gpu;
285 
286  in.setRandom();
287  array<IndexType, 2> tensor_offset_range(tensor_range);
288  tensor_offset_range[0] -= 1;
289 
290  const IndexType offset = 64;
291  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
292  full_redux = in_offset.mean();
293  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
294 
295  DataType* gpu_in_data =
296  static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
297  DataType* gpu_out_data =
298  static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
299 
300  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
301  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
302  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
303  n_elems * sizeof(DataType));
304  out_gpu.device(sycl_device) = in_gpu.mean();
305  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
306  sizeof(DataType));
307 
308  // Check that the CPU and GPU reductions return the same result.
309  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
310 
311  sycl_device.deallocate(gpu_in_data);
312  sycl_device.deallocate(gpu_out_data);
313 }
314 
315 template <typename DataType, int DataLayout, typename IndexType>
317  const Eigen::SyclDevice& sycl_device) {
318  // This is a particular case which illustrates a possible problem when the
319  // number of local threads in a workgroup is even, but is not a power of two.
320  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
321  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
322  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
323  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
324  const IndexType n_elems = 8707;
325  array<IndexType, 1> tensor_range = {{n_elems}};
326 
327  data_tensor in(tensor_range);
328  DataType full_redux;
329  DataType full_redux_gpu;
330  TensorMap<scalar_tensor> red_cpu(&full_redux);
331  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);
332 
333  const DataType const_val = static_cast<DataType>(0.6391);
334  in = in.constant(const_val);
335 
336  Eigen::IndexList<Eigen::type2index<0>> red_axis;
337  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
338  VERIFY_IS_APPROX(const_val, red_cpu());
339 
340  DataType* gpu_in_data =
341  static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
342  DataType* gpu_out_data =
343  static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
344 
345  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
346  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
347  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
348  n_elems * sizeof(DataType));
349  out_gpu.device(sycl_device) =
350  in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
351  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
352  sizeof(DataType));
353 
354  // Check that the CPU and GPU reductions return the same result.
355  VERIFY_IS_APPROX(full_redux_gpu, full_redux);
356 
357  sycl_device.deallocate(gpu_in_data);
358  sycl_device.deallocate(gpu_out_data);
359 }
360 
361 template <typename DataType, int DataLayout, typename IndexType>
363  const Eigen::SyclDevice& sycl_device) {
364  const IndexType num_rows = 876;
365  const IndexType num_cols = 953;
366  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
367 
371 
372  in.setRandom();
373 
374  full_redux = in.minimum();
375 
376  DataType* gpu_in_data = static_cast<DataType*>(
377  sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
378  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
379 
381  tensorRange);
383 
384  sycl_device.memcpyHostToDevice(
385  gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
386  out_gpu.device(sycl_device) = in_gpu.minimum();
387  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
388  sizeof(DataType));
389  // Check that the CPU and GPU reductions return the same result.
390  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
391  sycl_device.deallocate(gpu_in_data);
392  sycl_device.deallocate(gpu_out_data);
393 }
394 
395 template <typename DataType, int DataLayout, typename IndexType>
397  const Eigen::SyclDevice& sycl_device) {
398  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
399  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
400  const IndexType num_rows = 64;
401  const IndexType num_cols = 64;
402  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
403  const IndexType n_elems = internal::array_prod(tensor_range);
404 
405  data_tensor in(tensor_range);
406  scalar_tensor full_redux;
407  scalar_tensor full_redux_gpu;
408 
409  in.setRandom();
410  array<IndexType, 2> tensor_offset_range(tensor_range);
411  tensor_offset_range[0] -= 1;
412  // Set the initial value to be the min.
413  // As we don't include this in the reduction the result should not be -2.
414  in(0) = static_cast<DataType>(-2);
415 
416  const IndexType offset = 64;
417  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
418  full_redux = in_offset.minimum();
419  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
420 
421  DataType* gpu_in_data =
422  static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
423  DataType* gpu_out_data =
424  static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
425 
426  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
427  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
428  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
429  n_elems * sizeof(DataType));
430  out_gpu.device(sycl_device) = in_gpu.minimum();
431  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
432  sizeof(DataType));
433 
434  // Check that the CPU and GPU reductions return the same result.
435  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
436 
437  sycl_device.deallocate(gpu_in_data);
438  sycl_device.deallocate(gpu_out_data);
439 }
440 template <typename DataType, int DataLayout, typename IndexType>
442  const Eigen::SyclDevice& sycl_device) {
443  IndexType dim_x = 145;
444  IndexType dim_y = 1;
445  IndexType dim_z = 67;
446 
447  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
449  red_axis[0] = 0;
450  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
451 
453  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
454  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
455 
456  in.setRandom();
457 
458  redux = in.maximum(red_axis);
459 
460  DataType* gpu_in_data = static_cast<DataType*>(
461  sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
462  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
463  redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
464 
466  tensorRange);
468  gpu_out_data, reduced_tensorRange);
469 
470  sycl_device.memcpyHostToDevice(
471  gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
472  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
473  sycl_device.memcpyDeviceToHost(
474  redux_gpu.data(), gpu_out_data,
475  redux_gpu.dimensions().TotalSize() * sizeof(DataType));
476 
477  // Check that the CPU and GPU reductions return the same result.
478  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
479  for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
480  VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
481 
482  sycl_device.deallocate(gpu_in_data);
483  sycl_device.deallocate(gpu_out_data);
484 }
485 
486 template <typename DataType, int DataLayout, typename IndexType>
488  const Eigen::SyclDevice& sycl_device) {
489  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
490  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
491 
492  const IndexType num_rows = 64;
493  const IndexType num_cols = 64;
494  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
495  array<IndexType, 1> reduced_range = {{num_cols}};
496  const IndexType n_elems = internal::array_prod(tensor_range);
497  const IndexType n_reduced = num_cols;
498 
499  data_tensor in(tensor_range);
500  reduced_tensor redux;
501  reduced_tensor redux_gpu(reduced_range);
502 
503  in.setRandom();
504  array<IndexType, 2> tensor_offset_range(tensor_range);
505  tensor_offset_range[0] -= 1;
506  // Set maximum value outside of the considered range.
507  for (IndexType i = 0; i < n_reduced; i++) {
508  in(i) = static_cast<DataType>(2);
509  }
510 
512  red_axis[0] = 0;
513 
514  const IndexType offset = 64;
515  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
516  redux = in_offset.maximum(red_axis);
517  for (IndexType i = 0; i < n_reduced; i++) {
518  VERIFY_IS_NOT_EQUAL(redux(i), in(i));
519  }
520 
521  DataType* gpu_in_data =
522  static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
523  DataType* gpu_out_data = static_cast<DataType*>(
524  sycl_device.allocate(n_reduced * sizeof(DataType)));
525 
526  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
527  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
528  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
529  n_elems * sizeof(DataType));
530  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
531  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
532  n_reduced * sizeof(DataType));
533 
534  // Check that the CPU and GPU reductions return the same result.
535  for (IndexType i = 0; i < n_reduced; i++) {
536  VERIFY_IS_APPROX(redux_gpu(i), redux(i));
537  }
538 
539  sycl_device.deallocate(gpu_in_data);
540  sycl_device.deallocate(gpu_out_data);
541 }
542 
543 template <typename DataType, int DataLayout, typename IndexType>
545  const Eigen::SyclDevice& sycl_device) {
546  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
547  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
548 
549  const IndexType num_rows = 64;
550  const IndexType num_cols = 64;
551  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
552  array<IndexType, 1> full_reduced_range = {{num_rows}};
553  array<IndexType, 1> reduced_range = {{num_rows - 1}};
554  const IndexType n_elems = internal::array_prod(tensor_range);
555  const IndexType n_reduced = reduced_range[0];
556 
557  data_tensor in(tensor_range);
558  reduced_tensor redux(full_reduced_range);
559  reduced_tensor redux_gpu(reduced_range);
560 
561  in.setRandom();
562  redux.setZero();
563  array<IndexType, 2> tensor_offset_range(tensor_range);
564  tensor_offset_range[0] -= 1;
565  // Set maximum value outside of the considered range.
566  for (IndexType i = 0; i < n_reduced; i++) {
567  in(i) = static_cast<DataType>(2);
568  }
569 
571  red_axis[0] = 1;
572 
573  const IndexType offset = 64;
574  // Introduce an offset in both the input and the output.
575  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
576  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
577  red_offset = in_offset.maximum(red_axis);
578 
579  // Check that the first value hasn't been changed and that the reduced values
580  // are not equal to the previously set maximum in the input outside the range.
581  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
582  for (IndexType i = 0; i < n_reduced; i++) {
583  VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
584  }
585 
586  DataType* gpu_in_data =
587  static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
588  DataType* gpu_out_data = static_cast<DataType*>(
589  sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));
590 
591  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
592  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
593  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
594  n_elems * sizeof(DataType));
595  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
596  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
597  n_reduced * sizeof(DataType));
598 
599  // Check that the CPU and GPU reductions return the same result.
600  for (IndexType i = 0; i < n_reduced; i++) {
601  VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
602  }
603 
604  sycl_device.deallocate(gpu_in_data);
605  sycl_device.deallocate(gpu_out_data);
606 }
607 
608 template <typename DataType, int DataLayout, typename IndexType>
610  const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
611  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
613  red_axis[0] = 0;
614  array<IndexType, 1> reduced_tensorRange = {{dim_y}};
615 
617  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
618  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
619 
620  in.setRandom();
621  redux = in.sum(red_axis);
622 
623  DataType* gpu_in_data = static_cast<DataType*>(
624  sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
625  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
626  redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
627 
629  tensorRange);
631  gpu_out_data, reduced_tensorRange);
632 
633  sycl_device.memcpyHostToDevice(
634  gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
635  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
636  sycl_device.memcpyDeviceToHost(
637  redux_gpu.data(), gpu_out_data,
638  redux_gpu.dimensions().TotalSize() * sizeof(DataType));
639 
640  // Check that the CPU and GPU reductions return the same result.
641  for (IndexType i = 0; i < redux.size(); i++) {
642  VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
643  }
644  sycl_device.deallocate(gpu_in_data);
645  sycl_device.deallocate(gpu_out_data);
646 }
647 
648 template <typename DataType, int DataLayout, typename IndexType>
650  const Eigen::SyclDevice& sycl_device) {
651  IndexType dim_x = 145;
652  IndexType dim_y = 1;
653  IndexType dim_z = 67;
654 
655  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
657  red_axis[0] = 0;
658  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
659 
661  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
662  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
663 
664  in.setRandom();
665 
666  redux = in.mean(red_axis);
667 
668  DataType* gpu_in_data = static_cast<DataType*>(
669  sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
670  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
671  redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
672 
674  tensorRange);
676  gpu_out_data, reduced_tensorRange);
677 
678  sycl_device.memcpyHostToDevice(
679  gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
680  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
681  sycl_device.memcpyDeviceToHost(
682  redux_gpu.data(), gpu_out_data,
683  redux_gpu.dimensions().TotalSize() * sizeof(DataType));
684 
685  // Check that the CPU and GPU reductions return the same result.
686  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
687  for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
688  VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
689 
690  sycl_device.deallocate(gpu_in_data);
691  sycl_device.deallocate(gpu_out_data);
692 }
693 
694 template <typename DataType, int DataLayout, typename IndexType>
696  const Eigen::SyclDevice& sycl_device) {
697  IndexType dim_x = 64;
698  IndexType dim_y = 1;
699  IndexType dim_z = 32;
700 
701  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
703  red_axis[0] = 2;
704  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
705 
707  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
708  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
709 
710  in.setRandom();
711 
712  redux = in.mean(red_axis);
713 
714  DataType* gpu_in_data = static_cast<DataType*>(
715  sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
716  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
717  redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
718 
720  tensorRange);
722  gpu_out_data, reduced_tensorRange);
723 
724  sycl_device.memcpyHostToDevice(
725  gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
726  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
727  sycl_device.memcpyDeviceToHost(
728  redux_gpu.data(), gpu_out_data,
729  redux_gpu.dimensions().TotalSize() * sizeof(DataType));
730  // Check that the CPU and GPU reductions return the same result.
731  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
732  for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
733  VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
734 
735  sycl_device.deallocate(gpu_in_data);
736  sycl_device.deallocate(gpu_out_data);
737 }
738 
739 template <typename DataType, int DataLayout, typename IndexType>
741  const Eigen::SyclDevice& sycl_device) {
742  IndexType dim_x = 64;
743  IndexType dim_y = 1;
744  IndexType dim_z = 32;
745 
746  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
748  red_axis[0] = 2;
749  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
750 
752  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
753  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
754 
755  in.setRandom();
756 
757  redux = in.sum(red_axis);
758 
759  DataType* gpu_in_data = static_cast<DataType*>(
760  sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
761  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
762  redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
763 
765  tensorRange);
767  gpu_out_data, reduced_tensorRange);
768 
769  sycl_device.memcpyHostToDevice(
770  gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
771  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
772  sycl_device.memcpyDeviceToHost(
773  redux_gpu.data(), gpu_out_data,
774  redux_gpu.dimensions().TotalSize() * sizeof(DataType));
775  // Check that the CPU and GPU reductions return the same result.
776  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
777  for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
778  VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
779 
780  sycl_device.deallocate(gpu_in_data);
781  sycl_device.deallocate(gpu_out_data);
782 }
783 
784 template <typename DataType, int DataLayout, typename IndexType>
786  const Eigen::SyclDevice& sycl_device) {
787  auto tensorRange = Sizes<64, 32>(64, 32);
788  // auto red_axis = Sizes<0,1>(0,1);
789  Eigen::IndexList<Eigen::type2index<1>> red_axis;
790  auto reduced_tensorRange = Sizes<64>(64);
794 
795  in_fix.setRandom();
796 
797  redux_fix = in_fix.sum(red_axis);
798 
799  DataType* gpu_in_data = static_cast<DataType*>(
800  sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
801  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
802  redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
803 
805  gpu_in_data, tensorRange);
807  gpu_out_data, reduced_tensorRange);
808 
809  sycl_device.memcpyHostToDevice(
810  gpu_in_data, in_fix.data(),
811  (in_fix.dimensions().TotalSize()) * sizeof(DataType));
812  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
813  sycl_device.memcpyDeviceToHost(
814  redux_gpu_fix.data(), gpu_out_data,
815  redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
816  // Check that the CPU and GPU reductions return the same result.
817  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
818  VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
819  }
820 
821  sycl_device.deallocate(gpu_in_data);
822  sycl_device.deallocate(gpu_out_data);
823 }
824 
825 template <typename DataType, int DataLayout, typename IndexType>
827  const Eigen::SyclDevice& sycl_device) {
828  auto tensorRange = Sizes<64, 32>(64, 32);
829  Eigen::IndexList<Eigen::type2index<1>> red_axis;
830  auto reduced_tensorRange = Sizes<64>(64);
834 
835  in_fix.setRandom();
836  redux_fix = in_fix.mean(red_axis);
837 
838  DataType* gpu_in_data = static_cast<DataType*>(
839  sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
840  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
841  redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
842 
844  gpu_in_data, tensorRange);
846  gpu_out_data, reduced_tensorRange);
847 
848  sycl_device.memcpyHostToDevice(
849  gpu_in_data, in_fix.data(),
850  (in_fix.dimensions().TotalSize()) * sizeof(DataType));
851  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
852  sycl_device.memcpyDeviceToHost(
853  redux_gpu_fix.data(), gpu_out_data,
854  redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
855  sycl_device.synchronize();
856  // Check that the CPU and GPU reductions return the same result.
857  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
858  VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
859  }
860 
861  sycl_device.deallocate(gpu_in_data);
862  sycl_device.deallocate(gpu_out_data);
863 }
864 
865 // SYCL supports a generic case of reduction where the accumulator is a
866 // different type than the input data This is an example on how to get if a
867 // Tensor contains nan and/or inf in one reduction
868 template <typename InT, typename OutT>
870  static const bool PacketAccess = false;
871  static const bool IsStateful = false;
872 
873  static constexpr OutT InfBit = 1;
874  static constexpr OutT NanBit = 2;
875 
877  OutT* accum) const {
878  if (Eigen::numext::isinf(x))
879  *accum |= InfBit;
880  else if (Eigen::numext::isnan(x))
881  *accum |= NanBit;
882  }
883 
885  OutT* accum) const {
886  *accum |= x;
887  }
888 
890  return OutT(0);
891  }
892 
893  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
894  return accum;
895  }
896 };
897 
898 template <typename DataType, typename AccumType, int DataLayout,
899  typename IndexType>
901  const Eigen::SyclDevice& sycl_device) {
902  constexpr IndexType InSize = 64;
903  auto tensorRange = Sizes<InSize>(InSize);
904  Eigen::IndexList<Eigen::type2index<0>> dims;
905  auto reduced_tensorRange = Sizes<>();
908 
910 
911  in_fix.setRandom();
912 
913  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
914  DataType* gpu_in_data =
915  static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
916  AccumType* gpu_out_data =
917  static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
918 
920  gpu_in_data, tensorRange);
922  gpu_out_data, reduced_tensorRange);
923 
924  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
925  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
926  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
927  sizeof(AccumType));
928  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
929 
930  sycl_device.deallocate(gpu_in_data);
931  sycl_device.deallocate(gpu_out_data);
932 }
933 
934 template <typename DataType, typename Dev>
935 void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
936  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
937  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
938  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
939  test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
940  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
941  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
942 
943  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
944  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
945  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
946  sycl_device);
947  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
948  sycl_device);
949  sycl_device.synchronize();
950 }
951 
952 template <typename DataType, typename Dev>
953 void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
954  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
955  sycl_device);
956  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
957  sycl_device);
958  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
959  sycl_device);
960  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
961  sycl_device);
962  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
963  sycl_device);
964  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
965  sycl_device);
966  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
967  sycl_device);
968  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
969  sycl_device);
970  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
971  sycl_device);
972  sycl_device.synchronize();
973 }
974 
975 template <typename DataType, typename Dev>
976 void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
977  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
978  4197, 4097);
979  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
980  4197, 4097);
981  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
982  129, 8);
983  test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
984  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
985  sycl_device);
986  sycl_device.synchronize();
987 }
988 
989 template <typename DataType, typename Dev>
990 void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
991  test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
992  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
993  sycl_device);
994  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
995  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
996  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
997  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
998  sycl_device.synchronize();
999 }
1000 
1001 EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
1002  for (const auto& device : Eigen::get_sycl_supported_devices()) {
1003  std::cout << "Running on "
1004  << device.template get_info<cl::sycl::info::device::name>()
1005  << std::endl;
1006  QueueInterface queueInterface(device);
1007  auto sycl_device = Eigen::SyclDevice(&queueInterface);
1008  CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
1009  CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
1011  sycl_reduction_test_first_dim_per_device<float>(sycl_device));
1012  CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
1013  }
1014 }
Eigen::TensorFixedSize::dimensions
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE Dimensions & dimensions() const
Definition: TensorFixedSize.h:64
Eigen::Tensor
The tensor class.
Definition: Tensor.h:63
test_last_dim_reductions_sum_sycl
static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:740
EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
CustomReducer::IsStateful
static const bool IsStateful
Definition: cxx11_tensor_reduction_sycl.cpp:871
test_full_reductions_sum_with_offset_sycl
static void test_full_reductions_sum_with_offset_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:66
Eigen::array
Definition: EmulateArray.h:21
CustomReducer
Definition: cxx11_tensor_reduction_sycl.cpp:869
VERIFY_IS_EQUAL
#define VERIFY_IS_EQUAL(a, b)
Definition: main.h:386
CustomReducer::reduce
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x, OutT *accum) const
Definition: cxx11_tensor_reduction_sycl.cpp:884
DataLayout
static const int DataLayout
Definition: cxx11_tensor_image_patch_sycl.cpp:24
x
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy x
Definition: gnuplot_common_settings.hh:12
Eigen::TensorBase< Tensor< Scalar_, NumIndices_, Options_, IndexType_ > >::reshape
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE TensorReshapingOp< const NewDimensions, const Tensor< Scalar_, NumIndices_, Options_, IndexType_ > > reshape(const NewDimensions &newDimensions) const
Definition: TensorBase.h:1055
Eigen::numext::isinf
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool() isinf(const Eigen::bfloat16 &h)
Definition: BFloat16.h:665
test_last_dim_reductions_max_with_offset_sycl
static void test_last_dim_reductions_max_with_offset_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:544
sycl_reduction_test_last_dim_per_device
void sycl_reduction_test_last_dim_per_device(const Dev &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:990
test_first_dim_reductions_max_sycl
static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:441
sycl_reduction_test_first_dim_per_device
void sycl_reduction_test_first_dim_per_device(const Dev &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:976
test_last_dim_reductions_mean_sycl
static void test_last_dim_reductions_mean_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:695
CustomReducer::initialize
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const
Definition: cxx11_tensor_reduction_sycl.cpp:889
Eigen::DSizes
Definition: TensorDimensions.h:263
test_last_reductions_mean_sycl
static void test_last_reductions_mean_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:826
Eigen::Sizes
Definition: TensorDimensions.h:93
CustomReducer::finalize
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const
Definition: cxx11_tensor_reduction_sycl.cpp:893
CALL_SUBTEST_4
#define CALL_SUBTEST_4(FUNC)
Definition: split_test_helper.h:22
test_full_reductions_mean_sycl
static void test_full_reductions_mean_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:186
test_full_reductions_mean_with_odd_offset_sycl
static void test_full_reductions_mean_with_odd_offset_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:316
CALL_SUBTEST_3
#define CALL_SUBTEST_3(FUNC)
Definition: split_test_helper.h:16
test_full_reductions_max_sycl
static void test_full_reductions_max_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:108
sycl_reduction_test_full_per_device
void sycl_reduction_test_full_per_device(const Dev &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:935
CALL_SUBTEST_1
#define CALL_SUBTEST_1(FUNC)
Definition: split_test_helper.h:4
CustomReducer::reduce
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x, OutT *accum) const
Definition: cxx11_tensor_reduction_sycl.cpp:876
test_last_reductions_sum_sycl
static void test_last_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:785
CustomReducer::InfBit
static constexpr OutT InfBit
Definition: cxx11_tensor_reduction_sycl.cpp:873
j
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2
Eigen::Tensor::dimensions
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE Dimensions & dimensions() const
Definition: Tensor.h:102
Eigen::TensorMap::data
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data()
Definition: TensorMap.h:137
sycl_reduction_full_offset_per_device
void sycl_reduction_full_offset_per_device(const Dev &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:953
EIGEN_STRONG_INLINE
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
Eigen::DSizes::TotalSize
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const
Definition: TensorDimensions.h:271
Eigen::numext::isnan
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool() isnan(const Eigen::bfloat16 &h)
Definition: BFloat16.h:659
Eigen::TensorMap
A tensor expression mapping an existing array of data.
Definition: TensorForwardDeclarations.h:52
test_first_dim_reductions_mean_sycl
static void test_first_dim_reductions_mean_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:649
CustomReducer::PacketAccess
static const bool PacketAccess
Definition: cxx11_tensor_reduction_sycl.cpp:870
CustomReducer::NanBit
static constexpr OutT NanBit
Definition: cxx11_tensor_reduction_sycl.cpp:874
CALL_SUBTEST_2
#define CALL_SUBTEST_2(FUNC)
Definition: split_test_helper.h:10
EIGEN_DECLARE_TEST
EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl)
Definition: cxx11_tensor_reduction_sycl.cpp:1001
Eigen::internal::array_prod
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes< Indices... > &)
Definition: TensorDimensions.h:140
offset
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Definition: gnuplot_common_settings.hh:64
VERIFY_IS_APPROX
#define VERIFY_IS_APPROX(a, b)
Definition: integer_types.cpp:15
Eigen::TensorBase< Tensor< Scalar_, NumIndices_, Options_, IndexType_ > >::setRandom
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor< Scalar_, NumIndices_, Options_, IndexType_ > & setRandom()
Definition: TensorBase.h:996
VERIFY_IS_NOT_EQUAL
#define VERIFY_IS_NOT_EQUAL(a, b)
Definition: main.h:387
main.h
test_full_reductions_min_with_offset_sycl
static void test_full_reductions_min_with_offset_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:396
Eigen::Tensor::data
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar * data()
Definition: Tensor.h:104
Eigen::internal::MeanReducer
Definition: TensorFunctors.h:111
test_first_dim_reductions_sum_sycl
static void test_first_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device, IndexType dim_x, IndexType dim_y)
Definition: cxx11_tensor_reduction_sycl.cpp:609
Eigen::TensorFixedSize
The fixed sized version of the tensor class.
Definition: TensorFixedSize.h:27
Eigen::Tensor::size
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const
Definition: Tensor.h:103
test_full_reductions_mean_with_offset_sycl
static void test_full_reductions_mean_with_offset_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:273
test_full_reductions_min_sycl
static void test_full_reductions_min_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:362
Eigen::TensorFixedSize::data
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar * data()
Definition: TensorFixedSize.h:66
test_full_reductions_max_with_offset_sycl
static void test_full_reductions_max_with_offset_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:140
test_full_reductions_custom_sycl
static void test_full_reductions_custom_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:900
test_full_reductions_sum_sycl
static void test_full_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:26
test_first_dim_reductions_max_with_offset_sycl
static void test_first_dim_reductions_max_with_offset_sycl(const Eigen::SyclDevice &sycl_device)
Definition: cxx11_tensor_reduction_sycl.cpp:487
i
int i
Definition: BiCGSTAB_step_by_step.cpp:9


gtsam
Author(s):
autogenerated on Tue Jan 7 2025 04:02:07