TensorConvolutionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>

//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

namespace Eigen {
enum class convolution_type { CONV1D, CONV2D, CONV3D };

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}

  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
    return (boolean_check[0] && boolean_check[1]);
  }
  void operator()(cl::sycl::nd_item<2> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // number of input coefficients each plane needs staged in local (shared) memory
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
    // fill the local (shared) memory
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);

      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // calculate the convolution; output start in x
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
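
  // Illustrative sizing (assumed numbers): with a local range of 64 work-items
  // in dimension 0 and kernelSize == 5, each work-group stages
  // num_input = 64 + 5 - 1 = 68 consecutive input coefficients per plane into
  // local memory; after the barrier, work-item t accumulates
  // local_acc[t] .. local_acc[t + 4] against the five kernel weights to
  // produce one output coefficient.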
};

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // size of the input tile each plane needs staged in local (shared) memory
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    // fill the local memory
    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
                                  in_range_dim1 && in_range_dim2)
                                     ? device_evaluator.coeff(tensor_index)
                                     : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // output offset start for each thread
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);

      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);

      buffer_ptr[tensor_index] = result;
    }
  }
};

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;

  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

    for (size_t p = 0; p < numP; p++) {
      // fill the local (shared) memory for this plane
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      // calculate the convolution

      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));

              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }

      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};

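// All three kernels above use the same strategy: each work-group stages a tile
// of (local_range + kernel_size - 1) input coefficients per convolved
// dimension into local memory, synchronises on a work-group barrier, and then
// every in-range work-item accumulates one output coefficient from the staged
// tile. The evaluator below selects the kernel matching the number of
// convolved dimensions and launches it through the device's
// binary_kernel_launcher.
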
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      // valid convolution: each convolved dimension shrinks by (kernel_dim - 1),
      // e.g. an input dimension of 10 convolved with a kernel of size 3 yields 8 outputs
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{};
        auto local_range = cl::sycl::range<2>{};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;

        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
        break;
      }

      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
        break;
      }

      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};

        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];

        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
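
  // Illustrative local-memory budget for the 2D case (assumed local range of
  // 16x16x1 work-items and a 3x3 kernel): (16 + 3 - 1) * (16 + 3 - 1) * 1 = 324
  // coefficients staged per work-group, which the code above checks against
  // m_device.sharedMemPerBlock() via gpu_assert.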

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_kernelImpl.bind(cgh);
    m_inputImpl.bind(cgh);
    m_buf.bind(cgh);
    m_kernel.bind(cgh);
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H