TensorConvolution.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12 
13 namespace Eigen {
14 
22 namespace internal {
23 
24 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
25 class IndexMapper {
26  public:
27  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
28  const array<Index, NumKernelDims>& indices) {
29 
30  array<Index, NumDims> dimensions = input_dims;
31  for (int i = 0; i < NumKernelDims; ++i) {
32  const Index index = indices[i];
33  const Index input_dim = input_dims[index];
34  const Index kernel_dim = kernel_dims[i];
35  const Index result_dim = input_dim - kernel_dim + 1;
36  dimensions[index] = result_dim;
37  }
38 
39  array<Index, NumDims> inputStrides;
40  array<Index, NumDims> outputStrides;
41  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
42  inputStrides[0] = 1;
43  outputStrides[0] = 1;
44  for (int i = 1; i < NumDims; ++i) {
45  inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
46  outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
47  }
48  } else {
49  inputStrides[NumDims - 1] = 1;
50  outputStrides[NumDims - 1] = 1;
51  for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
52  inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
53  outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
54  }
55  }
56 
57  array<Index, NumDims> gpuInputDimensions;
58  array<Index, NumDims> gpuOutputDimensions;
 59  array<Index, NumDims> tmp = dimensions;
 60  array<Index, NumDims> ordering;
 61  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
62  ? 0
63  : NumDims - NumKernelDims;
64  for (int i = 0; i < NumKernelDims; ++i) {
65  const Index index = i + offset;
66  ordering[index] = indices[i];
67  tmp[indices[i]] = -1;
68  gpuInputDimensions[index] = input_dims[indices[i]];
69  gpuOutputDimensions[index] = dimensions[indices[i]];
70  }
71 
72  int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
73  ? NumKernelDims
74  : 0;
75  for (int i = 0; i < NumDims; ++i) {
76  if (tmp[i] >= 0) {
77  ordering[written] = i;
78  gpuInputDimensions[written] = input_dims[i];
79  gpuOutputDimensions[written] = dimensions[i];
80  ++written;
81  }
82  }
83 
84  for (int i = 0; i < NumDims; ++i) {
85  m_inputStrides[i] = inputStrides[ordering[i]];
86  m_outputStrides[i] = outputStrides[ordering[i]];
87  }
88 
89  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
90  for (int i = 0; i < NumDims; ++i) {
91  if (i > NumKernelDims) {
 92  m_gpuInputStrides[i] =
 93  m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
 94  m_gpuOutputStrides[i] =
 95  m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
96  } else {
97  m_gpuInputStrides[i] = 1;
98  m_gpuOutputStrides[i] = 1;
99  }
100  }
101  } else {
102  for (int i = NumDims - 1; i >= 0; --i) {
103  if (static_cast<size_t>(i + 1) < offset) {
 104  m_gpuInputStrides[i] =
 105  m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
 106  m_gpuOutputStrides[i] =
 107  m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
108  } else {
109  m_gpuInputStrides[i] = 1;
110  m_gpuOutputStrides[i] = 1;
111  }
112  }
113  }
114  }
115 
 116  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
 117  Index inputIndex = 0;
118  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
119  for (int d = NumDims - 1; d > NumKernelDims; --d) {
120  const Index idx = p / m_gpuInputStrides[d];
121  inputIndex += idx * m_inputStrides[d];
122  p -= idx * m_gpuInputStrides[d];
123  }
124  inputIndex += p * m_inputStrides[NumKernelDims];
125  } else {
126  std::ptrdiff_t limit = 0;
127  if (NumKernelDims < NumDims) {
128  limit = NumDims - NumKernelDims - 1;
129  }
130  for (int d = 0; d < limit; ++d) {
131  const Index idx = p / m_gpuInputStrides[d];
132  inputIndex += idx * m_inputStrides[d];
133  p -= idx * m_gpuInputStrides[d];
134  }
135  inputIndex += p * m_inputStrides[limit];
136  }
137  return inputIndex;
138  }
139 
 140  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
 141  Index outputIndex = 0;
142  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
143  for (int d = NumDims - 1; d > NumKernelDims; --d) {
144  const Index idx = p / m_gpuOutputStrides[d];
145  outputIndex += idx * m_outputStrides[d];
146  p -= idx * m_gpuOutputStrides[d];
147  }
148  outputIndex += p * m_outputStrides[NumKernelDims];
149  } else {
150  std::ptrdiff_t limit = 0;
151  if (NumKernelDims < NumDims) {
152  limit = NumDims - NumKernelDims - 1;
153  }
154  for (int d = 0; d < limit; ++d) {
155  const Index idx = p / m_gpuOutputStrides[d];
156  outputIndex += idx * m_outputStrides[d];
157  p -= idx * m_gpuOutputStrides[d];
158  }
159  outputIndex += p * m_outputStrides[limit];
160  }
161  return outputIndex;
162  }
163 
 164  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
 165  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
166  ? 0
167  : NumDims - NumKernelDims;
168  return i * m_inputStrides[offset];
169  }
170 
 171  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
 172  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
173  ? 0
174  : NumDims - NumKernelDims;
175  return i * m_outputStrides[offset];
176  }
177 
 178  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
 179  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
180  ? 0
181  : NumDims - NumKernelDims;
182  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
183  }
184 
 185  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
 186  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
187  ? 0
188  : NumDims - NumKernelDims;
189  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
190  }
191 
 192  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
 193  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
194  ? 0
195  : NumDims - NumKernelDims;
196  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
197  k * m_inputStrides[offset + 2];
198  }
199 
 200  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
 201  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
202  ? 0
203  : NumDims - NumKernelDims;
204  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
205  k * m_outputStrides[offset + 2];
206  }
207 
208  private:
 209  static const int NumDims = internal::array_size<InputDims>::value;
 210  array<Index, NumDims> m_inputStrides;
 211  array<Index, NumDims> m_outputStrides;
 212  array<Index, NumDims> m_gpuInputStrides;
 213  array<Index, NumDims> m_gpuOutputStrides;
 214 };
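// Illustrative worked example (editor's sketch, not part of the upstream
// header), assuming ColMajor layout: for a 5 x 6 x 7 input convolved along
// dimension 1 with a kernel of length 3, the output extent along that
// dimension is 6 - 3 + 1 = 4, so `dimensions` becomes 5 x 4 x 7 and
//   inputStrides  = {1, 5, 30}   // 1, 5, 5*6
//   outputStrides = {1, 5, 20}   // 1, 5, 5*4
// The mapGpu*KernelToTensor*Offset helpers then simply multiply each
// kernel-plane coordinate by the stride of its (reordered) dimension, e.g.
// mapGpuInputKernelToTensorInputOffset(i) == i * m_inputStrides[offset].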
215 
216 
217 
218 template<typename Dimensions, typename InputXprType, typename KernelXprType>
219 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
220 {
221  // Type promotion to handle the case where the types of the lhs and the rhs are different.
 222  typedef typename promote_storage_type<typename InputXprType::Scalar,
 223                                        typename KernelXprType::Scalar>::ret Scalar;
 224  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
 225                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
 226  typedef typename promote_index_type<typename traits<InputXprType>::Index,
 227                                      typename traits<KernelXprType>::Index>::type Index;
 228  typedef typename InputXprType::Nested LhsNested;
 229  typedef typename KernelXprType::Nested RhsNested;
 230  typedef typename remove_reference<LhsNested>::type _LhsNested;
 231  typedef typename remove_reference<RhsNested>::type _RhsNested;
 232  static const int NumDimensions = traits<InputXprType>::NumDimensions;
 233  static const int Layout = traits<InputXprType>::Layout;
 234  typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
 235                               typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
 236 
237  enum {
238  Flags = 0
239  };
240 };
241 
242 template<typename Dimensions, typename InputXprType, typename KernelXprType>
243 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
244 {
 245  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
 246 };
247 
248 template<typename Dimensions, typename InputXprType, typename KernelXprType>
249 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
250 {
 251  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
 252 };
253 
254 } // end namespace internal
255 
256 
257 
258 template<typename Indices, typename InputXprType, typename KernelXprType>
259 class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
260 {
261  public:
 262  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
 263  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
 264  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
 265                                                  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
 266  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
 267  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
 268  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
 269 
270  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
271  : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
272 
 273  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 274  const Indices& indices() const { return m_indices; }
 275 
 276  /** \returns the nested expressions */
 277  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 278  const typename internal::remove_all<typename InputXprType::Nested>::type&
 279  inputExpression() const { return m_input_xpr; }
 280 
 281  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 282  const typename internal::remove_all<typename KernelXprType::Nested>::type&
 283  kernelExpression() const { return m_kernel_xpr; }
284 
285  protected:
286  typename InputXprType::Nested m_input_xpr;
287  typename KernelXprType::Nested m_kernel_xpr;
 288  const Indices m_indices;
 289 };
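// Illustrative usage (editor's sketch, not part of the upstream header):
// TensorConvolutionOp is normally created via TensorBase::convolve(), e.g.
//
//   Eigen::Tensor<float, 3> input(4, 20, 7);
//   Eigen::Tensor<float, 1> kernel(3);
//   input.setRandom();
//   kernel.setRandom();
//   Eigen::array<ptrdiff_t, 1> dims({1});   // convolve along dimension 1
//   Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
//   // output is 4 x 18 x 7, since 20 - 3 + 1 = 18 (no padding is applied).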
290 
291 
292 template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
293 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
294 {
 295  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
 296 
 297  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
 298  static const int NumKernelDims = internal::array_size<Indices>::value;
 299  typedef typename XprType::Index Index;
 300  typedef DSizes<Index, NumDims> Dimensions;
 301 
 302  typedef typename XprType::Scalar Scalar;
 303  typedef typename XprType::CoeffReturnType CoeffReturnType;
 304  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 305  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 306  typedef StorageMemory<Scalar, Device> Storage;
 307  typedef typename Storage::Type EvaluatorPointerType;
 308 
 309  enum {
 310  IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
 311  PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
 312  BlockAccess = false,
 313  PreferBlockAccess = false,
 314  Layout = TensorEvaluator<InputArgType, Device>::Layout,
 315  CoordAccess = false, // to be implemented
 316  RawAccess = false
 317  };
 318 
 319  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
 320  typedef internal::TensorBlockNotImplemented TensorBlock;
 321  //===--------------------------------------------------------------------===//
322 
323  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
324  : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
325  {
326  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
327 
328  const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
329  const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
330 
331  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
332  m_inputStride[0] = 1;
333  for (int i = 1; i < NumDims; ++i) {
334  m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
335  }
336  } else {
337  m_inputStride[NumDims - 1] = 1;
338  for (int i = NumDims - 2; i >= 0; --i) {
339  m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
340  }
341  }
342 
343  m_dimensions = m_inputImpl.dimensions();
344  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
345  for (int i = 0; i < NumKernelDims; ++i) {
346  const Index index = op.indices()[i];
347  const Index input_dim = input_dims[index];
348  const Index kernel_dim = kernel_dims[i];
349  const Index result_dim = input_dim - kernel_dim + 1;
350  m_dimensions[index] = result_dim;
351  if (i > 0) {
352  m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
353  } else {
354  m_kernelStride[0] = 1;
355  }
356  m_indexStride[i] = m_inputStride[index];
357  }
358 
359  m_outputStride[0] = 1;
360  for (int i = 1; i < NumDims; ++i) {
361  m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
362  }
363  } else {
364  for (int i = NumKernelDims - 1; i >= 0; --i) {
365  const Index index = op.indices()[i];
366  const Index input_dim = input_dims[index];
367  const Index kernel_dim = kernel_dims[i];
368  const Index result_dim = input_dim - kernel_dim + 1;
369  m_dimensions[index] = result_dim;
370  if (i < NumKernelDims - 1) {
371  m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
372  } else {
373  m_kernelStride[NumKernelDims - 1] = 1;
374  }
375  m_indexStride[i] = m_inputStride[index];
376  }
377 
378  m_outputStride[NumDims - 1] = 1;
379  for (int i = NumDims - 2; i >= 0; --i) {
380  m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
381  }
382  }
383  }
384 
385  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
386 
 387  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
 388  m_inputImpl.evalSubExprsIfNeeded(NULL);
389  preloadKernel();
390  return true;
391  }
 392  EIGEN_STRONG_INLINE void cleanup() {
 393  m_inputImpl.cleanup();
394  if (m_local_kernel) {
395  m_device.deallocate((void*)m_kernel);
396  m_local_kernel = false;
397  }
398  m_kernel = NULL;
399  }
400 
401  void evalTo(typename XprType::Scalar* buffer) {
402  evalSubExprsIfNeeded(NULL);
403  for (int i = 0; i < dimensions().TotalSize(); ++i) {
404  buffer[i] += coeff(i);
405  }
406  cleanup();
407  }
408 
409  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
410  {
411  CoeffReturnType result = CoeffReturnType(0);
412  convolve(firstInput(index), 0, NumKernelDims-1, result);
413  return result;
414  }
415 
416  template<int LoadMode>
417  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
418  {
419  Index indices[2] = {index, index+PacketSize-1};
420  Index startInputs[2] = {0, 0};
421  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
422  for (int i = NumDims - 1; i > 0; --i) {
423  const Index idx0 = indices[0] / m_outputStride[i];
424  const Index idx1 = indices[1] / m_outputStride[i];
425  startInputs[0] += idx0 * m_inputStride[i];
426  startInputs[1] += idx1 * m_inputStride[i];
427  indices[0] -= idx0 * m_outputStride[i];
428  indices[1] -= idx1 * m_outputStride[i];
429  }
430  } else {
431  for (int i = 0; i < NumDims - 1; ++i) {
432  const Index idx0 = indices[0] / m_outputStride[i];
433  const Index idx1 = indices[1] / m_outputStride[i];
434  startInputs[0] += idx0 * m_inputStride[i];
435  startInputs[1] += idx1 * m_inputStride[i];
436  indices[0] -= idx0 * m_outputStride[i];
437  indices[1] -= idx1 * m_outputStride[i];
438  }
439  }
440  startInputs[0] += indices[0];
441  startInputs[1] += indices[1];
442 
443  if (startInputs[1]-startInputs[0] == PacketSize-1) {
444  PacketReturnType result = internal::pset1<PacketReturnType>(0);
445  convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
446  return result;
447  } else {
448  EIGEN_ALIGN_MAX Scalar data[PacketSize];
449  data[0] = Scalar(0);
450  convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
451  for (int i = 1; i < PacketSize-1; ++i) {
452  data[i] = Scalar(0);
453  convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
454  }
455  data[PacketSize-1] = Scalar(0);
456  convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
457  return internal::pload<PacketReturnType>(data);
458  }
459  }
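// Editor's note (not part of the upstream header): the packet path above has a
// fast case and a fallback. When the PacketSize consecutive output coefficients
// map to consecutive input start offsets (startInputs[1] - startInputs[0] ==
// PacketSize - 1), a single vectorized convolvePacket() call suffices;
// otherwise each lane is convolved separately into an aligned stack buffer and
// the result is reloaded as a packet.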
460 
 461  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
 462  costPerCoeff(bool vectorized) const {
463  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
464  // We ignore the use of fused multiply-add.
465  const double convolve_compute_cost =
466  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
467  const double firstIndex_compute_cost =
468  NumDims *
469  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
470  TensorOpCost::DivCost<Index>());
471  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
472  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
473  m_kernelImpl.costPerCoeff(vectorized) +
474  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
475  PacketSize));
476  }
477 
478  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
479 
480  private:
 481  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
 482  Index startInput = 0;
483  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
484  for (int i = NumDims - 1; i > 0; --i) {
485  const Index idx = index / m_outputStride[i];
486  startInput += idx * m_inputStride[i];
487  index -= idx * m_outputStride[i];
488  }
489  } else {
490  for (int i = 0; i < NumDims - 1; ++i) {
491  const Index idx = index / m_outputStride[i];
492  startInput += idx * m_inputStride[i];
493  index -= idx * m_outputStride[i];
494  }
495  }
496  startInput += index;
497  return startInput;
498  }
499 
500  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
501  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
502  const Index input = firstIndex + j * m_indexStride[DimIndex];
503  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
504  if (DimIndex > 0) {
505  convolve(input, kernel, DimIndex-1, accum);
506  } else {
507  accum += m_inputImpl.coeff(input) * m_kernel[kernel];
508  }
509  }
510  }
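// Editor's note (not part of the upstream header): for NumKernelDims == 2 the
// recursion above expands, per output coefficient, to
//   accum += m_inputImpl.coeff(firstIndex + j0*m_indexStride[0] + j1*m_indexStride[1])
//            * m_kernel[firstKernel + j0*m_kernelStride[0] + j1*m_kernelStride[1]]
// summed over 0 <= j0 < kernel_dim0 and 0 <= j1 < kernel_dim1, i.e. a sliding
// dot product of the (un-flipped) kernel against the input window.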
511 
512  template <typename Packet>
513  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
514  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
515  const Index input = firstIndex + j * m_indexStride[DimIndex];
516  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
517  if (DimIndex > 0) {
518  convolvePacket(input, kernel, DimIndex-1, accum);
519  } else {
520  accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
521  }
522  }
523  }
524 
 525  EIGEN_STRONG_INLINE void preloadKernel() {
 526  // Don't make a local copy of the kernel unless we have to (i.e. it's an
527  // expression that needs to be evaluated)
528  const Scalar* in_place = m_kernelImpl.data();
529  if (in_place) {
530  m_kernel = in_place;
531  m_local_kernel = false;
532  } else {
533  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
534  Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
 535  typedef TensorEvalToOp<const KernelArgType> EvalTo;
 536  EvalTo evalToTmp(local, m_kernelArg);
 537  const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
 538  internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
539 
540  m_kernel = local;
541  m_local_kernel = true;
542  }
543  }
544 
 545  array<Index, NumDims> m_inputStride;
 546  array<Index, NumDims> m_outputStride;
 547 
 548  array<Index, NumKernelDims> m_indexStride;
 549  array<Index, NumKernelDims> m_kernelStride;
 550  TensorEvaluator<InputArgType, Device> m_inputImpl;
 551  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
 552  Dimensions m_dimensions;
 553 
 554  KernelArgType m_kernelArg;
 555  const Scalar* m_kernel;
 556  bool m_local_kernel;
 557  const Device EIGEN_DEVICE_REF m_device;
 558 };
559 
560 
561 
562 
563 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
564 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
565 
566 template <int StaticKernelSize>
567 struct GetKernelSize {
568  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
569  return StaticKernelSize;
570  }
571 };
572 template <>
573 struct GetKernelSize<Dynamic> {
574  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
575  return kernelSize;
576  }
577 };
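// Editor's note (not part of the upstream header): GetKernelSize lets the GPU
// kernels below be instantiated either with a compile-time kernel size, so the
// #pragma unroll loops have a constant trip count, or with Dynamic, in which
// case the runtime argument is used:
//   GetKernelSize<4>()(7)        // returns 4 (the static size wins)
//   GetKernelSize<Dynamic>()(7)  // returns 7 (the runtime size is used)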
578 
579 template <typename InputEvaluator, typename Index, typename InputDims,
580  int StaticKernelSize>
581 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
582  InputEvaluator eval,
 583  const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
 584  indexMapper,
585  const float* __restrict kernel, const int numPlanes, const int numX,
586  const int maxX, const int kernelSize, float* buffer) {
587 #if defined(EIGEN_HIPCC)
588  HIP_DYNAMIC_SHARED(float, s)
589 #else
590  extern __shared__ float s[];
591 #endif
592 
593  const int first_x = blockIdx.x * maxX;
594  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
595  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
596  const int num_x_output = last_x - first_x + 1;
597 
598  const int first_plane = blockIdx.y * blockDim.y;
599  const int plane_stride = blockDim.y * gridDim.y;
600 
601  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
602  // Load inputs to shared memory
603  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
604  const int plane_kernel_offset = threadIdx.y * num_x_input;
605  #pragma unroll
606  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
607  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
608  s[i + plane_kernel_offset] = eval.coeff(tensor_index);
609  }
610 
611  __syncthreads();
612 
613  // Compute the convolution
614  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
615 
616  #pragma unroll
617  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
618  const int kernel_offset = plane_kernel_offset + i;
619  float result = 0.0f;
620  #pragma unroll
621  for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
622  result += s[k + kernel_offset] * kernel[k];
623  }
624  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
625  buffer[tensor_index] = result;
626  }
627  __syncthreads();
628  }
629 };
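// Editor's note (not part of the upstream header): each block stages a 1-D tile
// of the input in shared memory. One row of the tile holds up to
// num_x_input = maxX + kernelSize - 1 floats (output width plus kernel halo),
// and blockDim.y planes are staged at once; e.g. with maxX = 128 and
// kernelSize = 4 each plane needs 131 floats, matching the
// block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar) shared-memory size
// requested at the launch site in executeEval() below.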
630 
631 template <typename InputEvaluator, typename Index, typename InputDims,
632  int StaticKernelSizeX, int StaticKernelSizeY>
633 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
634  InputEvaluator eval,
 635  const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
 636  indexMapper,
637  const float* __restrict kernel, const int numPlanes, const int numX,
638  const int maxX, const int numY, const int maxY, const int kernelSizeX,
639  const int kernelSizeY, float* buffer) {
640 #if defined(EIGEN_HIPCC)
641  HIP_DYNAMIC_SHARED(float, s)
642 #else
643  extern __shared__ float s[];
644 #endif
645 
646  const int first_x = blockIdx.x * maxX;
647  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
648  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
649  const int num_x_output = last_x - first_x + 1;
650 
651  const int first_y = blockIdx.y * maxY;
652  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
653  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
654  const int num_y_output = last_y - first_y + 1;
655 
656  const int first_plane = blockIdx.z * blockDim.z;
657  const int plane_stride = blockDim.z * gridDim.z;
658 
659  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
660 
661  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
662  const int plane_kernel_offset = threadIdx.z * num_y_input;
663 
664  // Load inputs to shared memory
665  #pragma unroll
666  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
667  const int input_offset = num_x_input * (j + plane_kernel_offset);
668  #pragma unroll
669  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
670  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
671  s[i + input_offset] = eval.coeff(tensor_index);
672  }
673  }
674 
675  __syncthreads();
676 
677  // Convolution
678  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
679 
680  #pragma unroll
681  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
682  #pragma unroll
683  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
684  float result = 0.0f;
685  #pragma unroll
686  for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
687  const int kernel_offset = kernelSizeX * l;
688  const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
689  #pragma unroll
690  for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
691  result += s[k + input_offset] * kernel[k + kernel_offset];
692  }
693  }
694  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
695  buffer[tensor_index] = result;
696  }
697  }
698 
699  __syncthreads();
700  }
701 };
702 
703 template <typename InputEvaluator, typename Index, typename InputDims>
704 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
705  InputEvaluator eval,
 706  const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
 707  indexMapper,
708  const float* __restrict kernel, const size_t numPlanes, const size_t numX,
709  const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
710  const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
711  const size_t kernelSizeZ, float* buffer) {
712 #if defined(EIGEN_HIPCC)
713  HIP_DYNAMIC_SHARED(float, s)
714 #else
715  extern __shared__ float s[];
716 #endif
717 
718  // Load inputs to shared memory
719  const int first_x = blockIdx.x * maxX;
720  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
721  const int num_x_input = last_x - first_x + kernelSizeX;
722 
723  const int first_y = blockIdx.y * maxY;
724  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
725  const int num_y_input = last_y - first_y + kernelSizeY;
726 
727  const int first_z = blockIdx.z * maxZ;
728  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
729  const int num_z_input = last_z - first_z + kernelSizeZ;
730 
731  for (int p = 0; p < numPlanes; ++p) {
732 
733  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
734  const int plane_kernel_offset = 0;
735 
736  for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
737  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
738  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
739  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
740  s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
741  }
742  }
743  }
744 
745  __syncthreads();
746 
747  // Convolution
748  const int num_z_output = last_z - first_z + 1;
749  const int num_y_output = last_y - first_y + 1;
750  const int num_x_output = last_x - first_x + 1;
751  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
752 
753  for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
754  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
755  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
756  float result = 0.0f;
757  for (int n = 0; n < kernelSizeZ; ++n) {
758  for (int m = 0; m < kernelSizeY; ++m) {
759  for (int l = 0; l < kernelSizeX; ++l) {
760  result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
761  }
762  }
763  }
764  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
765  buffer[tensor_index] = result;
766  }
767  }
768  }
769  __syncthreads();
770  }
771 };
772 
773 
774 
775 template<typename Indices, typename InputArgType, typename KernelArgType>
776 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
777 {
 778  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
 779 
 780  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
 781  static const int NumKernelDims = internal::array_size<Indices>::value;
 782  typedef typename XprType::Index Index;
 783  typedef DSizes<Index, NumDims> Dimensions;
 784  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
785 
786  enum {
 787  IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
 788  PacketAccess = false,
789  BlockAccess = false,
790  PreferBlockAccess = false,
 791  Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
 792  CoordAccess = false, // to be implemented
793  RawAccess = false
794  };
795 
796  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
 797  typedef internal::TensorBlockNotImplemented TensorBlock;
 798  //===--------------------------------------------------------------------===//
799 
800  TensorEvaluator(const XprType& op, const GpuDevice& device)
801  : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
802  {
803  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
804 
805  const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
806  const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
807 
808  m_dimensions = m_inputImpl.dimensions();
809  for (int i = 0; i < NumKernelDims; ++i) {
810  const Index index = op.indices()[i];
811  const Index input_dim = input_dims[index];
812  const Index kernel_dim = kernel_dims[i];
813  const Index result_dim = input_dim - kernel_dim + 1;
814  m_dimensions[index] = result_dim;
815  }
816  }
817 
818  typedef typename XprType::CoeffReturnType CoeffReturnType;
 819  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
 820  typedef typename InputArgType::Scalar Scalar;
821  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
822 
823  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
824 
825  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
826  preloadKernel();
827  m_inputImpl.evalSubExprsIfNeeded(NULL);
828  if (data) {
829  executeEval(data);
830  return false;
831  } else {
832  m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
833  executeEval(m_buf);
834  return true;
835  }
836  }
837 
838  EIGEN_STRONG_INLINE void cleanup() {
839  m_inputImpl.cleanup();
840  if (m_buf) {
841  m_device.deallocate(m_buf);
842  m_buf = NULL;
843  }
844  if (m_local_kernel) {
845  m_device.deallocate((void*)m_kernel);
846  m_local_kernel = false;
847  }
848  m_kernel = NULL;
849  }
850 
851  EIGEN_STRONG_INLINE void preloadKernel() {
852  // Don't make a local copy of the kernel unless we have to (i.e. it's an
853  // expression that needs to be evaluated)
854  const Scalar* in_place = m_kernelImpl.data();
855  if (in_place) {
856  m_kernel = in_place;
857  m_local_kernel = false;
858  } else {
859  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
860  Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
 861  typedef TensorEvalToOp<const KernelArgType> EvalTo;
 862  EvalTo evalToTmp(local, m_kernelArg);
 863  const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
 864  internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
865 
866  m_kernel = local;
867  m_local_kernel = true;
868  }
869  }
870 
871  static unsigned int ceil(unsigned int num, unsigned int denom) {
872  const unsigned int rounded_toward_zero = num / denom;
873  if (num > rounded_toward_zero * denom) {
874  return rounded_toward_zero + 1;
875  }
876  return rounded_toward_zero;
877  }
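// Editor's note (not part of the upstream header): this is an integer ceiling
// division, e.g. ceil(10, 4) computes 10 / 4 = 2, notices 10 > 2 * 4, and
// returns 3, while ceil(8, 4) returns exactly 2. It is used below to derive
// grid dimensions from tile sizes.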
878 
879  void executeEval(Scalar* data) const {
880  typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
881 
882  const int maxSharedMem = m_device.sharedMemPerBlock();
883  const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
884  const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
885  const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
886  const int warpSize = 32;
887 
888  switch (NumKernelDims) {
889  case 1: {
890  const int kernel_size = m_kernelImpl.dimensions().TotalSize();
891 
892  const int numX = dimensions()[m_indices[0]];
893  const int numP = dimensions().TotalSize() / numX;
894  int maxX;
895  dim3 block_size;
896 
897  const int single_stride_dim =
898  static_cast<int>(Layout) == static_cast<int>(ColMajor)
899  ? 0
900  : m_inputImpl.dimensions().rank() - 1;
901  if (m_indices[0] == single_stride_dim) {
 902  // Maximize the reuse
903  const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
904  maxX = numext::mini<int>(inner_dim, numX);
905  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
906  block_size.x = numext::mini(maxThreadsPerBlock, maxX);
907  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
908  }
909  else {
 910  // Read as much as possible along the innermost dimension, that is the plane
911  const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
912  const int maxP = numext::mini<int>(inner_dim, numP);
913  maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
914 
915  block_size.x = numext::mini(warpSize, maxX);
916  block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
917  }
918 
919  const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
920  gpu_assert(shared_mem <= maxSharedMem);
921 
922  const int num_x_blocks = ceil(numX, maxX);
923  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
924  const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
925 
926  dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
927 
928 
929  //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
930 
931  const array<Index, 1> indices(m_indices[0]);
932  const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
 933  internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
 934  m_inputImpl.dimensions(), kernel_dims, indices);
935  switch(kernel_size) {
936  case 4: {
937  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
938  break;
939  }
940  case 7: {
941  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
942  break;
943  }
944  default: {
945  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
946  }
947  }
948  break;
949  }
950 
951  case 2: {
952  const int idxX =
953  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
954  const int idxY =
955  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
956  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
957  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
958 
959  const int numX = dimensions()[m_indices[idxX]];
960  const int numY = dimensions()[m_indices[idxY]];
961  const int numP = dimensions().TotalSize() / (numX*numY);
962 
963  const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
964 
965  // Snap maxX to warp size
966  int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
967  const int maxX = numext::mini<int>(inner_dim, numX);
968  const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
969  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
970 
971  dim3 block_size;
972  block_size.x = numext::mini(1024, maxX);
973  block_size.y = numext::mini<int>(1024/block_size.x, maxY);
974  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
975 
976  const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
977  gpu_assert(shared_mem <= maxSharedMem);
978 
979  const int num_x_blocks = ceil(numX, maxX);
980  const int num_y_blocks = ceil(numY, maxY);
981  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
982  const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
983 
984  dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
985 
986 
987  //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
988 
989  const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
990  const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
991  m_kernelImpl.dimensions()[idxY]);
 992  internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
 993  m_inputImpl.dimensions(), kernel_dims, indices);
994  switch (kernel_size_x) {
995  case 4: {
996  switch (kernel_size_y) {
997  case 7: {
998  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
999  break;
1000  }
1001  default: {
1002  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
1003  break;
1004  }
1005  }
1006  break;
1007  }
1008  case 7: {
1009  switch (kernel_size_y) {
1010  case 4: {
1011  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
1012  break;
1013  }
1014  default: {
1015  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
1016  break;
1017  }
1018  }
1019  break;
1020  }
1021  default: {
1022  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
1023  break;
1024  }
1025  }
1026  break;
1027  }
1028 
1029  case 3: {
1030  const int idxX =
1031  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1032  const int idxY =
1033  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1034  const int idxZ =
1035  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1036 
1037  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1038  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1039  const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1040 
1041  const int numX = dimensions()[m_indices[idxX]];
1042  const int numY = dimensions()[m_indices[idxY]];
1043  const int numZ = dimensions()[m_indices[idxZ]];
1044  const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1045 
1046  const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1047  const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1048  const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1049 
1050  dim3 block_size;
1051  block_size.x = numext::mini(32, maxX);
1052  block_size.y = numext::mini(32, maxY);
1053  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1054  dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1055 
1056  const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1057  gpu_assert(shared_mem <= maxSharedMem);
1058 
1059  //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1060  const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1061  m_indices[idxZ]);
1062  const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1063  m_kernelImpl.dimensions()[idxY],
1064  m_kernelImpl.dimensions()[idxZ]);
 1065  internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
 1066  m_inputImpl.dimensions(), kernel_dims, indices);
1067 
1068  LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1069  break;
1070  }
1071 
1072  default: {
1073  EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1074  }
1075  }
1076  }
1077 
1078  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1079  {
1080  eigen_assert(m_buf);
1081  eigen_assert(index < m_dimensions.TotalSize());
1082  return m_buf[index];
1083  }
1084 
1085  template<int LoadMode>
1086  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
1087  {
1088  eigen_assert(m_buf);
1089  eigen_assert(index < m_dimensions.TotalSize());
1090  return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1091  }
1092 
 1093  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
 1094  costPerCoeff(bool vectorized) const {
1095  // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1096  // model.
1097  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1098  // We ignore the use of fused multiply-add.
1099  const double convolve_compute_cost =
1100  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1101  const double firstIndex_compute_cost =
1102  NumDims *
1103  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1104  TensorOpCost::DivCost<Index>());
1105  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1106  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1107  m_kernelImpl.costPerCoeff(vectorized) +
1108  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1109  PacketSize));
1110  }
1111 
1112  private:
1113  // No assignment (copies are needed by the kernels)
1114  TensorEvaluator& operator = (const TensorEvaluator&);
1115 
 1116  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
 1117  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
 1118  KernelArgType m_kernelArg;
1119  Indices m_indices;
1120  Dimensions m_dimensions;
1121  Scalar* m_buf;
1122  const Scalar* m_kernel;
1123  bool m_local_kernel;
1124 
1125  const GpuDevice& m_device;
1126 };
1127 #endif
1128 
1129 
1130 } // end namespace Eigen
1131 
1132 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H