TensorConvolution.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12 
13 namespace Eigen {
14 
15 /** \class TensorConvolution
16   * \ingroup CXX11_Tensor_Module
17   *
18   * \brief Tensor convolution class.
19   *
20   *
21   */
22 namespace internal {
23 
24 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
25 class IndexMapper {
26  public:
27  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
28  const array<Index, NumKernelDims>& indices) {
29 
30  array<Index, NumDims> dimensions = input_dims;
31  for (int i = 0; i < NumKernelDims; ++i) {
32  const Index index = indices[i];
33  const Index input_dim = input_dims[index];
34  const Index kernel_dim = kernel_dims[i];
35  const Index result_dim = input_dim - kernel_dim + 1;
36  dimensions[index] = result_dim;
37  }
38 
39  array<Index, NumDims> inputStrides;
40  array<Index, NumDims> outputStrides;
41  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
42  inputStrides[0] = 1;
43  outputStrides[0] = 1;
44  for (int i = 1; i < NumDims; ++i) {
45  inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
46  outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
47  }
48  } else {
49  inputStrides[NumDims - 1] = 1;
50  outputStrides[NumDims - 1] = 1;
51  for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
52  inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
53  outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
54  }
55  }
56 
57  array<Index, NumDims> cudaInputDimensions;
58  array<Index, NumDims> cudaOutputDimensions;
59  array<Index, NumDims> tmp = dimensions;
60  array<Index, NumDims> ordering;
61  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
62  ? 0
63  : NumDims - NumKernelDims;
64  for (int i = 0; i < NumKernelDims; ++i) {
65  const Index index = i + offset;
66  ordering[index] = indices[i];
67  tmp[indices[i]] = -1;
68  cudaInputDimensions[index] = input_dims[indices[i]];
69  cudaOutputDimensions[index] = dimensions[indices[i]];
70  }
71 
72  int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
73  ? NumKernelDims
74  : 0;
75  for (int i = 0; i < NumDims; ++i) {
76  if (tmp[i] >= 0) {
77  ordering[written] = i;
78  cudaInputDimensions[written] = input_dims[i];
79  cudaOutputDimensions[written] = dimensions[i];
80  ++written;
81  }
82  }
83 
84  for (int i = 0; i < NumDims; ++i) {
85  m_inputStrides[i] = inputStrides[ordering[i]];
86  m_outputStrides[i] = outputStrides[ordering[i]];
87  }
88 
89  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
90  for (int i = 0; i < NumDims; ++i) {
91  if (i > NumKernelDims) {
92  m_cudaInputStrides[i] =
93  m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
94  m_cudaOutputStrides[i] =
95  m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
96  } else {
97  m_cudaInputStrides[i] = 1;
98  m_cudaOutputStrides[i] = 1;
99  }
100  }
101  } else {
102  for (int i = NumDims - 1; i >= 0; --i) {
103  if (i + 1 < offset) {
104  m_cudaInputStrides[i] =
105  m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
106  m_cudaOutputStrides[i] =
107  m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
108  } else {
109  m_cudaInputStrides[i] = 1;
110  m_cudaOutputStrides[i] = 1;
111  }
112  }
113  }
114  }
115 
116  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
117  Index inputIndex = 0;
118  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
119  for (int d = NumDims - 1; d > NumKernelDims; --d) {
120  const Index idx = p / m_cudaInputStrides[d];
121  inputIndex += idx * m_inputStrides[d];
122  p -= idx * m_cudaInputStrides[d];
123  }
124  inputIndex += p * m_inputStrides[NumKernelDims];
125  } else {
126  std::ptrdiff_t limit = 0;
127  if (NumKernelDims < NumDims) {
128  limit = NumDims - NumKernelDims - 1;
129  }
130  for (int d = 0; d < limit; ++d) {
131  const Index idx = p / m_cudaInputStrides[d];
132  inputIndex += idx * m_inputStrides[d];
133  p -= idx * m_cudaInputStrides[d];
134  }
135  inputIndex += p * m_inputStrides[limit];
136  }
137  return inputIndex;
138  }
139 
140  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
141  Index outputIndex = 0;
142  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
143  for (int d = NumDims - 1; d > NumKernelDims; --d) {
144  const Index idx = p / m_cudaOutputStrides[d];
145  outputIndex += idx * m_outputStrides[d];
146  p -= idx * m_cudaOutputStrides[d];
147  }
148  outputIndex += p * m_outputStrides[NumKernelDims];
149  } else {
150  std::ptrdiff_t limit = 0;
151  if (NumKernelDims < NumDims) {
152  limit = NumDims - NumKernelDims - 1;
153  }
154  for (int d = 0; d < limit; ++d) {
155  const Index idx = p / m_cudaOutputStrides[d];
156  outputIndex += idx * m_outputStrides[d];
157  p -= idx * m_cudaOutputStrides[d];
158  }
159  outputIndex += p * m_outputStrides[limit];
160  }
161  return outputIndex;
162  }
163 
164  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
165  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
166  ? 0
167  : NumDims - NumKernelDims;
168  return i * m_inputStrides[offset];
169  }
170 
171  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
172  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
173  ? 0
174  : NumDims - NumKernelDims;
175  return i * m_outputStrides[offset];
176  }
177 
178  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
179  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
180  ? 0
181  : NumDims - NumKernelDims;
182  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
183  }
184 
185  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
186  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
187  ? 0
188  : NumDims - NumKernelDims;
189  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
190  }
191 
192  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
193  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
194  ? 0
195  : NumDims - NumKernelDims;
196  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
197  k * m_inputStrides[offset + 2];
198  }
199 
200  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
201  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
202  ? 0
203  : NumDims - NumKernelDims;
204  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
205  k * m_outputStrides[offset + 2];
206  }
207 
208  private:
209  static const int NumDims = internal::array_size<InputDims>::value;
210  array<Index, NumDims> m_inputStrides;
211  array<Index, NumDims> m_outputStrides;
212  array<Index, NumDims> m_cudaInputStrides;
213  array<Index, NumDims> m_cudaOutputStrides;
214 };
215 
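// Illustrative sketch (not part of the original header): for a ColMajor 2D
// input of size 5x4 convolved along dimension 0 with a 3-tap kernel, the
// constructor above computes result_dim = 5 - 3 + 1 = 3, so the output is
// 3x4, and derives the column-major strides
//   inputStrides  = {1, 5}   // from input_dims = {5, 4}
//   outputStrides = {1, 3}   // from dimensions = {3, 4}
// The mapCuda*PlaneToTensor*Offset and mapCuda*KernelToTensor*Offset helpers
// then translate the (plane, kernel-window) coordinates used by the GPU
// kernels back into flat tensor offsets using these strides.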
216 
217 
218 template<typename Dimensions, typename InputXprType, typename KernelXprType>
219 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
220 {
221  // Type promotion to handle the case where the types of the lhs and the rhs are different.
222  typedef typename promote_storage_type<typename InputXprType::Scalar,
223  typename KernelXprType::Scalar>::ret Scalar;
224  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
225  typename traits<KernelXprType>::StorageKind>::ret StorageKind;
226  typedef typename promote_index_type<typename traits<InputXprType>::Index,
227  typename traits<KernelXprType>::Index>::type Index;
228  typedef typename InputXprType::Nested LhsNested;
229  typedef typename KernelXprType::Nested RhsNested;
230  typedef typename remove_reference<LhsNested>::type _LhsNested;
231  typedef typename remove_reference<RhsNested>::type _RhsNested;
232  static const int NumDimensions = traits<InputXprType>::NumDimensions;
233  static const int Layout = traits<InputXprType>::Layout;
234 
235  enum {
236  Flags = 0
237  };
238 };
239 
240 template<typename Dimensions, typename InputXprType, typename KernelXprType>
241 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
242 {
243  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
244 };
245 
246 template<typename Dimensions, typename InputXprType, typename KernelXprType>
247 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
248 {
249  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
250 };
251 
252 } // end namespace internal
253 
254 
255 
256 template<typename Indices, typename InputXprType, typename KernelXprType>
257 class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
258 {
259  public:
260  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
261  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
262  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
263  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
264  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
265  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
266  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
267 
268  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
269  : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
270 
271  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
272  const Indices& indices() const { return m_indices; }
273 
274  /** \returns the nested expressions */
275  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
276  const typename internal::remove_all<typename InputXprType::Nested>::type&
277  inputExpression() const { return m_input_xpr; }
278 
279  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
280  const typename internal::remove_all<typename KernelXprType::Nested>::type&
281  kernelExpression() const { return m_kernel_xpr; }
282 
283  protected:
284  typename InputXprType::Nested m_input_xpr;
285  typename KernelXprType::Nested m_kernel_xpr;
286  const Indices m_indices;
287 };
288 
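// A minimal usage sketch (not part of the original header; follows the Eigen
// Tensor module documentation): TensorConvolutionOp nodes are normally built
// through TensorBase::convolve.  With a rank-4 float input and a 2x2 kernel
// convolved over dimensions 1 and 2, each convolved dimension shrinks by
// kernel_dim - 1:
//
//   Eigen::Tensor<float, 4> input(3, 10, 10, 7);
//   Eigen::Tensor<float, 2> kernel(2, 2);
//   input.setRandom();
//   kernel.setRandom();
//   Eigen::array<ptrdiff_t, 2> dims({1, 2});
//   Eigen::Tensor<float, 4> output = input.convolve(kernel, dims);  // 3x9x9x7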
289 
290 template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
291 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
292 {
293  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
294 
295  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
296  static const int NumKernelDims = internal::array_size<Indices>::value;
297  typedef typename XprType::Index Index;
298  typedef DSizes<Index, NumDims> Dimensions;
299 
300  typedef typename XprType::Scalar Scalar;
301  typedef typename XprType::CoeffReturnType CoeffReturnType;
302  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
303  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
304 
305  enum {
306  IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
307  PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
308  Layout = TensorEvaluator<InputArgType, Device>::Layout,
309  CoordAccess = false, // to be implemented
310  RawAccess = false
311  };
312 
313  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
314  : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
315  {
316  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
317 
318  const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
319  const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
320 
321  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
322  m_inputStride[0] = 1;
323  for (int i = 1; i < NumDims; ++i) {
324  m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
325  }
326  } else {
327  m_inputStride[NumDims - 1] = 1;
328  for (int i = NumDims - 2; i >= 0; --i) {
329  m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
330  }
331  }
332 
333  m_dimensions = m_inputImpl.dimensions();
334  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
335  for (int i = 0; i < NumKernelDims; ++i) {
336  const Index index = op.indices()[i];
337  const Index input_dim = input_dims[index];
338  const Index kernel_dim = kernel_dims[i];
339  const Index result_dim = input_dim - kernel_dim + 1;
340  m_dimensions[index] = result_dim;
341  if (i > 0) {
342  m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
343  } else {
344  m_kernelStride[0] = 1;
345  }
346  m_indexStride[i] = m_inputStride[index];
347  }
348 
349  m_outputStride[0] = 1;
350  for (int i = 1; i < NumDims; ++i) {
351  m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
352  }
353  } else {
354  for (int i = NumKernelDims - 1; i >= 0; --i) {
355  const Index index = op.indices()[i];
356  const Index input_dim = input_dims[index];
357  const Index kernel_dim = kernel_dims[i];
358  const Index result_dim = input_dim - kernel_dim + 1;
359  m_dimensions[index] = result_dim;
360  if (i < NumKernelDims - 1) {
361  m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
362  } else {
363  m_kernelStride[NumKernelDims - 1] = 1;
364  }
365  m_indexStride[i] = m_inputStride[index];
366  }
367 
368  m_outputStride[NumDims - 1] = 1;
369  for (int i = NumDims - 2; i >= 0; --i) {
370  m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
371  }
372  }
373  }
374 
375  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
376 
377  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
378  m_inputImpl.evalSubExprsIfNeeded(NULL);
379  preloadKernel();
380  return true;
381  }
382  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
383  m_inputImpl.cleanup();
384  if (m_local_kernel) {
385  m_device.deallocate((void*)m_kernel);
386  m_local_kernel = false;
387  }
388  m_kernel = NULL;
389  }
390 
391  void evalTo(typename XprType::Scalar* buffer) {
392  evalSubExprsIfNeeded(NULL);
393  for (int i = 0; i < dimensions().TotalSize(); ++i) {
394  buffer[i] += coeff(i);
395  }
396  cleanup();
397  }
398 
399  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
400  {
401  CoeffReturnType result = CoeffReturnType(0);
402  convolve(firstInput(index), 0, NumKernelDims-1, result);
403  return result;
404  }
405 
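// What coeff() computes, spelled out for a single convolved dimension of
// kernel size K (a sketch; the kernel is applied without flipping):
//
//   output(i) = sum_{j = 0}^{K - 1} input(i + j) * kernel(j)
//
// firstInput(index) locates the flat input offset of the window origin for
// output coordinate `index`, and convolve() recurses over the kernel
// dimensions, accumulating input * kernel products into the result.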
406  template<int LoadMode>
407  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
408  {
409  Index indices[2] = {index, index+PacketSize-1};
410  Index startInputs[2] = {0, 0};
411  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
412  for (int i = NumDims - 1; i > 0; --i) {
413  const Index idx0 = indices[0] / m_outputStride[i];
414  const Index idx1 = indices[1] / m_outputStride[i];
415  startInputs[0] += idx0 * m_inputStride[i];
416  startInputs[1] += idx1 * m_inputStride[i];
417  indices[0] -= idx0 * m_outputStride[i];
418  indices[1] -= idx1 * m_outputStride[i];
419  }
420  } else {
421  for (int i = 0; i < NumDims - 1; ++i) {
422  const Index idx0 = indices[0] / m_outputStride[i];
423  const Index idx1 = indices[1] / m_outputStride[i];
424  startInputs[0] += idx0 * m_inputStride[i];
425  startInputs[1] += idx1 * m_inputStride[i];
426  indices[0] -= idx0 * m_outputStride[i];
427  indices[1] -= idx1 * m_outputStride[i];
428  }
429  }
430  startInputs[0] += indices[0];
431  startInputs[1] += indices[1];
432 
433  if (startInputs[1]-startInputs[0] == PacketSize-1) {
434  PacketReturnType result = internal::pset1<PacketReturnType>(0);
435  convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
436  return result;
437  } else {
438  EIGEN_ALIGN_MAX Scalar data[PacketSize];
439  data[0] = Scalar(0);
440  convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
441  for (int i = 1; i < PacketSize-1; ++i) {
442  data[i] = Scalar(0);
443  convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
444  }
445  data[PacketSize-1] = Scalar(0);
446  convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
447  return internal::pload<PacketReturnType>(data);
448  }
449  }
450 
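// Note on the vectorized path above: convolvePacket() is only used when the
// PacketSize consecutive output coefficients map to PacketSize consecutive
// input offsets, i.e. startInputs[1] - startInputs[0] == PacketSize - 1.
// Otherwise each lane is convolved separately into an aligned stack buffer
// and the packet is assembled with pload.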
451  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
452  costPerCoeff(bool vectorized) const {
453  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
454  // We ignore the use of fused multiply-add.
455  const double convolve_compute_cost =
456  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
457  const double firstIndex_compute_cost =
458  NumDims *
459  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
460  TensorOpCost::DivCost<Index>());
461  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
462  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
463  m_kernelImpl.costPerCoeff(vectorized) +
464  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
465  PacketSize));
466  }
467 
468  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
469 
470  private:
471  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
472  Index startInput = 0;
473  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
474  for (int i = NumDims - 1; i > 0; --i) {
475  const Index idx = index / m_outputStride[i];
476  startInput += idx * m_inputStride[i];
477  index -= idx * m_outputStride[i];
478  }
479  } else {
480  for (int i = 0; i < NumDims - 1; ++i) {
481  const Index idx = index / m_outputStride[i];
482  startInput += idx * m_inputStride[i];
483  index -= idx * m_outputStride[i];
484  }
485  }
486  startInput += index;
487  return startInput;
488  }
489 
490  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
491  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
492  const Index input = firstIndex + j * m_indexStride[DimIndex];
493  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
494  if (DimIndex > 0) {
495  convolve(input, kernel, DimIndex-1, accum);
496  } else {
497  accum += m_inputImpl.coeff(input) * m_kernel[kernel];
498  }
499  }
500  }
501 
502  template <typename Packet>
503  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
504  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
505  const Index input = firstIndex + j * m_indexStride[DimIndex];
506  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
507  if (DimIndex > 0) {
508  convolvePacket(input, kernel, DimIndex-1, accum);
509  } else {
510  accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
511  }
512  }
513  }
514 
515  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
516  // Don't make a local copy of the kernel unless we have to (i.e. it's an
517  // expression that needs to be evaluated)
518  const Scalar* in_place = m_kernelImpl.data();
519  if (in_place) {
520  m_kernel = in_place;
521  m_local_kernel = false;
522  } else {
523  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
524  Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
525  typedef TensorEvalToOp<const KernelArgType> EvalTo;
526  EvalTo evalToTmp(local, m_kernelArg);
527  const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
528  internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
529 
530  m_kernel = local;
531  m_local_kernel = true;
532  }
533  }
534 
535  array<Index, NumDims> m_inputStride;
536  array<Index, NumDims> m_outputStride;
537 
538  array<Index, NumKernelDims> m_indexStride;
539  array<Index, NumKernelDims> m_kernelStride;
540  TensorEvaluator<InputArgType, Device> m_inputImpl;
541  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
542  Dimensions m_dimensions;
543 
544  KernelArgType m_kernelArg;
545  const Scalar* m_kernel;
546  bool m_local_kernel;
547  const Device& m_device;
548 };
549 
550 
551 
552 
553 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
554 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
555 
556 template <int StaticKernelSize>
557 struct GetKernelSize {
558  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
559  return StaticKernelSize;
560  }
561 };
562 template <>
563 struct GetKernelSize<Dynamic> {
564  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
565  return kernelSize;
566  }
567 };
568 
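// GetKernelSize lets the loops below be fully unrolled when the kernel size
// is known at compile time while still supporting runtime sizes through the
// Dynamic specialization, e.g.
//   GetKernelSize<4>()(7)       == 4   // the static size wins
//   GetKernelSize<Dynamic>()(7) == 7   // falls back to the runtime argument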
569 template <typename InputEvaluator, typename Index, typename InputDims,
570  int StaticKernelSize>
571 __global__ void EigenConvolutionKernel1D(
572  InputEvaluator eval,
573  const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
574  indexMapper,
575  const float* __restrict kernel, const int numPlanes, const int numX,
576  const int maxX, const int kernelSize, float* buffer) {
577  extern __shared__ float s[];
578 
579  const int first_x = blockIdx.x * maxX;
580  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
581  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
582  const int num_x_output = last_x - first_x + 1;
583 
584  const int first_plane = blockIdx.y * blockDim.y;
585  const int plane_stride = blockDim.y * gridDim.y;
586 
587  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
588  // Load inputs to shared memory
589  const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
590  const int plane_kernel_offset = threadIdx.y * num_x_input;
591  #pragma unroll
592  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
593  const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
594  s[i + plane_kernel_offset] = eval.coeff(tensor_index);
595  }
596 
597  __syncthreads();
598 
599  // Compute the convolution
600  const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
601 
602  #pragma unroll
603  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
604  const int kernel_offset = plane_kernel_offset + i;
605  float result = 0.0f;
606  #pragma unroll
607  for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
608  result += s[k + kernel_offset] * kernel[k];
609  }
610  const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
611  buffer[tensor_index] = result;
612  }
613  __syncthreads();
614  }
615 };
616 
617 template <typename InputEvaluator, typename Index, typename InputDims,
618  int StaticKernelSizeX, int StaticKernelSizeY>
619 __global__ void EigenConvolutionKernel2D(
620  InputEvaluator eval,
621  const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
622  indexMapper,
623  const float* __restrict kernel, const int numPlanes, const int numX,
624  const int maxX, const int numY, const int maxY, const int kernelSizeX,
625  const int kernelSizeY, float* buffer) {
626  extern __shared__ float s[];
627 
628  const int first_x = blockIdx.x * maxX;
629  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
630  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
631  const int num_x_output = last_x - first_x + 1;
632 
633  const int first_y = blockIdx.y * maxY;
634  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
635  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
636  const int num_y_output = last_y - first_y + 1;
637 
638  const int first_plane = blockIdx.z * blockDim.z;
639  const int plane_stride = blockDim.z * gridDim.z;
640 
641  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
642 
643  const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
644  const int plane_kernel_offset = threadIdx.z * num_y_input;
645 
646  // Load inputs to shared memory
647  #pragma unroll
648  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
649  const int input_offset = num_x_input * (j + plane_kernel_offset);
650  #pragma unroll
651  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
652  const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
653  s[i + input_offset] = eval.coeff(tensor_index);
654  }
655  }
656 
657  __syncthreads();
658 
659  // Convolution
660  const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
661 
662  #pragma unroll
663  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
664  #pragma unroll
665  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
666  float result = 0.0f;
667  #pragma unroll
668  for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
669  const int kernel_offset = kernelSizeX * l;
670  const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
671  #pragma unroll
672  for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
673  result += s[k + input_offset] * kernel[k + kernel_offset];
674  }
675  }
676  const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
677  buffer[tensor_index] = result;
678  }
679  }
680 
681  __syncthreads();
682  }
683 };
684 
685 template <typename InputEvaluator, typename Index, typename InputDims>
686 __global__ void EigenConvolutionKernel3D(
687  InputEvaluator eval,
688  const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
689  indexMapper,
690  const float* __restrict kernel, const size_t numPlanes, const size_t numX,
691  const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
692  const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
693  const size_t kernelSizeZ, float* buffer) {
694  extern __shared__ float s[];
695 
696  // Load inputs to shared memory
697  const int first_x = blockIdx.x * maxX;
698  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
699  const int num_x_input = last_x - first_x + kernelSizeX;
700 
701  const int first_y = blockIdx.y * maxY;
702  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
703  const int num_y_input = last_y - first_y + kernelSizeY;
704 
705  const int first_z = blockIdx.z * maxZ;
706  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
707  const int num_z_input = last_z - first_z + kernelSizeZ;
708 
709  for (int p = 0; p < numPlanes; ++p) {
710 
711  const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
712  const int plane_kernel_offset = 0;
713 
714  for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
715  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
716  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
717  const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
718  s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
719  }
720  }
721  }
722 
723  __syncthreads();
724 
725  // Convolution
726  const int num_z_output = last_z - first_z + 1;
727  const int num_y_output = last_y - first_y + 1;
728  const int num_x_output = last_x - first_x + 1;
729  const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
730 
731  for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
732  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
733  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
734  float result = 0.0f;
735  for (int n = 0; n < kernelSizeZ; ++n) {
736  for (int m = 0; m < kernelSizeY; ++m) {
737  for (int l = 0; l < kernelSizeX; ++l) {
738  result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
739  }
740  }
741  }
742  const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
743  buffer[tensor_index] = result;
744  }
745  }
746  }
747  __syncthreads();
748  }
749 };
750 
751 
752 
753 template<typename Indices, typename InputArgType, typename KernelArgType>
754 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
755 {
756  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
757 
758  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
759  static const int NumKernelDims = internal::array_size<Indices>::value;
760  typedef typename XprType::Index Index;
761  typedef DSizes<Index, NumDims> Dimensions;
762  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
763 
764  enum {
765  IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned,
766  PacketAccess = false,
767  Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
768  CoordAccess = false, // to be implemented
769  RawAccess = false
770  };
771 
772  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
773  : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
774  {
775  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
776 
777  const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
778  const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
779 
780  m_dimensions = m_inputImpl.dimensions();
781  for (int i = 0; i < NumKernelDims; ++i) {
782  const Index index = op.indices()[i];
783  const Index input_dim = input_dims[index];
784  const Index kernel_dim = kernel_dims[i];
785  const Index result_dim = input_dim - kernel_dim + 1;
786  m_dimensions[index] = result_dim;
787  }
788  }
789 
790  typedef typename XprType::CoeffReturnType CoeffReturnType;
791  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
792  typedef typename InputArgType::Scalar Scalar;
793  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
794 
795  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
796 
797  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
798  preloadKernel();
799  m_inputImpl.evalSubExprsIfNeeded(NULL);
800  if (data) {
801  executeEval(data);
802  return false;
803  } else {
804  m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
805  executeEval(m_buf);
806  return true;
807  }
808  }
809 
810  EIGEN_STRONG_INLINE void cleanup() {
811  m_inputImpl.cleanup();
812  if (m_buf) {
813  m_device.deallocate(m_buf);
814  m_buf = NULL;
815  }
816  if (m_local_kernel) {
817  m_device.deallocate((void*)m_kernel);
818  m_local_kernel = false;
819  }
820  m_kernel = NULL;
821  }
822 
823  EIGEN_STRONG_INLINE void preloadKernel() {
824  // Don't make a local copy of the kernel unless we have to (i.e. it's an
825  // expression that needs to be evaluated)
826  const Scalar* in_place = m_kernelImpl.data();
827  if (in_place) {
828  m_kernel = in_place;
829  m_local_kernel = false;
830  } else {
831  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
832  Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
833  typedef TensorEvalToOp<const KernelArgType> EvalTo;
834  EvalTo evalToTmp(local, m_kernelArg);
835  const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
836  internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
837 
838  m_kernel = local;
839  m_local_kernel = true;
840  }
841  }
842 
843  static unsigned int ceil(unsigned int num, unsigned int denom) {
844  const unsigned int rounded_toward_zero = num / denom;
845  if (num > rounded_toward_zero * denom) {
846  return rounded_toward_zero + 1;
847  }
848  return rounded_toward_zero;
849  }
850 
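// ceil() above is integer division rounded up, used to size the CUDA launch
// grid, e.g. ceil(10, 4) == 3 (three blocks of four cover ten elements) and
// ceil(8, 4) == 2.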
851  void executeEval(Scalar* data) const {
852  typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
853 
854  const int maxSharedMem = m_device.sharedMemPerBlock();
855  const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
856  const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
857  const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
858  const int warpSize = 32;
859 
860  switch (NumKernelDims) {
861  case 1: {
862  const int kernel_size = m_kernelImpl.dimensions().TotalSize();
863 
864  const int numX = dimensions()[m_indices[0]];
865  const int numP = dimensions().TotalSize() / numX;
866  int maxX;
867  dim3 block_size;
868 
869  const int single_stride_dim =
870  static_cast<int>(Layout) == static_cast<int>(ColMajor)
871  ? 0
872  : m_inputImpl.dimensions().rank() - 1;
873  if (m_indices[0] == single_stride_dim) {
874  // Maximize the data reuse
875  const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
876  maxX = numext::mini<int>(inner_dim, numX);
877  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
878  block_size.x = numext::mini(maxThreadsPerBlock, maxX);
879  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
880  }
881  else {
882  // Read as much as possible along the innermost dimension, that is the plane
883  const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
884  const int maxP = numext::mini<int>(inner_dim, numP);
885  maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
886 
887  block_size.x = numext::mini(warpSize, maxX);
888  block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
889  }
890 
891  const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
892  assert(shared_mem <= maxSharedMem);
893 
894  const int num_x_blocks = ceil(numX, maxX);
895  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
896  const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
897 
898  dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
899 
900 
901  //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
902 
903  const array<Index, 1> indices(m_indices[0]);
904  const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
905  internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
906  m_inputImpl.dimensions(), kernel_dims, indices);
907  switch(kernel_size) {
908  case 4: {
909  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
910  break;
911  }
912  case 7: {
913  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
914  break;
915  }
916  default: {
917  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
918  }
919  }
920  break;
921  }
922 
923  case 2: {
924  const int idxX =
925  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
926  const int idxY =
927  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
928  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
929  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
930 
931  const int numX = dimensions()[m_indices[idxX]];
932  const int numY = dimensions()[m_indices[idxY]];
933  const int numP = dimensions().TotalSize() / (numX*numY);
934 
935  const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
936 
937  // Snap maxX to warp size
938  int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
939  const int maxX = numext::mini<int>(inner_dim, numX);
940  const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
941  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
942 
943  dim3 block_size;
944  block_size.x = numext::mini(1024, maxX);
945  block_size.y = numext::mini<int>(1024/block_size.x, maxY);
946  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
947 
948  const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
949  assert(shared_mem <= maxSharedMem);
950 
951  const int num_x_blocks = ceil(numX, maxX);
952  const int num_y_blocks = ceil(numY, maxY);
953  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
954  const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
955 
956  dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
957 
958 
959  //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
960 
961  const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
962  const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
963  m_kernelImpl.dimensions()[idxY]);
964  internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
965  m_inputImpl.dimensions(), kernel_dims, indices);
966  switch (kernel_size_x) {
967  case 4: {
968  switch (kernel_size_y) {
969  case 7: {
970  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
971  break;
972  }
973  default: {
974  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
975  break;
976  }
977  }
978  break;
979  }
980  case 7: {
981  switch (kernel_size_y) {
982  case 4: {
983  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
984  break;
985  }
986  default: {
987  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
988  break;
989  }
990  }
991  break;
992  }
993  default: {
994  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
995  break;
996  }
997  }
998  break;
999  }
1000 
1001  case 3: {
1002  const int idxX =
1003  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1004  const int idxY =
1005  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1006  const int idxZ =
1007  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1008 
1009  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1010  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1011  const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1012 
1013  const int numX = dimensions()[m_indices[idxX]];
1014  const int numY = dimensions()[m_indices[idxY]];
1015  const int numZ = dimensions()[m_indices[idxZ]];
1016  const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1017 
1018  const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1019  const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1020  const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1021 
1022  dim3 block_size;
1023  block_size.x = numext::mini(32, maxX);
1024  block_size.y = numext::mini(32, maxY);
1025  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1026  dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1027 
1028  const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1029  assert(shared_mem <= maxSharedMem);
1030 
1031  //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1032  const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1033  m_indices[idxZ]);
1034  const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1035  m_kernelImpl.dimensions()[idxY],
1036  m_kernelImpl.dimensions()[idxZ]);
1037  internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
1038  m_inputImpl.dimensions(), kernel_dims, indices);
1039 
1040  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1041  break;
1042  }
1043 
1044  default: {
1045  EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1046  }
1047  }
1048  }
1049 
1050  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1051  {
1052  eigen_assert(m_buf);
1053  eigen_assert(index < m_dimensions.TotalSize());
1054  return m_buf[index];
1055  }
1056 
1057  template<int LoadMode>
1058  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
1059  {
1060  eigen_assert(m_buf);
1061  eigen_assert(index < m_dimensions.TotalSize());
1062  return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1063  }
1064 
1065  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
1066  costPerCoeff(bool vectorized) const {
1067  // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1068  // model.
1069  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1070  // We ignore the use of fused multiply-add.
1071  const double convolve_compute_cost =
1072  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1073  const double firstIndex_compute_cost =
1074  NumDims *
1075  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1076  TensorOpCost::DivCost<Index>());
1077  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1078  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1079  m_kernelImpl.costPerCoeff(vectorized) +
1080  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1081  PacketSize));
1082  }
1083 
1084  private:
1085  // No assignment (copies are needed by the kernels)
1086  TensorEvaluator& operator = (const TensorEvaluator&);
1087 
1088  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1089  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1090  KernelArgType m_kernelArg;
1091  Indices m_indices;
1092  Dimensions m_dimensions;
1093  Scalar* m_buf;
1094  const Scalar* m_kernel;
1095  bool m_local_kernel;
1096 
1097  const GpuDevice& m_device;
1098 };
1099 #endif
1100 
1101 
1102 } // end namespace Eigen
1103 
1104 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H