TensorConvolution.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12 
13 namespace Eigen {
14 
15 /** \class TensorConvolution
16  * \ingroup CXX11_Tensor_Module
17  *
18  * \brief Tensor convolution class.
19  *
20  *
21  */
22 namespace internal {
23 
24 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
25 class IndexMapper {
26  public:
27  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
28  const array<Index, NumKernelDims>& indices) {
29 
30  array<Index, NumDims> dimensions = input_dims;
31  for (int i = 0; i < NumKernelDims; ++i) {
32  const Index index = indices[i];
33  const Index input_dim = input_dims[index];
34  const Index kernel_dim = kernel_dims[i];
35  const Index result_dim = input_dim - kernel_dim + 1;
36  dimensions[index] = result_dim;
37  }
38 
39  array<Index, NumDims> inputStrides;
40  array<Index, NumDims> outputStrides;
41  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
42  inputStrides[0] = 1;
43  outputStrides[0] = 1;
44  for (int i = 1; i < NumDims; ++i) {
45  inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
46  outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
47  }
48  } else {
49  inputStrides[NumDims - 1] = 1;
50  outputStrides[NumDims - 1] = 1;
51  for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
52  inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
53  outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
54  }
55  }
56 
57  array<Index, NumDims> cudaInputDimensions;
58  array<Index, NumDims> cudaOutputDimensions;
59  array<Index, NumDims> tmp = dimensions;
60  array<Index, NumDims> ordering;
61  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
62  ? 0
63  : NumDims - NumKernelDims;
64  for (int i = 0; i < NumKernelDims; ++i) {
65  const Index index = i + offset;
66  ordering[index] = indices[i];
67  tmp[indices[i]] = -1;
68  cudaInputDimensions[index] = input_dims[indices[i]];
69  cudaOutputDimensions[index] = dimensions[indices[i]];
70  }
71 
72  int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
73  ? NumKernelDims
74  : 0;
75  for (int i = 0; i < NumDims; ++i) {
76  if (tmp[i] >= 0) {
77  ordering[written] = i;
78  cudaInputDimensions[written] = input_dims[i];
79  cudaOutputDimensions[written] = dimensions[i];
80  ++written;
81  }
82  }
83 
84  for (int i = 0; i < NumDims; ++i) {
85  m_inputStrides[i] = inputStrides[ordering[i]];
86  m_outputStrides[i] = outputStrides[ordering[i]];
87  }
88 
89  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
90  for (int i = 0; i < NumDims; ++i) {
91  if (i > NumKernelDims) {
92  m_cudaInputStrides[i] =
93  m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
94  m_cudaOutputStrides[i] =
95  m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
96  } else {
97  m_cudaInputStrides[i] = 1;
98  m_cudaOutputStrides[i] = 1;
99  }
100  }
101  } else {
102  for (int i = NumDims - 1; i >= 0; --i) {
103  if (i + 1 < offset) {
104  m_cudaInputStrides[i] =
105  m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
106  m_cudaOutputStrides[i] =
107  m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
108  } else {
109  m_cudaInputStrides[i] = 1;
110  m_cudaOutputStrides[i] = 1;
111  }
112  }
113  }
114  }
115 
116  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
117  Index inputIndex = 0;
118  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
119  for (int d = NumDims - 1; d > NumKernelDims; --d) {
120  const Index idx = p / m_cudaInputStrides[d];
121  inputIndex += idx * m_inputStrides[d];
122  p -= idx * m_cudaInputStrides[d];
123  }
124  inputIndex += p * m_inputStrides[NumKernelDims];
125  } else {
126  std::ptrdiff_t limit = 0;
127  if (NumKernelDims < NumDims) {
128  limit = NumDims - NumKernelDims - 1;
129  }
130  for (int d = 0; d < limit; ++d) {
131  const Index idx = p / m_cudaInputStrides[d];
132  inputIndex += idx * m_inputStrides[d];
133  p -= idx * m_cudaInputStrides[d];
134  }
135  inputIndex += p * m_inputStrides[limit];
136  }
137  return inputIndex;
138  }
139 
140  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
141  Index outputIndex = 0;
142  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
143  for (int d = NumDims - 1; d > NumKernelDims; --d) {
144  const Index idx = p / m_cudaOutputStrides[d];
145  outputIndex += idx * m_outputStrides[d];
146  p -= idx * m_cudaOutputStrides[d];
147  }
148  outputIndex += p * m_outputStrides[NumKernelDims];
149  } else {
150  std::ptrdiff_t limit = 0;
151  if (NumKernelDims < NumDims) {
152  limit = NumDims - NumKernelDims - 1;
153  }
154  for (int d = 0; d < limit; ++d) {
155  const Index idx = p / m_cudaOutputStrides[d];
156  outputIndex += idx * m_outputStrides[d];
157  p -= idx * m_cudaOutputStrides[d];
158  }
159  outputIndex += p * m_outputStrides[limit];
160  }
161  return outputIndex;
162  }
163 
164  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
165  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
166  ? 0
167  : NumDims - NumKernelDims;
168  return i * m_inputStrides[offset];
169  }
170 
171  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
172  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
173  ? 0
174  : NumDims - NumKernelDims;
175  return i * m_outputStrides[offset];
176  }
177 
178  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
179  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
180  ? 0
181  : NumDims - NumKernelDims;
182  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
183  }
184 
185  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
186  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
187  ? 0
188  : NumDims - NumKernelDims;
189  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
190  }
191 
192  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
193  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
194  ? 0
195  : NumDims - NumKernelDims;
196  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
197  k * m_inputStrides[offset + 2];
198  }
199 
200  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
201  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
202  ? 0
203  : NumDims - NumKernelDims;
204  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
205  k * m_outputStrides[offset + 2];
206  }
207 
208  private:
209  static const int NumDims = internal::array_size<InputDims>::value;
210  array<Index, NumDims> m_inputStrides;
211  array<Index, NumDims> m_outputStrides;
212  array<Index, NumDims> m_cudaInputStrides;
213  array<Index, NumDims> m_cudaOutputStrides;
214 };
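The IndexMapper above reorders the tensor dimensions so that the NumKernelDims convolved dimensions come first (ColMajor) or last (RowMajor), then translates the flat "plane"/"kernel" offsets used by the GPU kernels back into offsets in the original input and output tensors. The translation is plain stride arithmetic; a minimal standalone sketch of the idea (hypothetical helper, not part of Eigen):

#include <array>
#include <cstddef>

// Decompose a flat column-major index against one stride set and recombine it
// against another -- the same pattern IndexMapper uses in
// mapCudaInputPlaneToTensorInputOffset().
template <std::size_t N>
std::ptrdiff_t remapIndex(std::ptrdiff_t flat,
                          const std::array<std::ptrdiff_t, N>& srcStrides,   // strides of the flat index
                          const std::array<std::ptrdiff_t, N>& dstStrides) { // strides of the target tensor
  std::ptrdiff_t result = 0;
  for (std::size_t d = N - 1; d > 0; --d) {   // largest stride first
    const std::ptrdiff_t coord = flat / srcStrides[d];
    result += coord * dstStrides[d];
    flat -= coord * srcStrides[d];
  }
  return result + flat * dstStrides[0];       // innermost dimension has stride 1
}

In the class itself, m_cudaInputStrides/m_cudaOutputStrides play the role of the source strides and m_inputStrides/m_outputStrides the role of the destination strides.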
215 
216 
217 
218 template<typename Dimensions, typename InputXprType, typename KernelXprType>
219 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
220 {
221  // Type promotion to handle the case where the types of the lhs and the rhs are different.
222  typedef typename promote_storage_type<typename InputXprType::Scalar,
223  typename KernelXprType::Scalar>::ret Scalar;
224  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
225  typename traits<KernelXprType>::StorageKind>::ret StorageKind;
226  typedef typename promote_index_type<typename traits<InputXprType>::Index,
227  typename traits<KernelXprType>::Index>::type Index;
228  typedef typename InputXprType::Nested LhsNested;
229  typedef typename KernelXprType::Nested RhsNested;
230  typedef typename remove_reference<LhsNested>::type _LhsNested;
231  typedef typename remove_reference<RhsNested>::type _RhsNested;
232  static const int NumDimensions = traits<InputXprType>::NumDimensions;
233  static const int Layout = traits<InputXprType>::Layout;
234 
235  enum {
236  Flags = 0
237  };
238 };
239 
240 template<typename Dimensions, typename InputXprType, typename KernelXprType>
241 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
242 {
243  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
244 };
245 
246 template<typename Dimensions, typename InputXprType, typename KernelXprType>
247 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
248 {
249  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
250 };
251 
252 } // end namespace internal
253 
254 
255 
256 template<typename Indices, typename InputXprType, typename KernelXprType>
257 class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
258 {
259  public:
260  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
261  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
262  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
263  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
264  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
265  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
266  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
267 
268  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
269  : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
270 
271  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
272  const Indices& indices() const { return m_indices; }
273 
274  /** \returns the nested expressions */
275  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
276  const typename internal::remove_all<typename InputXprType::Nested>::type&
277  inputExpression() const { return m_input_xpr; }
278 
279  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
280  const typename internal::remove_all<typename KernelXprType::Nested>::type&
281  kernelExpression() const { return m_kernel_xpr; }
282 
283  protected:
284  typename InputXprType::Nested m_input_xpr;
285  typename KernelXprType::Nested m_kernel_xpr;
286  const Indices m_indices;
287 };
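TensorConvolutionOp is rarely constructed directly; it is the expression type returned by TensorBase::convolve(). A minimal usage sketch, assuming the unsupported CXX11 Tensor module is available (sizes are illustrative):

#include <cstddef>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> input(20, 30);
  Eigen::Tensor<float, 2> kernel(3, 3);
  input.setRandom();
  kernel.setRandom();

  // Convolve over both dimensions. Each convolved dimension shrinks to
  // input_dim - kernel_dim + 1, so the result is 18 x 28.
  Eigen::array<ptrdiff_t, 2> dims;
  dims[0] = 0;
  dims[1] = 1;
  Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
  return 0;
}

The assignment to output evaluates the expression through the TensorEvaluator specialization defined below.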
288 
289 
290 template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
291 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
292 {
293  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
294 
295  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
296  static const int NumKernelDims = internal::array_size<Indices>::value;
297  typedef typename XprType::Index Index;
298  typedef DSizes<Index, NumDims> Dimensions;
299 
300  typedef typename XprType::Scalar Scalar;
301  typedef typename XprType::CoeffReturnType CoeffReturnType;
302  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
303  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
304 
305  enum {
306  IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
307  PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
308  Layout = TensorEvaluator<InputArgType, Device>::Layout,
309  CoordAccess = false, // to be implemented
310  RawAccess = false
311  };
312 
313  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
314  : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
315  {
316  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
317 
318  const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
319  const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
320 
321  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
322  m_inputStride[0] = 1;
323  for (int i = 1; i < NumDims; ++i) {
324  m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
325  }
326  } else {
327  m_inputStride[NumDims - 1] = 1;
328  for (int i = NumDims - 2; i >= 0; --i) {
329  m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
330  }
331  }
332 
333  m_dimensions = m_inputImpl.dimensions();
334  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
335  for (int i = 0; i < NumKernelDims; ++i) {
336  const Index index = op.indices()[i];
337  const Index input_dim = input_dims[index];
338  const Index kernel_dim = kernel_dims[i];
339  const Index result_dim = input_dim - kernel_dim + 1;
340  m_dimensions[index] = result_dim;
341  if (i > 0) {
342  m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
343  } else {
344  m_kernelStride[0] = 1;
345  }
346  m_indexStride[i] = m_inputStride[index];
347  }
348 
349  m_outputStride[0] = 1;
350  for (int i = 1; i < NumDims; ++i) {
351  m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
352  }
353  } else {
354  for (int i = NumKernelDims - 1; i >= 0; --i) {
355  const Index index = op.indices()[i];
356  const Index input_dim = input_dims[index];
357  const Index kernel_dim = kernel_dims[i];
358  const Index result_dim = input_dim - kernel_dim + 1;
359  m_dimensions[index] = result_dim;
360  if (i < NumKernelDims - 1) {
361  m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
362  } else {
363  m_kernelStride[NumKernelDims - 1] = 1;
364  }
365  m_indexStride[i] = m_inputStride[index];
366  }
367 
368  m_outputStride[NumDims - 1] = 1;
369  for (int i = NumDims - 2; i >= 0; --i) {
370  m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
371  }
372  }
373  }
374 
375  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
376 
377  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
378  m_inputImpl.evalSubExprsIfNeeded(NULL);
379  preloadKernel();
380  return true;
381  }
382  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
383  m_inputImpl.cleanup();
384  if (m_local_kernel) {
385  m_device.deallocate((void*)m_kernel);
386  m_local_kernel = false;
387  }
388  m_kernel = NULL;
389  }
390 
391  void evalTo(typename XprType::Scalar* buffer) {
392  evalSubExprsIfNeeded(NULL);
393  for (int i = 0; i < dimensions().TotalSize(); ++i) {
394  buffer[i] += coeff(i);
395  }
396  cleanup();
397  }
398 
399  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
400  {
401  CoeffReturnType result = CoeffReturnType(0);
402  convolve(firstInput(index), 0, NumKernelDims-1, result);
403  return result;
404  }
405 
406  template<int LoadMode>
407  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
408  {
409  Index indices[2] = {index, index+PacketSize-1};
410  Index startInputs[2] = {0, 0};
411  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
412  for (int i = NumDims - 1; i > 0; --i) {
413  const Index idx0 = indices[0] / m_outputStride[i];
414  const Index idx1 = indices[1] / m_outputStride[i];
415  startInputs[0] += idx0 * m_inputStride[i];
416  startInputs[1] += idx1 * m_inputStride[i];
417  indices[0] -= idx0 * m_outputStride[i];
418  indices[1] -= idx1 * m_outputStride[i];
419  }
420  } else {
421  for (int i = 0; i < NumDims - 1; ++i) {
422  const Index idx0 = indices[0] / m_outputStride[i];
423  const Index idx1 = indices[1] / m_outputStride[i];
424  startInputs[0] += idx0 * m_inputStride[i];
425  startInputs[1] += idx1 * m_inputStride[i];
426  indices[0] -= idx0 * m_outputStride[i];
427  indices[1] -= idx1 * m_outputStride[i];
428  }
429  }
430  startInputs[0] += indices[0];
431  startInputs[1] += indices[1];
432 
433  if (startInputs[1]-startInputs[0] == PacketSize-1) {
434  PacketReturnType result = internal::pset1<PacketReturnType>(0);
435  convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
436  return result;
437  } else {
438  EIGEN_ALIGN_MAX Scalar data[PacketSize];
439  data[0] = Scalar(0);
440  convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
441  for (int i = 1; i < PacketSize-1; ++i) {
442  data[i] = Scalar(0);
443  convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
444  }
445  data[PacketSize-1] = Scalar(0);
446  convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
447  return internal::pload<PacketReturnType>(data);
448  }
449  }
450 
451  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
452  costPerCoeff(bool vectorized) const {
453  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
454  // We ignore the use of fused multiply-add.
455  const double convolve_compute_cost =
456  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
457  const double firstIndex_compute_cost =
458  NumDims *
459  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
460  TensorOpCost::DivCost<Index>());
461  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
462  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
463  m_kernelImpl.costPerCoeff(vectorized) +
464  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
465  PacketSize));
466  }
467 
468  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
469 
470  private:
471  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
472  Index startInput = 0;
473  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
474  for (int i = NumDims - 1; i > 0; --i) {
475  const Index idx = index / m_outputStride[i];
476  startInput += idx * m_inputStride[i];
477  index -= idx * m_outputStride[i];
478  }
479  } else {
480  for (int i = 0; i < NumDims - 1; ++i) {
481  const Index idx = index / m_outputStride[i];
482  startInput += idx * m_inputStride[i];
483  index -= idx * m_outputStride[i];
484  }
485  }
486  startInput += index;
487  return startInput;
488  }
489 
490  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
491  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
492  const Index input = firstIndex + j * m_indexStride[DimIndex];
493  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
494  if (DimIndex > 0) {
495  convolve(input, kernel, DimIndex-1, accum);
496  } else {
497  accum += m_inputImpl.coeff(input) * m_kernel[kernel];
498  }
499  }
500  }
501 
502  template <typename Packet>
503  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
504  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
505  const Index input = firstIndex + j * m_indexStride[DimIndex];
506  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
507  if (DimIndex > 0) {
508  convolvePacket(input, kernel, DimIndex-1, accum);
509  } else {
510  accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
511  }
512  }
513  }
514 
515  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
516  // Don't make a local copy of the kernel unless we have to (i.e. it's an
517  // expression that needs to be evaluated)
518  const Scalar* in_place = m_kernelImpl.data();
519  if (in_place) {
520  m_kernel = in_place;
521  m_local_kernel = false;
522  } else {
523  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
524  Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
525  typedef TensorEvalToOp<const KernelArgType> EvalTo;
526  EvalTo evalToTmp(local, m_kernelArg);
527  const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
528  internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
529 
530  m_kernel = local;
531  m_local_kernel = true;
532  }
533  }
534 
535  array<Index, NumDims> m_inputStride;
536  array<Index, NumDims> m_outputStride;
537 
538  array<Index, NumKernelDims> m_indexStride;
539  array<Index, NumKernelDims> m_kernelStride;
540  TensorEvaluator<InputArgType, Device> m_inputImpl;
541  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
542  Dimensions m_dimensions;
543 
544  KernelArgType m_kernelArg;
545  const Scalar* m_kernel;
546  bool m_local_kernel;
547  const Device& m_device;
548 };
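On the CPU path each output coefficient is produced by convolve(), which recurses over the kernel dimensions and accumulates input * kernel products; every convolved output dimension has extent input_dim - kernel_dim + 1 (a "valid" convolution, no padding). A small standalone 1-D equivalent, for illustration only:

#include <cstddef>
#include <vector>

// "Valid" 1-D convolution: out[i] = sum_j in[i + j] * k[j],
// with out.size() == in.size() - k.size() + 1.
std::vector<float> convolve1D(const std::vector<float>& in,
                              const std::vector<float>& k) {
  std::vector<float> out(in.size() - k.size() + 1, 0.0f);
  for (std::size_t i = 0; i < out.size(); ++i)
    for (std::size_t j = 0; j < k.size(); ++j)
      out[i] += in[i + j] * k[j];
  return out;
}

Note that, like the evaluator above, this does not reverse the kernel; it computes a correlation in the signal-processing sense.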
549 
550 
551 
552 
553 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
554 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
555 
556 template <int StaticKernelSize>
557 struct GetKernelSize {
558  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
559  return StaticKernelSize;
560  }
561 };
562 template <>
563 struct GetKernelSize<Dynamic> {
564  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
565  return kernelSize;
566  }
567 };
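GetKernelSize lets the CUDA kernels below be instantiated either with a compile-time kernel size (the 4 and 7 cases dispatched in executeEval), so the inner loops can be fully unrolled, or with Dynamic, in which case the runtime argument is used. For example (illustrative values):

const int a = GetKernelSize<4>()(123);        // a == 4: runtime argument ignored
const int b = GetKernelSize<Dynamic>()(123);  // b == 123: runtime argument returned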
568 
569 template <typename InputEvaluator, typename Index, typename InputDims,
570  int StaticKernelSize>
571 __global__ void EigenConvolutionKernel1D(
572  InputEvaluator eval,
573  const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
574  indexMapper,
575  const float* __restrict kernel, const int numPlanes, const int numX,
576  const int maxX, const int kernelSize, float* buffer) {
577  extern __shared__ float s[];
578 
579  const int first_x = blockIdx.x * maxX;
580  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
581  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
582  const int num_x_output = last_x - first_x + 1;
583 
584  const int first_plane = blockIdx.y * blockDim.y;
585  const int plane_stride = blockDim.y * gridDim.y;
586 
587  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
588  // Load inputs to shared memory
589  const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
590  const int plane_kernel_offset = threadIdx.y * num_x_input;
591  #pragma unroll
592  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
593  const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
594  s[i + plane_kernel_offset] = eval.coeff(tensor_index);
595  }
596 
597  __syncthreads();
598 
599  // Compute the convolution
600  const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
601 
602  #pragma unroll
603  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
604  const int kernel_offset = plane_kernel_offset + i;
605  float result = 0.0f;
606  #pragma unroll
607  for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
608  result += s[k + kernel_offset] * kernel[k];
609  }
610  const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
611  buffer[tensor_index] = result;
612  }
613  __syncthreads();
614  }
615 };
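Each block of the 1D kernel above stages num_x_input = last_x - first_x + kernelSize input values per plane in shared memory, so neighbouring tiles overlap by kernelSize - 1 values and every output in the tile can read its full window. The host code in executeEval sizes the allocation accordingly; with illustrative numbers:

// maxX = 128 outputs per tile, kernelSize = 7, block_size.y = 4 planes per block:
//   num_x_input = 128 + 7 - 1 = 134 floats staged per plane
//   shared_mem  = 4 * 134 * sizeof(float) = 2144 bytes per block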
616 
617 template <typename InputEvaluator, typename Index, typename InputDims,
618  int StaticKernelSizeX, int StaticKernelSizeY>
619 __global__ void EigenConvolutionKernel2D(
620  InputEvaluator eval,
621  const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
622  indexMapper,
623  const float* __restrict kernel, const int numPlanes, const int numX,
624  const int maxX, const int numY, const int maxY, const int kernelSizeX,
625  const int kernelSizeY, float* buffer) {
626  extern __shared__ float s[];
627 
628  const int first_x = blockIdx.x * maxX;
629  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
630  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
631  const int num_x_output = last_x - first_x + 1;
632 
633  const int first_y = blockIdx.y * maxY;
634  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
635  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
636  const int num_y_output = last_y - first_y + 1;
637 
638  const int first_plane = blockIdx.z * blockDim.z;
639  const int plane_stride = blockDim.z * gridDim.z;
640 
641  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
642 
643  const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
644  const int plane_kernel_offset = threadIdx.z * num_y_input;
645 
646  // Load inputs to shared memory
647  #pragma unroll
648  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
649  const int input_offset = num_x_input * (j + plane_kernel_offset);
650  #pragma unroll
651  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
652  const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
653  s[i + input_offset] = eval.coeff(tensor_index);
654  }
655  }
656 
657  __syncthreads();
658 
659  // Convolution
660  const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
661 
662  #pragma unroll
663  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
664  #pragma unroll
665  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
666  float result = 0.0f;
667  #pragma unroll
668  for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
669  const int kernel_offset = kernelSizeX * l;
670  const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
671  #pragma unroll
672  for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
673  result += s[k + input_offset] * kernel[k + kernel_offset];
674  }
675  }
676  const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
677  buffer[tensor_index] = result;
678  }
679  }
680 
681  __syncthreads();
682  }
683 };
684 
685 template <typename InputEvaluator, typename Index, typename InputDims>
686 __global__ void EigenConvolutionKernel3D(
687  InputEvaluator eval,
688  const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
689  indexMapper,
690  const float* __restrict kernel, const size_t numPlanes, const size_t numX,
691  const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
692  const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
693  const size_t kernelSizeZ, float* buffer) {
694  extern __shared__ float s[];
695 
696  // Load inputs to shared memory
697  const int first_x = blockIdx.x * maxX;
698  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
699  const int num_x_input = last_x - first_x + kernelSizeX;
700 
701  const int first_y = blockIdx.y * maxY;
702  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
703  const int num_y_input = last_y - first_y + kernelSizeY;
704 
705  const int first_z = blockIdx.z * maxZ;
706  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
707  const int num_z_input = last_z - first_z + kernelSizeZ;
708 
709  for (int p = 0; p < numPlanes; ++p) {
710 
711  const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
712  const int plane_kernel_offset = 0;
713 
714  for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
715  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
716  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
717  const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
718  s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
719  }
720  }
721  }
722 
723  __syncthreads();
724 
725  // Convolution
726  const int num_z_output = last_z - first_z + 1;
727  const int num_y_output = last_y - first_y + 1;
728  const int num_x_output = last_x - first_x + 1;
729  const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
730 
731  for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
732  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
733  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
734  float result = 0.0f;
735  for (int n = 0; n < kernelSizeZ; ++n) {
736  for (int m = 0; m < kernelSizeY; ++m) {
737  for (int l = 0; l < kernelSizeX; ++l) {
738  result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
739  }
740  }
741  }
742  const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
743  buffer[tensor_index] = result;
744  }
745  }
746  }
747  __syncthreads();
748  }
749 };
750 
751 
752 
753 template<typename Indices, typename InputArgType, typename KernelArgType>
754 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
755 {
756  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
757 
758  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
759  static const int NumKernelDims = internal::array_size<Indices>::value;
760  typedef typename XprType::Index Index;
761  typedef DSizes<Index, NumDims> Dimensions;
762  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
763 
764  enum {
765  IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned,
766  PacketAccess = false,
767  Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
768  CoordAccess = false, // to be implemented
769  RawAccess = false
770  };
771 
772  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
773  : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
774  {
775  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
776 
777  const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
778  const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
779 
780  m_dimensions = m_inputImpl.dimensions();
781  for (int i = 0; i < NumKernelDims; ++i) {
782  const Index index = op.indices()[i];
783  const Index input_dim = input_dims[index];
784  const Index kernel_dim = kernel_dims[i];
785  const Index result_dim = input_dim - kernel_dim + 1;
786  m_dimensions[index] = result_dim;
787  }
788  }
789 
790  typedef typename XprType::CoeffReturnType CoeffReturnType;
791  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
792  typedef typename InputArgType::Scalar Scalar;
793  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
794 
795  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
796 
797  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
798  preloadKernel();
799  m_inputImpl.evalSubExprsIfNeeded(NULL);
800  if (data) {
801  executeEval(data);
802  return false;
803  } else {
804  m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
805  executeEval(m_buf);
806  return true;
807  }
808  }
809 
810  EIGEN_STRONG_INLINE void cleanup() {
811  m_inputImpl.cleanup();
812  if (m_buf) {
813  m_device.deallocate(m_buf);
814  m_buf = NULL;
815  }
816  if (m_local_kernel) {
817  m_device.deallocate((void*)m_kernel);
818  m_local_kernel = false;
819  }
820  m_kernel = NULL;
821  }
822 
823  EIGEN_STRONG_INLINE void preloadKernel() {
824  // Don't make a local copy of the kernel unless we have to (i.e. it's an
825  // expression that needs to be evaluated)
826  const Scalar* in_place = m_kernelImpl.data();
827  if (in_place) {
828  m_kernel = in_place;
829  m_local_kernel = false;
830  } else {
831  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
832  Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
833  typedef TensorEvalToOp<const KernelArgType> EvalTo;
834  EvalTo evalToTmp(local, m_kernelArg);
835  const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
836  internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
837 
838  m_kernel = local;
839  m_local_kernel = true;
840  }
841  }
842 
843  static unsigned int ceil(unsigned int num, unsigned int denom) {
844  const unsigned int rounded_toward_zero = num / denom;
845  if (num > rounded_toward_zero * denom) {
846  return rounded_toward_zero + 1;
847  }
848  return rounded_toward_zero;
849  }
850 
851  void executeEval(Scalar* data) const {
852  typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
853 
854  const int maxSharedMem = m_device.sharedMemPerBlock();
855  const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
856  const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
857  const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
858  const int warpSize = 32;
859 
860  switch (NumKernelDims) {
861  case 1: {
862  const int kernel_size = m_kernelImpl.dimensions().TotalSize();
863 
864  const int numX = dimensions()[m_indices[0]];
865  const int numP = dimensions().TotalSize() / numX;
866  int maxX;
867  dim3 block_size;
868 
869  const int single_stride_dim =
870  static_cast<int>(Layout) == static_cast<int>(ColMajor)
871  ? 0
872  : m_inputImpl.dimensions().rank() - 1;
873  if (m_indices[0] == single_stride_dim) {
874  // Maximize the reuse
875  const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
876  maxX = numext::mini<int>(inner_dim, numX);
877  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
878  block_size.x = numext::mini(maxThreadsPerBlock, maxX);
879  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
880  }
881  else {
882  // Read as much as possible along the innermost dimension, that is, the plane
883  const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
884  const int maxP = numext::mini<int>(inner_dim, numP);
885  maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
886 
887  block_size.x = numext::mini(warpSize, maxX);
888  block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
889  }
890 
891  const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
892  assert(shared_mem <= maxSharedMem);
893 
894  const int num_x_blocks = ceil(numX, maxX);
895  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
896  const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
897 
898  dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
899 
900 
901  //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
902 
903  const array<Index, 1> indices(m_indices[0]);
904  const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
905  internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
906  m_inputImpl.dimensions(), kernel_dims, indices);
907  switch(kernel_size) {
908  case 4: {
909  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
910  break;
911  }
912  case 7: {
913  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
914  break;
915  }
916  default: {
917  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
918  }
919  }
920  break;
921  }
922 
923  case 2: {
924  const int idxX =
925  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
926  const int idxY =
927  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
928  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
929  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
930 
931  const int numX = dimensions()[m_indices[idxX]];
932  const int numY = dimensions()[m_indices[idxY]];
933  const int numP = dimensions().TotalSize() / (numX*numY);
934 
935  const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
936 
937  // Snap maxX to warp size
938  int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
939  const int maxX = numext::mini<int>(inner_dim, numX);
940  const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
941  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
942 
943  dim3 block_size;
944  block_size.x = numext::mini(1024, maxX);
945  block_size.y = numext::mini<int>(1024/block_size.x, maxY);
946  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
947 
948  const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
949  assert(shared_mem <= maxSharedMem);
950 
951  const int num_x_blocks = ceil(numX, maxX);
952  const int num_y_blocks = ceil(numY, maxY);
953  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
954  const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
955 
956  dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
957 
958 
959  //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
960 
961  const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
962  const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
963  m_kernelImpl.dimensions()[idxY]);
964  internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
965  m_inputImpl.dimensions(), kernel_dims, indices);
966  switch (kernel_size_x) {
967  case 4: {
968  switch (kernel_size_y) {
969  case 7: {
970  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
971  break;
972  }
973  default: {
974  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
975  break;
976  }
977  }
978  break;
979  }
980  case 7: {
981  switch (kernel_size_y) {
982  case 4: {
983  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
984  break;
985  }
986  default: {
987  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
988  break;
989  }
990  }
991  break;
992  }
993  default: {
994  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
995  break;
996  }
997  }
998  break;
999  }
1000 
1001  case 3: {
1002  const int idxX =
1003  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1004  const int idxY =
1005  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1006  const int idxZ =
1007  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1008 
1009  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1010  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1011  const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1012 
1013  const int numX = dimensions()[m_indices[idxX]];
1014  const int numY = dimensions()[m_indices[idxY]];
1015  const int numZ = dimensions()[m_indices[idxZ]];
1016  const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1017 
1018  const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1019  const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1020  const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1021 
1022  dim3 block_size;
1023  block_size.x = numext::mini(32, maxX);
1024  block_size.y = numext::mini(32, maxY);
1025  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1026  dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1027 
1028  const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1029  assert(shared_mem <= maxSharedMem);
1030 
1031  //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1032  const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1033  m_indices[idxZ]);
1034  const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1035  m_kernelImpl.dimensions()[idxY],
1036  m_kernelImpl.dimensions()[idxZ]);
1037  internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
1038  m_inputImpl.dimensions(), kernel_dims, indices);
1039 
1040  LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1041  break;
1042  }
1043 
1044  default: {
1045  EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1046  }
1047  }
1048  }
1049 
1050  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1051  {
1052  eigen_assert(m_buf);
1053  eigen_assert(index < m_dimensions.TotalSize());
1054  return m_buf[index];
1055  }
1056 
1057  template<int LoadMode>
1058  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
1059  {
1060  eigen_assert(m_buf);
1061  eigen_assert(index < m_dimensions.TotalSize());
1062  return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1063  }
1064 
1065  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
1066  costPerCoeff(bool vectorized) const {
1067  // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1068  // model.
1069  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1070  // We ignore the use of fused multiply-add.
1071  const double convolve_compute_cost =
1072  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1073  const double firstIndex_compute_cost =
1074  NumDims *
1075  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1076  TensorOpCost::DivCost<Index>());
1077  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1078  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1079  m_kernelImpl.costPerCoeff(vectorized) +
1080  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1081  PacketSize));
1082  }
1083 
1084  private:
1085  // No assignment (copies are needed by the kernels)
1086  TensorEvaluator& operator = (const TensorEvaluator&);
1087 
1088  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1089  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1090  KernelArgType m_kernelArg;
1091  Indices m_indices;
1092  Dimensions m_dimensions;
1093  Scalar* m_buf;
1094  const Scalar* m_kernel;
1095  bool m_local_kernel;
1096 
1097  const GpuDevice& m_device;
1098 };
1099 #endif
1100 
1101 
1102 } // end namespace Eigen
1103 
1104 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
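When this header is compiled with nvcc and EIGEN_USE_GPU, the same convolve() expression is evaluated through the GpuDevice specialization above, which only handles 1 to 3 kernel dimensions and float data (the CUDA kernels take float pointers). A hedged usage sketch, assuming the Eigen 3.3-era CUDA device API (CudaStreamDevice / GpuDevice) and device buffers allocated by the caller:

#define EIGEN_USE_GPU
#include <cstddef>
#include <unsupported/Eigen/CXX11/Tensor>

// d_input (20x30), d_kernel (3x3) and d_output (18x28) are device pointers
// allocated with cudaMalloc and filled by the caller; sizes are illustrative.
void gpuConvolve(float* d_input, float* d_kernel, float* d_output) {
  Eigen::TensorMap<Eigen::Tensor<float, 2> > input(d_input, 20, 30);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel(d_kernel, 3, 3);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > output(d_output, 18, 28);

  Eigen::CudaStreamDevice stream;       // wraps the default CUDA stream
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::array<ptrdiff_t, 2> dims;
  dims[0] = 0;
  dims[1] = 1;
  // Dispatches to EigenConvolutionKernel2D via the GpuDevice evaluator.
  output.device(gpu_device) = input.convolve(kernel, dims);
}

Because a destination buffer is supplied here, evalSubExprsIfNeeded() writes the result directly into d_output and no temporary m_buf is allocated.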