TensorConvolution.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12 
13 namespace Eigen {
14 
22 namespace internal {
23 
24 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
25 class IndexMapper {
26  public:
27  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
28  const array<Index, NumKernelDims>& indices) {
29 
30  array<Index, NumDims> dimensions = input_dims;
31  for (int i = 0; i < NumKernelDims; ++i) {
32  const Index index = indices[i];
33  const Index input_dim = input_dims[index];
34  const Index kernel_dim = kernel_dims[i];
35  const Index result_dim = input_dim - kernel_dim + 1;
36  dimensions[index] = result_dim;
37  }
38 
39  array<Index, NumDims> inputStrides;
40  array<Index, NumDims> outputStrides;
41  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
42  inputStrides[0] = 1;
43  outputStrides[0] = 1;
44  for (int i = 1; i < NumDims; ++i) {
45  inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
46  outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
47  }
48  } else {
49  inputStrides[NumDims - 1] = 1;
50  outputStrides[NumDims - 1] = 1;
51  for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
52  inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
53  outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
54  }
55  }
56 
57  array<Index, NumDims> gpuInputDimensions;
58  array<Index, NumDims> gpuOutputDimensions;
59  array<Index, NumDims> tmp = dimensions;
60  array<Index, NumDims> ordering;
61  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
62  ? 0
63  : NumDims - NumKernelDims;
64  for (int i = 0; i < NumKernelDims; ++i) {
65  const Index index = i + offset;
66  ordering[index] = indices[i];
67  tmp[indices[i]] = -1;
68  gpuInputDimensions[index] = input_dims[indices[i]];
69  gpuOutputDimensions[index] = dimensions[indices[i]];
70  }
71 
72  int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
73  ? NumKernelDims
74  : 0;
75  for (int i = 0; i < NumDims; ++i) {
76  if (tmp[i] >= 0) {
77  ordering[written] = i;
78  gpuInputDimensions[written] = input_dims[i];
79  gpuOutputDimensions[written] = dimensions[i];
80  ++written;
81  }
82  }
83 
84  for (int i = 0; i < NumDims; ++i) {
85  m_inputStrides[i] = inputStrides[ordering[i]];
86  m_outputStrides[i] = outputStrides[ordering[i]];
87  }
88 
89  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
90  for (int i = 0; i < NumDims; ++i) {
91  if (i > NumKernelDims) {
92  m_gpuInputStrides[i] =
93  m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
94  m_gpuOutputStrides[i] =
95  m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
96  } else {
97  m_gpuInputStrides[i] = 1;
98  m_gpuOutputStrides[i] = 1;
99  }
100  }
101  } else {
102  for (int i = NumDims - 1; i >= 0; --i) {
103  if (static_cast<size_t>(i + 1) < offset) {
104  m_gpuInputStrides[i] =
105  m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
106  m_gpuOutputStrides[i] =
107  m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
108  } else {
109  m_gpuInputStrides[i] = 1;
110  m_gpuOutputStrides[i] = 1;
111  }
112  }
113  }
114  }
115 
116  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
117  Index inputIndex = 0;
118  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
119  for (int d = NumDims - 1; d > NumKernelDims; --d) {
120  const Index idx = p / m_gpuInputStrides[d];
121  inputIndex += idx * m_inputStrides[d];
122  p -= idx * m_gpuInputStrides[d];
123  }
124  inputIndex += p * m_inputStrides[NumKernelDims];
125  } else {
126  std::ptrdiff_t limit = 0;
127  if (NumKernelDims < NumDims) {
128  limit = NumDims - NumKernelDims - 1;
129  }
130  for (int d = 0; d < limit; ++d) {
131  const Index idx = p / m_gpuInputStrides[d];
132  inputIndex += idx * m_inputStrides[d];
133  p -= idx * m_gpuInputStrides[d];
134  }
135  inputIndex += p * m_inputStrides[limit];
136  }
137  return inputIndex;
138  }
139 
140  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
141  Index outputIndex = 0;
142  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
143  for (int d = NumDims - 1; d > NumKernelDims; --d) {
144  const Index idx = p / m_gpuOutputStrides[d];
145  outputIndex += idx * m_outputStrides[d];
146  p -= idx * m_gpuOutputStrides[d];
147  }
148  outputIndex += p * m_outputStrides[NumKernelDims];
149  } else {
150  std::ptrdiff_t limit = 0;
151  if (NumKernelDims < NumDims) {
152  limit = NumDims - NumKernelDims - 1;
153  }
154  for (int d = 0; d < limit; ++d) {
155  const Index idx = p / m_gpuOutputStrides[d];
156  outputIndex += idx * m_outputStrides[d];
157  p -= idx * m_gpuOutputStrides[d];
158  }
159  outputIndex += p * m_outputStrides[limit];
160  }
161  return outputIndex;
162  }
163 
164  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
165  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
166  ? 0
167  : NumDims - NumKernelDims;
168  return i * m_inputStrides[offset];
169  }
170 
171  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
172  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
173  ? 0
174  : NumDims - NumKernelDims;
175  return i * m_outputStrides[offset];
176  }
177 
178  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
179  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
180  ? 0
181  : NumDims - NumKernelDims;
182  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
183  }
184 
185  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
186  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
187  ? 0
188  : NumDims - NumKernelDims;
189  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
190  }
191 
192  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
193  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
194  ? 0
195  : NumDims - NumKernelDims;
196  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
197  k * m_inputStrides[offset + 2];
198  }
199 
200  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
201  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
202  ? 0
203  : NumDims - NumKernelDims;
204  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
205  k * m_outputStrides[offset + 2];
206  }
207 
208  private:
209  static const int NumDims = internal::array_size<InputDims>::value;
210  array<Index, NumDims> m_inputStrides;
211  array<Index, NumDims> m_outputStrides;
212  array<Index, NumDims> m_gpuInputStrides;
213  array<Index, NumDims> m_gpuOutputStrides;
214 };
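// Worked example (added for illustration, not part of the original header):
// for a ColMajor 5x10 input convolved along dimension 0 with a kernel of size 3,
// the constructor above computes output dimensions {5 - 3 + 1, 10} = {3, 10},
// input strides {1, 5} and output strides {1, 3}; the mapGpu*Offset() helpers
// then translate block/thread coordinates of the GPU kernels into these
// flattened tensor offsets.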
215 
216 
217 
218 template<typename Dimensions, typename InputXprType, typename KernelXprType>
219 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
220 {
221  // Type promotion to handle the case where the types of the lhs and the rhs are different.
222  typedef typename promote_storage_type<typename InputXprType::Scalar,
223  typename KernelXprType::Scalar>::ret Scalar;
224  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
225  typename traits<KernelXprType>::StorageKind>::ret StorageKind;
226  typedef typename promote_index_type<typename traits<InputXprType>::Index,
227  typename traits<KernelXprType>::Index>::type Index;
228  typedef typename InputXprType::Nested LhsNested;
229  typedef typename KernelXprType::Nested RhsNested;
230  typedef typename remove_reference<LhsNested>::type _LhsNested;
231  typedef typename remove_reference<RhsNested>::type _RhsNested;
232  static const int NumDimensions = traits<InputXprType>::NumDimensions;
233  static const int Layout = traits<InputXprType>::Layout;
234  typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
235  typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
236 
237  enum {
238  Flags = 0
239  };
240 };
241 
242 template<typename Dimensions, typename InputXprType, typename KernelXprType>
243 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
244 {
245  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
246 };
247 
248 template<typename Dimensions, typename InputXprType, typename KernelXprType>
249 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
250 {
251  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
252 };
253 
254 } // end namespace internal
255 
256 
257 
258 template<typename Indices, typename InputXprType, typename KernelXprType>
259 class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
260 {
261  public:
262  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
263  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
264  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
265  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
266  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
267  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
268  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
269 
270  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
271  : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
272 
273  EIGEN_DEVICE_FUNC
274  const Indices& indices() const { return m_indices; }
275 
276  /** \returns the nested expressions */
277  EIGEN_DEVICE_FUNC
278  const typename internal::remove_all<typename InputXprType::Nested>::type&
279  inputExpression() const { return m_input_xpr; }
280 
281  EIGEN_DEVICE_FUNC
282  const typename internal::remove_all<typename KernelXprType::Nested>::type&
283  kernelExpression() const { return m_kernel_xpr; }
284 
285  protected:
286  typename InputXprType::Nested m_input_xpr;
287  typename KernelXprType::Nested m_kernel_xpr;
288  const Indices m_indices;
289 };
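// Usage sketch (added for illustration; assumes the public TensorBase::convolve
// API, which builds this TensorConvolutionOp expression):
//
//   Eigen::Tensor<float, 2> input(20, 30);
//   Eigen::Tensor<float, 1> kernel(3);
//   input.setRandom();
//   kernel.setRandom();
//   Eigen::array<ptrdiff_t, 1> dims({0});  // convolve along dimension 0
//   Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
//   // output dimension 0 is 20 - 3 + 1 = 18, dimension 1 stays 30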
290 
291 
292 template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
293 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
294 {
295  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
296 
297  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
298  static const int NumKernelDims = internal::array_size<Indices>::value;
299  typedef typename XprType::Index Index;
300  typedef DSizes<Index, NumDims> Dimensions;
301 
302  typedef typename XprType::Scalar Scalar;
303  typedef typename XprType::CoeffReturnType CoeffReturnType;
304  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
305  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
306  typedef StorageMemory<Scalar, Device> Storage;
307  typedef typename Storage::Type EvaluatorPointerType;
308 
309  enum {
310  IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
311  PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
312  BlockAccess = false,
313  PreferBlockAccess = false,
314  Layout = TensorEvaluator<InputArgType, Device>::Layout,
315  CoordAccess = false, // to be implemented
316  RawAccess = false
317  };
318 
319  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
320  typedef internal::TensorBlockNotImplemented TensorBlock;
321  //===--------------------------------------------------------------------===//
322 
323  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
324  : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
325  {
326  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
327 
328  const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
329  const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
330 
331  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
332  m_inputStride[0] = 1;
333  for (int i = 1; i < NumDims; ++i) {
334  m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
335  }
336  } else {
337  m_inputStride[NumDims - 1] = 1;
338  for (int i = NumDims - 2; i >= 0; --i) {
339  m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
340  }
341  }
342 
343  m_dimensions = m_inputImpl.dimensions();
344  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
345  for (int i = 0; i < NumKernelDims; ++i) {
346  const Index index = op.indices()[i];
347  const Index input_dim = input_dims[index];
348  const Index kernel_dim = kernel_dims[i];
349  const Index result_dim = input_dim - kernel_dim + 1;
350  m_dimensions[index] = result_dim;
351  if (i > 0) {
352  m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
353  } else {
354  m_kernelStride[0] = 1;
355  }
356  m_indexStride[i] = m_inputStride[index];
357  }
358 
359  m_outputStride[0] = 1;
360  for (int i = 1; i < NumDims; ++i) {
361  m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
362  }
363  } else {
364  for (int i = NumKernelDims - 1; i >= 0; --i) {
365  const Index index = op.indices()[i];
366  const Index input_dim = input_dims[index];
367  const Index kernel_dim = kernel_dims[i];
368  const Index result_dim = input_dim - kernel_dim + 1;
369  m_dimensions[index] = result_dim;
370  if (i < NumKernelDims - 1) {
371  m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
372  } else {
373  m_kernelStride[NumKernelDims - 1] = 1;
374  }
375  m_indexStride[i] = m_inputStride[index];
376  }
377 
378  m_outputStride[NumDims - 1] = 1;
379  for (int i = NumDims - 2; i >= 0; --i) {
380  m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
381  }
382  }
383  }
384 
385  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
386 
387  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
388  m_inputImpl.evalSubExprsIfNeeded(NULL);
389  preloadKernel();
390  return true;
391  }
392  EIGEN_STRONG_INLINE void cleanup() {
393  m_inputImpl.cleanup();
394  if (m_local_kernel) {
395  m_device.deallocate((void*)m_kernel);
396  m_local_kernel = false;
397  }
398  m_kernel = NULL;
399  }
400 
401  void evalTo(typename XprType::Scalar* buffer) {
402  evalSubExprsIfNeeded(NULL);
403  for (int i = 0; i < dimensions().TotalSize(); ++i) {
404  buffer[i] += coeff(i);
405  }
406  cleanup();
407  }
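// Note (added): evalTo() accumulates with "+=" rather than assigning, so the
// destination buffer is presumably expected to be zero-initialised (or to hold
// values the convolution result should be added to) before it is called.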
408 
409  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
410  {
411  CoeffReturnType result = CoeffReturnType(0);
412  convolve(firstInput(index), 0, NumKernelDims-1, result);
413  return result;
414  }
415 
416  template<int LoadMode>
417  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
418  {
419  Index indices[2] = {index, index+PacketSize-1};
420  Index startInputs[2] = {0, 0};
421  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
422  for (int i = NumDims - 1; i > 0; --i) {
423  const Index idx0 = indices[0] / m_outputStride[i];
424  const Index idx1 = indices[1] / m_outputStride[i];
425  startInputs[0] += idx0 * m_inputStride[i];
426  startInputs[1] += idx1 * m_inputStride[i];
427  indices[0] -= idx0 * m_outputStride[i];
428  indices[1] -= idx1 * m_outputStride[i];
429  }
430  } else {
431  for (int i = 0; i < NumDims - 1; ++i) {
432  const Index idx0 = indices[0] / m_outputStride[i];
433  const Index idx1 = indices[1] / m_outputStride[i];
434  startInputs[0] += idx0 * m_inputStride[i];
435  startInputs[1] += idx1 * m_inputStride[i];
436  indices[0] -= idx0 * m_outputStride[i];
437  indices[1] -= idx1 * m_outputStride[i];
438  }
439  }
440  startInputs[0] += indices[0];
441  startInputs[1] += indices[1];
442 
443  if (startInputs[1]-startInputs[0] == PacketSize-1) {
444  PacketReturnType result = internal::pset1<PacketReturnType>(0);
445  convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
446  return result;
447  } else {
448  EIGEN_ALIGN_MAX Scalar data[PacketSize];
449  data[0] = Scalar(0);
450  convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
451  for (int i = 1; i < PacketSize-1; ++i) {
452  data[i] = Scalar(0);
453  convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
454  }
455  data[PacketSize-1] = Scalar(0);
456  convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
457  return internal::pload<PacketReturnType>(data);
458  }
459  }
460 
461  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
462  costPerCoeff(bool vectorized) const {
463  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
464  // We ignore the use of fused multiply-add.
465  const double convolve_compute_cost =
466  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
467  const double firstIndex_compute_cost =
468  NumDims *
469  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
470  TensorOpCost::DivCost<Index>());
471  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
472  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
473  m_kernelImpl.costPerCoeff(vectorized) +
474  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
475  PacketSize));
476  }
477 
478  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
479 
480  private:
481  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
482  Index startInput = 0;
483  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
484  for (int i = NumDims - 1; i > 0; --i) {
485  const Index idx = index / m_outputStride[i];
486  startInput += idx * m_inputStride[i];
487  index -= idx * m_outputStride[i];
488  }
489  } else {
490  for (int i = 0; i < NumDims - 1; ++i) {
491  const Index idx = index / m_outputStride[i];
492  startInput += idx * m_inputStride[i];
493  index -= idx * m_outputStride[i];
494  }
495  }
496  startInput += index;
497  return startInput;
498  }
499 
500  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
501  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
502  const Index input = firstIndex + j * m_indexStride[DimIndex];
503  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
504  if (DimIndex > 0) {
505  convolve(input, kernel, DimIndex-1, accum);
506  } else {
507  accum += m_inputImpl.coeff(input) * m_kernel[kernel];
508  }
509  }
510  }
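// Note (added for clarity): convolve() recurses from the last convolved
// dimension down to DimIndex == 0; for a 1-D kernel of size 3 the recursion
// bottoms out immediately and simply accumulates
//   accum += input(first + j*indexStride) * kernel[j]   for j = 0, 1, 2,
// while higher-dimensional kernels add one loop level per kernel dimension.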
511 
512  template <typename Packet>
513  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
514  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
515  const Index input = firstIndex + j * m_indexStride[DimIndex];
516  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
517  if (DimIndex > 0) {
518  convolvePacket(input, kernel, DimIndex-1, accum);
519  } else {
520  accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
521  }
522  }
523  }
524 
525  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
526  // Don't make a local copy of the kernel unless we have to (i.e. it's an
527  // expression that needs to be evaluated)
528  const Scalar* in_place = m_kernelImpl.data();
529  if (in_place) {
530  m_kernel = in_place;
531  m_local_kernel = false;
532  } else {
533  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
534  Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
535  typedef TensorEvalToOp<const KernelArgType> EvalTo;
536  EvalTo evalToTmp(local, m_kernelArg);
537  const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
538  internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
539 
540  m_kernel = local;
541  m_local_kernel = true;
542  }
543  }
544 
545  array<Index, NumDims> m_inputStride;
546  array<Index, NumDims> m_outputStride;
547 
548  array<Index, NumKernelDims> m_indexStride;
549  array<Index, NumKernelDims> m_kernelStride;
550  TensorEvaluator<InputArgType, Device> m_inputImpl;
551  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
552  Dimensions m_dimensions;
553 
554  KernelArgType m_kernelArg;
555  const Scalar* m_kernel;
556  bool m_local_kernel;
557  const Device EIGEN_DEVICE_REF m_device;
558 };
559 
560 
561 
562 
563 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
564 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
565 
566 template <int StaticKernelSize>
567 struct GetKernelSize {
568  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
569  return StaticKernelSize;
570  }
571 };
572 template <>
573 struct GetKernelSize<Dynamic> {
574  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
575  return kernelSize;
576  }
577 };
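// Note (added for clarity): GetKernelSize<N> returns the compile-time constant N,
// so the convolution loops below get a constant trip count and the #pragma unroll
// hints can take effect; GetKernelSize<Dynamic> simply forwards the runtime value.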
578 
579 template <typename InputEvaluator, typename Index, typename InputDims,
580  int StaticKernelSize>
581 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
582  InputEvaluator eval,
583  const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
584  indexMapper,
585  const float* __restrict kernel, const int numPlanes, const int numX,
586  const int maxX, const int kernelSize, float* buffer) {
587 #if defined(EIGEN_HIPCC)
588  HIP_DYNAMIC_SHARED(float, s)
589 #else
590  extern __shared__ float s[];
591 #endif
592 
593  const int first_x = blockIdx.x * maxX;
594  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
595  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
596  const int num_x_output = last_x - first_x + 1;
597 
598  const int first_plane = blockIdx.y * blockDim.y;
599  const int plane_stride = blockDim.y * gridDim.y;
600 
601  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
602  // Load inputs to shared memory
603  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
604  const int plane_kernel_offset = threadIdx.y * num_x_input;
605  #pragma unroll
606  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
607  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
608  s[i + plane_kernel_offset] = eval.coeff(tensor_index);
609  }
610 
611  __syncthreads();
612 
613  // Compute the convolution
614  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
615 
616  #pragma unroll
617  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
618  const int kernel_offset = plane_kernel_offset + i;
619  float result = 0.0f;
620  #pragma unroll
621  for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
622  result += s[k + kernel_offset] * kernel[k];
623  }
624  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
625  buffer[tensor_index] = result;
626  }
627  __syncthreads();
628  }
629 };
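// Note (added for clarity): each block of EigenConvolutionKernel1D stages
// num_x_output + kernelSize - 1 input values per plane into shared memory,
// synchronises, and then every thread computes a sliding dot product of length
// kernelSize for the output positions it owns.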
630 
631 template <typename InputEvaluator, typename Index, typename InputDims,
632  int StaticKernelSizeX, int StaticKernelSizeY>
633 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
634  InputEvaluator eval,
635  const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
636  indexMapper,
637  const float* __restrict kernel, const int numPlanes, const int numX,
638  const int maxX, const int numY, const int maxY, const int kernelSizeX,
639  const int kernelSizeY, float* buffer) {
640 #if defined(EIGEN_HIPCC)
641  HIP_DYNAMIC_SHARED(float, s)
642 #else
643  extern __shared__ float s[];
644 #endif
645 
646  const int first_x = blockIdx.x * maxX;
647  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
648  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
649  const int num_x_output = last_x - first_x + 1;
650 
651  const int first_y = blockIdx.y * maxY;
652  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
653  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
654  const int num_y_output = last_y - first_y + 1;
655 
656  const int first_plane = blockIdx.z * blockDim.z;
657  const int plane_stride = blockDim.z * gridDim.z;
658 
659  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
660 
661  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
662  const int plane_kernel_offset = threadIdx.z * num_y_input;
663 
664  // Load inputs to shared memory
665  #pragma unroll
666  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
667  const int input_offset = num_x_input * (j + plane_kernel_offset);
668  #pragma unroll
669  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
670  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
671  s[i + input_offset] = eval.coeff(tensor_index);
672  }
673  }
674 
675  __syncthreads();
676 
677  // Convolution
678  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
679 
680  #pragma unroll
681  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
682  #pragma unroll
683  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
684  float result = 0.0f;
685  #pragma unroll
686  for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
687  const int kernel_offset = kernelSizeX * l;
688  const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
689  #pragma unroll
690  for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
691  result += s[k + input_offset] * kernel[k + kernel_offset];
692  }
693  }
694  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
695  buffer[tensor_index] = result;
696  }
697  }
698 
699  __syncthreads();
700  }
701 };
702 
703 template <typename InputEvaluator, typename Index, typename InputDims>
704 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
705  InputEvaluator eval,
706  const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
707  indexMapper,
708  const float* __restrict kernel, const size_t numPlanes, const size_t numX,
709  const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
710  const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
711  const size_t kernelSizeZ, float* buffer) {
712 #if defined(EIGEN_HIPCC)
713  HIP_DYNAMIC_SHARED(float, s)
714 #else
715  extern __shared__ float s[];
716 #endif
717 
718  // Load inputs to shared memory
719  const int first_x = blockIdx.x * maxX;
720  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
721  const int num_x_input = last_x - first_x + kernelSizeX;
722 
723  const int first_y = blockIdx.y * maxY;
724  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
725  const int num_y_input = last_y - first_y + kernelSizeY;
726 
727  const int first_z = blockIdx.z * maxZ;
728  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
729  const int num_z_input = last_z - first_z + kernelSizeZ;
730 
731  for (int p = 0; p < numPlanes; ++p) {
732 
733  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
734  const int plane_kernel_offset = 0;
735 
736  for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
737  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
738  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
739  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
740  s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
741  }
742  }
743  }
744 
745  __syncthreads();
746 
747  // Convolution
748  const int num_z_output = last_z - first_z + 1;
749  const int num_y_output = last_y - first_y + 1;
750  const int num_x_output = last_x - first_x + 1;
751  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
752 
753  for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
754  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
755  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
756  float result = 0.0f;
757  for (int n = 0; n < kernelSizeZ; ++n) {
758  for (int m = 0; m < kernelSizeY; ++m) {
759  for (int l = 0; l < kernelSizeX; ++l) {
760  result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
761  }
762  }
763  }
764  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
765  buffer[tensor_index] = result;
766  }
767  }
768  }
769  __syncthreads();
770  }
771 };
772 
773 
774 
775 template<typename Indices, typename InputArgType, typename KernelArgType>
776 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
777 {
778  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
779 
780  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
781  static const int NumKernelDims = internal::array_size<Indices>::value;
782  typedef typename XprType::Index Index;
783  typedef DSizes<Index, NumDims> Dimensions;
784  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
785 
786  enum {
787  IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
788  PacketAccess = false,
789  BlockAccess = false,
790  PreferBlockAccess = false,
791  Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
792  CoordAccess = false, // to be implemented
793  RawAccess = false
794  };
795 
796  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
797  typedef internal::TensorBlockNotImplemented TensorBlock;
798  //===--------------------------------------------------------------------===//
799 
800  TensorEvaluator(const XprType& op, const GpuDevice& device)
801  : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
802  {
803  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
804 
805  const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
806  const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
807 
808  m_dimensions = m_inputImpl.dimensions();
809  for (int i = 0; i < NumKernelDims; ++i) {
810  const Index index = op.indices()[i];
811  const Index input_dim = input_dims[index];
812  const Index kernel_dim = kernel_dims[i];
813  const Index result_dim = input_dim - kernel_dim + 1;
814  m_dimensions[index] = result_dim;
815  }
816  }
817 
818  typedef typename XprType::CoeffReturnType CoeffReturnType;
819  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
820  typedef typename InputArgType::Scalar Scalar;
821  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
822 
823  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
824 
825  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
826  preloadKernel();
827  m_inputImpl.evalSubExprsIfNeeded(NULL);
828  if (data) {
829  executeEval(data);
830  return false;
831  } else {
832  m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
833  executeEval(m_buf);
834  return true;
835  }
836  }
837 
838  EIGEN_STRONG_INLINE void cleanup() {
839  m_inputImpl.cleanup();
840  if (m_buf) {
841  m_device.deallocate(m_buf);
842  m_buf = NULL;
843  }
844  if (m_local_kernel) {
845  m_device.deallocate((void*)m_kernel);
846  m_local_kernel = false;
847  }
848  m_kernel = NULL;
849  }
850 
851  EIGEN_STRONG_INLINE void preloadKernel() {
852  // Don't make a local copy of the kernel unless we have to (i.e. it's an
853  // expression that needs to be evaluated)
854  const Scalar* in_place = m_kernelImpl.data();
855  if (in_place) {
856  m_kernel = in_place;
857  m_local_kernel = false;
858  } else {
859  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
860  Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
861  typedef TensorEvalToOp<const KernelArgType> EvalTo;
862  EvalTo evalToTmp(local, m_kernelArg);
863  const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
864  internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
865 
866  m_kernel = local;
867  m_local_kernel = true;
868  }
869  }
870 
871  static unsigned int ceil(unsigned int num, unsigned int denom) {
872  const unsigned int rounded_toward_zero = num / denom;
873  if (num > rounded_toward_zero * denom) {
874  return rounded_toward_zero + 1;
875  }
876  return rounded_toward_zero;
877  }
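// Example (added): ceil(10, 4) == 3, i.e. integer division rounded up, used below
// to compute how many GPU blocks are needed to cover each output dimension.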
878 
879  void executeEval(Scalar* data) const {
880  typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
881 
882  const int maxSharedMem = m_device.sharedMemPerBlock();
883  const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
884  const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
885  const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
886  const int warpSize = 32;
887 
888  switch (NumKernelDims) {
889  case 1: {
890  const int kernel_size = m_kernelImpl.dimensions().TotalSize();
891 
892  const int numX = dimensions()[m_indices[0]];
893  const int numP = dimensions().TotalSize() / numX;
894  int maxX;
895  dim3 block_size;
896 
897  const int single_stride_dim =
898  static_cast<int>(Layout) == static_cast<int>(ColMajor)
899  ? 0
900  : m_inputImpl.dimensions().rank() - 1;
901  if (m_indices[0] == single_stride_dim) {
902  // Maximize the reuse
903  const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
904  maxX = numext::mini<int>(inner_dim, numX);
905  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
906  block_size.x = numext::mini(maxThreadsPerBlock, maxX);
907  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
908  }
909  else {
910  // Read as much as possible along the innermost dimension, that is the plane
911  const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
912  const int maxP = numext::mini<int>(inner_dim, numP);
913  maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
914 
915  block_size.x = numext::mini(warpSize, maxX);
916  block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
917  }
918 
919  const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
920  gpu_assert(shared_mem <= maxSharedMem);
921 
922  const int num_x_blocks = ceil(numX, maxX);
923  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
924  const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
925 
926  dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
927 
928 
929  //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
930 
931  const array<Index, 1> indices(m_indices[0]);
932  const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
933  internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
934  m_inputImpl.dimensions(), kernel_dims, indices);
935  switch(kernel_size) {
936  case 4: {
937  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
938  break;
939  }
940  case 7: {
941  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
942  break;
943  }
944  default: {
945  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
946  }
947  }
948  break;
949  }
950 
951  case 2: {
952  const int idxX =
953  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
954  const int idxY =
955  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
956  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
957  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
958 
959  const int numX = dimensions()[m_indices[idxX]];
960  const int numY = dimensions()[m_indices[idxY]];
961  const int numP = dimensions().TotalSize() / (numX*numY);
962 
963  const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
964 
965  // Snap maxX to warp size
966  int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
967  const int maxX = numext::mini<int>(inner_dim, numX);
968  const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
969  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
970 
971  dim3 block_size;
972  block_size.x = numext::mini(1024, maxX);
973  block_size.y = numext::mini<int>(1024/block_size.x, maxY);
974  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
975 
976  const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
977  gpu_assert(shared_mem <= maxSharedMem);
978 
979  const int num_x_blocks = ceil(numX, maxX);
980  const int num_y_blocks = ceil(numY, maxY);
981  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
982  const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
983 
984  dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
985 
986 
987  //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
988 
989  const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
990  const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
991  m_kernelImpl.dimensions()[idxY]);
992  internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
993  m_inputImpl.dimensions(), kernel_dims, indices);
994  switch (kernel_size_x) {
995  case 4: {
996  switch (kernel_size_y) {
997  case 7: {
998  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
999  break;
1000  }
1001  default: {
1002  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
1003  break;
1004  }
1005  }
1006  break;
1007  }
1008  case 7: {
1009  switch (kernel_size_y) {
1010  case 4: {
1011  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
1012  break;
1013  }
1014  default: {
1015  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
1016  break;
1017  }
1018  }
1019  break;
1020  }
1021  default: {
1022  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
1023  break;
1024  }
1025  }
1026  break;
1027  }
1028 
1029  case 3: {
1030  const int idxX =
1031  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1032  const int idxY =
1033  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1034  const int idxZ =
1035  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1036 
1037  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1038  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1039  const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1040 
1041  const int numX = dimensions()[m_indices[idxX]];
1042  const int numY = dimensions()[m_indices[idxY]];
1043  const int numZ = dimensions()[m_indices[idxZ]];
1044  const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1045 
1046  const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1047  const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1048  const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1049 
1050  dim3 block_size;
1051  block_size.x = numext::mini(32, maxX);
1052  block_size.y = numext::mini(32, maxY);
1053  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1054  dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1055 
1056  const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1057  gpu_assert(shared_mem <= maxSharedMem);
1058 
1059  //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1060  const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1061  m_indices[idxZ]);
1062  const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1063  m_kernelImpl.dimensions()[idxY],
1064  m_kernelImpl.dimensions()[idxZ]);
1065  internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
1066  m_inputImpl.dimensions(), kernel_dims, indices);
1067 
1068  LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1069  break;
1070  }
1071 
1072  default: {
1073  EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1074  }
1075  }
1076  }
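// Note (added for clarity): in each case above the shared-memory tile holds
// (tile extent + kernel size - 1) input values per convolved dimension for every
// plane processed by a block, which is why shared_mem is checked against
// m_device.sharedMemPerBlock() before each launch.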
1077 
1078  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1079  {
1080  eigen_assert(m_buf);
1081  eigen_assert(index < m_dimensions.TotalSize());
1082  return m_buf[index];
1083  }
1084 
1085  template<int LoadMode>
1086  EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
1087  {
1088  eigen_assert(m_buf);
1089  eigen_assert(index < m_dimensions.TotalSize());
1090  return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1091  }
1092 
1093  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
1094  costPerCoeff(bool vectorized) const {
1095  // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1096  // model.
1097  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1098  // We ignore the use of fused multiply-add.
1099  const double convolve_compute_cost =
1100  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1101  const double firstIndex_compute_cost =
1102  NumDims *
1103  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1104  TensorOpCost::DivCost<Index>());
1105  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1106  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1107  m_kernelImpl.costPerCoeff(vectorized) +
1108  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1109  PacketSize));
1110  }
1111 
1112  private:
1113  // No assignment (copies are needed by the kernels)
1114  TensorEvaluator& operator = (const TensorEvaluator&);
1115 
1116  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1117  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1118  KernelArgType m_kernelArg;
1119  Indices m_indices;
1120  Dimensions m_dimensions;
1121  Scalar* m_buf;
1122  const Scalar* m_kernel;
1123  bool m_local_kernel;
1124 
1125  const GpuDevice& m_device;
1126 };
1127 #endif
1128 
1129 
1130 } // end namespace Eigen
1131 
1132 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
Eigen::TensorEvaluator::dimensions
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE Dimensions & dimensions() const
Definition: TensorEvaluator.h:73
gtsam.examples.DogLegOptimizerExample.int
int
Definition: DogLegOptimizerExample.py:111
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::Dimensions
DSizes< Index, NumDims > Dimensions
Definition: TensorConvolution.h:300
Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::StorageKind
promote_storage_type< typename traits< InputXprType >::StorageKind, typename traits< KernelXprType >::StorageKind >::ret StorageKind
Definition: TensorConvolution.h:225
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::evalTo
void evalTo(typename XprType::Scalar *buffer)
Definition: TensorConvolution.h:401
Eigen::internal::array_size::value
@ value
Definition: Meta.h:446
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::cleanup
EIGEN_STRONG_INLINE void cleanup()
Definition: TensorConvolution.h:392
EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
Eigen
Namespace containing all symbols from the Eigen library.
Definition: jet.h:637
dimensions
const std::vector< size_t > dimensions
Definition: testVerticalBlockMatrix.cpp:27
Eigen::internal::IndexMapper::mapGpuInputKernelToTensorInputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const
Definition: TensorConvolution.h:164
Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::PointerType
conditional< Pointer_type_promotion< typename InputXprType::Scalar, Scalar >::val, typename traits< InputXprType >::PointerType, typename traits< KernelXprType >::PointerType >::type PointerType
Definition: TensorConvolution.h:235
Eigen::internal::TensorBlockNotImplemented
Definition: TensorBlock.h:617
__restrict
#define __restrict
Definition: gkregex.h:522
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::Index
XprType::Index Index
Definition: TensorConvolution.h:299
s
RealScalar s
Definition: level1_cplx_impl.h:126
d
static const double d[K][N]
Definition: igam.h:11
Eigen::internal::IsVectorizable
Definition: TensorForwardDeclarations.h:147
Eigen::internal::IndexMapper::IndexMapper
IndexMapper(const InputDims &input_dims, const array< Index, NumKernelDims > &kernel_dims, const array< Index, NumKernelDims > &indices)
Definition: TensorConvolution.h:27
Eigen::array< Index, NumKernelDims >
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::dimensions
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE Dimensions & dimensions() const
Definition: TensorConvolution.h:385
Eigen::internal::nested
Definition: TensorTraits.h:174
blockDim
dim3 blockDim
Definition: gpu_common.h:19
Eigen::internal::IndexMapper::mapGpuInputKernelToTensorInputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const
Definition: TensorConvolution.h:178
eigen_assert
#define eigen_assert(x)
Definition: Macros.h:1037
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_kernelStride
array< Index, NumKernelDims > m_kernelStride
Definition: TensorConvolution.h:549
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::EvaluatorPointerType
Storage::Type EvaluatorPointerType
Definition: TensorConvolution.h:307
Eigen::TensorEvaluator::TensorBlock
internal::TensorMaterializedBlock< ScalarNoConst, NumCoords, Layout, Index > TensorBlock
Definition: TensorEvaluator.h:63
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::costPerCoeff
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const
Definition: TensorConvolution.h:462
ret
DenseIndex ret
Definition: level1_cplx_impl.h:44
Eigen::TensorEvaluator::Index
Derived::Index Index
Definition: TensorEvaluator.h:30
Eigen::TensorEvaluator::Layout
@ Layout
Definition: TensorEvaluator.h:50
Eigen::PacketType::type
internal::packet_traits< Scalar >::type type
Definition: TensorMeta.h:51
type
Definition: pytypes.h:1525
Eigen::internal::IsVectorizable::value
static const bool value
Definition: TensorForwardDeclarations.h:148
Eigen::internal::IndexMapper::m_outputStrides
array< Index, NumDims > m_outputStrides
Definition: TensorConvolution.h:211
buffer
Definition: pytypes.h:2270
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_inputStride
array< Index, NumDims > m_inputStride
Definition: TensorConvolution.h:545
Eigen::internal::IndexMapper::m_inputStrides
array< Index, NumDims > m_inputStrides
Definition: TensorConvolution.h:210
Eigen::TensorEvaluator::packet
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
Definition: TensorEvaluator.h:105
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::firstInput
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const
Definition: TensorConvolution.h:481
result
Values result
Definition: OdometryOptimize.cpp:8
Eigen::internal::IndexMapper::mapGpuOutputKernelToTensorOutputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const
Definition: TensorConvolution.h:200
Eigen::TensorEvaluator::PacketSize
static const int PacketSize
Definition: TensorEvaluator.h:36
Eigen::TensorEvaluator::evalSubExprsIfNeeded
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest)
Definition: TensorEvaluator.h:75
gtsam::utils.numerical_derivative.local
np.ndarray local(Y a, Y b)
Definition: numerical_derivative.py:33
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::packet
EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
Definition: TensorConvolution.h:417
Eigen::DSizes< Index, NumDims >
test_eigen_tensor.indices
indices
Definition: test_eigen_tensor.py:33
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_local_kernel
bool m_local_kernel
Definition: TensorConvolution.h:556
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::evalSubExprsIfNeeded
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar *)
Definition: TensorConvolution.h:387
Eigen::internal::IndexMapper::mapGpuInputKernelToTensorInputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const
Definition: TensorConvolution.h:192
EIGEN_ALIGN_MAX
#define EIGEN_ALIGN_MAX
Definition: ConfigureVectorization.h:157
n
int n
Definition: BiCGSTAB_simple.cpp:1
Eigen::PacketType
Definition: TensorMeta.h:50
Eigen::internal::IndexMapper::mapGpuOutputPlaneToTensorOutputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const
Definition: TensorConvolution.h:140
Eigen::internal::true_type
Definition: Meta.h:96
Eigen::internal::unpacket_traits::size
@ size
Definition: GenericPacketMath.h:138
Eigen::TensorConvolutionOp::CoeffReturnType
internal::promote_storage_type< typename InputXprType::CoeffReturnType, typename KernelXprType::CoeffReturnType >::ret CoeffReturnType
Definition: TensorConvolution.h:265
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::preloadKernel
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel()
Definition: TensorConvolution.h:525
Eigen::TensorConvolutionOp::m_input_xpr
InputXprType::Nested m_input_xpr
Definition: TensorConvolution.h:286
Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::Index
promote_index_type< typename traits< InputXprType >::Index, typename traits< KernelXprType >::Index >::type Index
Definition: TensorConvolution.h:227
j
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2
Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::Scalar
promote_storage_type< typename InputXprType::Scalar, typename KernelXprType::Scalar >::ret Scalar
Definition: TensorConvolution.h:223
Eigen::TensorConvolutionOp::m_indices
const Indices m_indices
Definition: TensorConvolution.h:288
operator()
internal::enable_if< internal::valid_indexed_view_overload< RowIndices, ColIndices >::value &&internal::traits< typename EIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::ReturnAsIndexedView, typename EIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::type operator()(const RowIndices &rowIndices, const ColIndices &colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
Definition: IndexedViewMethods.h:73
Eigen::TensorConvolutionOp
Definition: TensorConvolution.h:259
Eigen::Dynamic
const int Dynamic
Definition: Constants.h:22
Eigen::TensorEvaluator::data
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const
Definition: TensorEvaluator.h:181
Eigen::numext::mini
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition: Eigen/src/Core/MathFunctions.h:1085
l
static const Line3 l(Rot3(), 1, 1)
Eigen::internal::IndexMapper::mapGpuInputPlaneToTensorInputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const
Definition: TensorConvolution.h:116
EIGEN_STRONG_INLINE
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_outputStride
array< Index, NumDims > m_outputStride
Definition: TensorConvolution.h:546
Eigen::TensorEvaluator::PreferBlockAccess
@ PreferBlockAccess
Definition: TensorEvaluator.h:49
Eigen::internal::IndexMapper::m_gpuOutputStrides
array< Index, NumDims > m_gpuOutputStrides
Definition: TensorConvolution.h:213
blockIdx
dim3 blockIdx
Definition: gpu_common.h:19
Eigen::TensorConvolutionOp::Index
Eigen::internal::traits< TensorConvolutionOp >::Index Index
Definition: TensorConvolution.h:268
Eigen::TensorEvaluator::TensorEvaluator
EIGEN_STRONG_INLINE TensorEvaluator(const Derived &m, const Device &device)
Definition: TensorEvaluator.h:66
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::TensorEvaluator
EIGEN_STRONG_INLINE TensorEvaluator(const XprType &op, const Device &device)
Definition: TensorConvolution.h:323
Eigen::internal::promote_storage_type
Definition: XprHelper.h:518
m
Matrix3f m
Definition: AngleAxis_mimic_euler.cpp:1
Eigen::Triplet< double >
Eigen::TensorConvolutionOp::StorageKind
Eigen::internal::traits< TensorConvolutionOp >::StorageKind StorageKind
Definition: TensorConvolution.h:267
Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::_LhsNested
remove_reference< LhsNested >::type _LhsNested
Definition: TensorConvolution.h:230
Eigen::internal::eval< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType >, Eigen::Dense >::type
const typedef TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > & type
Definition: TensorConvolution.h:245
Eigen::TensorEvaluator::m_device
const Device EIGEN_DEVICE_REF m_device
Definition: TensorEvaluator.h:192
Eigen::TensorConvolutionOp::inputExpression
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE internal::remove_all< typename InputXprType::Nested >::type & inputExpression() const
Definition: TensorConvolution.h:279
Eigen::TensorConvolutionOp::TensorConvolutionOp
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType &input, const KernelXprType &kernel, const Indices &dims)
Definition: TensorConvolution.h:270
Eigen::StorageMemory
Definition: TensorForwardDeclarations.h:37
Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::LhsNested
InputXprType::Nested LhsNested
Definition: TensorConvolution.h:228
Eigen::TensorBase
The tensor base class.
Definition: TensorBase.h:973
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::convolvePacket
EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet &accum) const
Definition: TensorConvolution.h:513
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::Storage
StorageMemory< Scalar, Device > Storage
Definition: TensorConvolution.h:306
ordering
static enum @1096 ordering
Eigen::internal::array_size
Definition: Meta.h:445
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::TensorBlock
internal::TensorBlockNotImplemented TensorBlock
Definition: TensorConvolution.h:320
XprType
CwiseBinaryOp< internal::scalar_sum_op< double, double >, const CpyMatrixXd, const CpyMatrixXd > XprType
Definition: nestbyvalue.cpp:15
Eigen::TensorEvaluator::BlockAccess
@ BlockAccess
Definition: TensorEvaluator.h:48
Eigen::internal::IndexMapper::m_gpuInputStrides
array< Index, NumDims > m_gpuInputStrides
Definition: TensorConvolution.h:212
offset
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Definition: gnuplot_common_settings.hh:64
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::CoeffReturnType
XprType::CoeffReturnType CoeffReturnType
Definition: TensorConvolution.h:303
Eigen::TensorConvolutionOp::Scalar
Eigen::internal::traits< TensorConvolutionOp >::Scalar Scalar
Definition: TensorConvolution.h:262
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::coeff
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
Definition: TensorConvolution.h:409
Eigen::internal::nested< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType >, 1, typename eval< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::type >::type
TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > type
Definition: TensorConvolution.h:251
array
Definition: numpy.h:821
EIGEN_HIP_LAUNCH_BOUNDS_1024
#define EIGEN_HIP_LAUNCH_BOUNDS_1024
Definition: Macros.h:510
Eigen::TensorEvaluator::costPerCoeff
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const
Definition: TensorEvaluator.h:147
Eigen::internal::IndexMapper::mapGpuOutputKernelToTensorOutputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const
Definition: TensorConvolution.h:171
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_device
const Device EIGEN_DEVICE_REF m_device
Definition: TensorConvolution.h:557
threadIdx
dim3 threadIdx
Definition: gpu_common.h:19
Eigen::internal::traits
Definition: ForwardDeclarations.h:17
Eigen::TensorEvaluator::cleanup
EIGEN_STRONG_INLINE void cleanup()
Definition: TensorEvaluator.h:92
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::Scalar
XprType::Scalar Scalar
Definition: TensorConvolution.h:302
EIGEN_STATIC_ASSERT
#define EIGEN_STATIC_ASSERT(CONDITION, MSG)
Definition: StaticAssert.h:127
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_inputImpl
TensorEvaluator< InputArgType, Device > m_inputImpl
Definition: TensorConvolution.h:550
Eigen::internal::conditional
Definition: Meta.h:109
p
float * p
Definition: Tutorial_Map_using.cpp:9
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::XprType
TensorConvolutionOp< Indices, InputArgType, KernelArgType > XprType
Definition: TensorConvolution.h:295
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_indexStride
array< Index, NumKernelDims > m_indexStride
Definition: TensorConvolution.h:548
Eigen::TensorEvaluator::coeff
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
Definition: TensorEvaluator.h:94
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::PacketReturnType
PacketType< CoeffReturnType, Device >::type PacketReturnType
Definition: TensorConvolution.h:304
EIGEN_DEVICE_REF
#define EIGEN_DEVICE_REF
Definition: TensorMacros.h:50
Eigen::TensorConvolutionOp::kernelExpression
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE internal::remove_all< typename KernelXprType::Nested >::type & kernelExpression() const
Definition: TensorConvolution.h:283
Eigen::internal::IndexMapper::mapGpuOutputKernelToTensorOutputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const
Definition: TensorConvolution.h:185
Eigen::TensorEvaluator
The tensor evaluator: computes the values of a tensor expression on a given device.
Definition: TensorEvaluator.h:28
ceil
const EIGEN_DEVICE_FUNC CeilReturnType ceil() const
Definition: ArrayCwiseUnaryOps.h:495
Eigen::internal::IndexMapper::NumDims
static const int NumDims
Definition: TensorConvolution.h:209
Eigen::TensorEvaluator::Scalar
Derived::Scalar Scalar
Definition: TensorEvaluator.h:31
internal
Definition: BandTriangularSolver.h:13
Eigen::TensorEvaluator::CoeffReturnType
Derived::Scalar CoeffReturnType
Definition: TensorEvaluator.h:32
NULL
#define NULL
Definition: ccolamd.c:609
Eigen::ColMajor
@ ColMajor
Definition: Constants.h:319
Eigen::TensorConvolutionOp::Nested
Eigen::internal::nested< TensorConvolutionOp >::type Nested
Definition: TensorConvolution.h:266
Indices
std::vector< size_t > Indices
Definition: testPartialPriorFactor.cpp:37
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::convolve
EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType &accum) const
Definition: TensorConvolution.h:500
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_kernelArg
KernelArgType m_kernelArg
Definition: TensorConvolution.h:554
Eigen::TensorEvaluator::IsAligned
@ IsAligned
Definition: TensorEvaluator.h:46
Eigen::internal::promote_index_type
Definition: XprHelper.h:120
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::data
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const
Definition: TensorConvolution.h:478
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_kernelImpl
TensorEvaluator< KernelArgType, Device > m_kernelImpl
Definition: TensorConvolution.h:551
Eigen::internal::eval
Definition: XprHelper.h:332
Eigen::TensorEvaluator::PacketAccess
@ PacketAccess
Definition: TensorEvaluator.h:47
Eigen::TensorEvaluator::PacketReturnType
PacketType< CoeffReturnType, Device >::type PacketReturnType
Definition: TensorEvaluator.h:33
Eigen::TensorConvolutionOp::RealScalar
Eigen::NumTraits< Scalar >::Real RealScalar
Definition: TensorConvolution.h:263
Eigen::internal::IndexMapper
Definition: TensorConvolution.h:25
Eigen::TensorOpCost
Definition: TensorCostModel.h:25
Eigen::TensorConvolutionOp::m_kernel_xpr
KernelXprType::Nested m_kernel_xpr
Definition: TensorConvolution.h:287
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_kernel
const Scalar * m_kernel
Definition: TensorConvolution.h:555
test_callbacks.value
value
Definition: test_callbacks.py:160
eval
internal::nested_eval< T, 1 >::type eval(const T &xpr)
Definition: sparse_permutations.cpp:38
i
int i
Definition: BiCGSTAB_step_by_step.cpp:9
Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::_RhsNested
remove_reference< RhsNested >::type _RhsNested
Definition: TensorConvolution.h:231
Eigen::TensorEvaluator::XprType
Derived XprType
Definition: TensorEvaluator.h:35
Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_dimensions
Dimensions m_dimensions
Definition: TensorConvolution.h:552
Eigen::TensorEvalToOp
Definition: TensorEvalTo.h:68
Eigen::TensorEvaluator::Dimensions
Derived::Dimensions Dimensions
Definition: TensorEvaluator.h:34
Scalar
SCALAR Scalar
Definition: bench_gemm.cpp:46
Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::RhsNested
KernelXprType::Nested RhsNested
Definition: TensorConvolution.h:229
Eigen::TensorConvolutionOp::indices
EIGEN_DEVICE_FUNC const EIGEN_STRONG_INLINE Indices & indices() const
Definition: TensorConvolution.h:274
Eigen::Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
Eigen::Dense
Definition: Constants.h:507
Eigen::internal::TensorExecutor::run
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Expression &expr, const Device &device=Device())
Definition: TensorExecutor.h:96
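(Note: a hedged sketch of how a convolution expression reaches TensorExecutor::run when assigned through a device; the thread-pool setup and tensor sizes below are illustrative assumptions, not taken from this file.)

// Sketch only: evaluating a convolution expression on a thread-pool device.
// Assumes EIGEN_USE_THREADS and the unsupported CXX11 Tensor module.
#define EIGEN_USE_THREADS
#include <cstddef>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);

  Eigen::Tensor<float, 2> input(128, 128), kernel(5, 5), result(124, 124);
  input.setRandom();
  kernel.setRandom();

  Eigen::array<std::ptrdiff_t, 2> dims({0, 1});
  // Assigning through .device(...) dispatches the assignment to
  // internal::TensorExecutor::run for the chosen device.
  result.device(device) = input.convolve(kernel, dims);
  return 0;
}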

