#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
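
// The convolutions implemented in this file are "valid" convolutions: each
// convolved output dimension has size
//   result_dim = input_dim - kernel_dim + 1
// e.g. an input dimension of size 10 convolved with a kernel of size 3
// produces an output dimension of size 8.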
template <typename Index, typename InputDims, int NumKernelDims, int Layout>
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    for (int i = 0; i < NumKernelDims; ++i) {
      gpuInputDimensions[index] = input_dims[indices[i]];
    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
      gpuInputDimensions[written] = input_dims[i];
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        if (i > NumKernelDims) {
        if (static_cast<size_t>(i + 1) < offset) {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
      std::ptrdiff_t limit = 0;
        limit = NumDims - NumKernelDims - 1;
      for (int d = 0; d < limit; ++d) {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
      std::ptrdiff_t limit = 0;
        limit = NumDims - NumKernelDims - 1;
      for (int d = 0; d < limit; ++d) {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
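
// The IndexMapper above translates between the plane/kernel coordinates used
// by the GPU convolution kernels and flat tensor offsets, for both ColMajor
// and RowMajor layouts (see mapGpuInputPlaneToTensorInputOffset,
// mapGpuInputKernelToTensorInputOffset and their output counterparts).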
template<typename Dimensions, typename InputXprType, typename KernelXprType>

template<typename Dimensions, typename InputXprType, typename KernelXprType>

template<typename Dimensions, typename InputXprType, typename KernelXprType>
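
// Illustrative usage of the convolution expression defined in this file (a
// minimal sketch, not part of the header; the tensor sizes are made up):
//   Eigen::Tensor<float, 4> input(3, 3, 7, 11);
//   Eigen::Tensor<float, 2> kernel(2, 2);
//   Eigen::array<ptrdiff_t, 2> dims({1, 2});  // convolve along dimensions 1 and 2
//   Eigen::Tensor<float, 4> output(3, 2, 6, 11);
//   output = input.convolve(kernel, dims);    // builds a TensorConvolutionOp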
template<typename Indices, typename InputXprType, typename KernelXprType>

template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
          m_kernelStride[0] = 1;
        m_indexStride[i] = m_inputStride[index];
      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
          m_kernelStride[NumKernelDims - 1] = 1;
        m_indexStride[i] = m_inputStride[index];
      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    convolve(firstInput(index), 0, NumKernelDims - 1, result);
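    // coeff(index): firstInput(index) locates the first input element that
    // contributes to this output coefficient; the recursive convolve() then
    // accumulates one product per kernel coefficient.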
  template<int LoadMode>
    Index startInputs[2] = {0, 0};
      for (int i = NumDims - 1; i > 0; --i) {
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      for (int i = 0; i < NumDims - 1; ++i) {
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
    if (startInputs[1] - startInputs[0] == PacketSize - 1) {
      convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
      convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
        convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
      return internal::pload<PacketReturnType>(data);
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
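    // The cost model above charges each output coefficient once for locating
    // its first input (firstIndex_compute_cost), plus kernel_size times the
    // cost of loading an input value, loading a kernel value and performing
    // one add and one multiply.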
    Index startInput = 0;
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
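    // firstInput(index) above maps an output coefficient index to the offset
    // of the first contributing input element. convolve() below recurses from
    // DimIndex = NumKernelDims - 1 down to 0, walking the kernel extent in
    // each convolved dimension and accumulating input * kernel products.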
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
        convolve(input, kernel, DimIndex - 1, accum);
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];

  template <typename Packet>
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
        convolvePacket(input, kernel, DimIndex - 1, accum);
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
                                        internal::pset1<Packet>(m_kernel[kernel]), accum);
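  // Kernel preloading (below): when the kernel expression already exposes a
  // contiguous buffer (m_kernelImpl.data() != NULL) it is used in place;
  // otherwise the kernel is evaluated once into a device-allocated temporary
  // and m_local_kernel is set so that cleanup() releases it.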
    const Scalar* in_place = m_kernelImpl.data();
      m_local_kernel = false;
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvalTo evalToTmp(local, m_kernelArg);
      m_local_kernel = true;
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

template <int StaticKernelSize>
struct GetKernelSize {
    return StaticKernelSize;

struct GetKernelSize<Dynamic> {
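
// GetKernelSize lets the GPU kernels be compiled for a fixed, compile-time
// kernel size while still supporting arbitrary sizes through the Dynamic
// specialization. Illustrative behaviour:
//   GetKernelSize<4>()(/*kernelSize=*/7)       == 4  // static size wins
//   GetKernelSize<Dynamic>()(/*kernelSize=*/7) == 7  // falls back to the runtime size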
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;
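  // Each block handles a tile of up to maxX contiguous outputs along the
  // convolved dimension and stages num_x_output + kernelSize - 1 input values
  // per plane in shared memory, so each input value is fetched from global
  // memory only once per tile.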
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);

    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
      const int kernel_offset = plane_kernel_offset + i;
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;
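  // The 2D kernel tiles both convolved dimensions: each block stages
  // num_x_input * num_y_input input values per plane in shared memory and
  // produces a num_x_output * num_y_output tile of results.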
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;
      const int input_offset = num_x_input * (j + plane_kernel_offset);
        const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
        s[i + input_offset] = eval.coeff(tensor_index);

    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
        const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
template <typename InputEvaluator, typename Index, typename InputDims>
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;
          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y, k + first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] *
                          kernel[l + kernelSizeX * (m + kernelSizeY * n)];
          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y, k + first_z);
template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;

  typedef internal::TensorBlockNotImplemented TensorBlock;

      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL),
        m_local_kernel(false), m_device(device)
    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;

    m_inputImpl.evalSubExprsIfNeeded(NULL);

    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    const Scalar* in_place = m_kernelImpl.data();
      m_local_kernel = false;
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      m_local_kernel = true;
  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    return rounded_toward_zero;
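  // ceil() above is integer ceiling division, used to size the launch grid,
  // e.g. ceil(10, 4) == 3 and ceil(8, 4) == 2.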
    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();
        const int numP = dimensions().TotalSize() / numX;
        const int single_stride_dim =
            : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
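        // Launch configuration above: the x grid dimension tiles the convolved
        // dimension, while the y grid dimension is capped so that roughly
        // numMultiProcessors * blocksPerProcessor blocks are in flight without
        // exceeding the number of plane tiles.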
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size) {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
                              num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel,
                              numP, numX, maxX, 4, data);
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
                              num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel,
                              numP, numX, maxX, 7, data);
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
                              num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel,
                              numP, numX, maxX, kernel_size, data);
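        // Kernel sizes 4 and 7 above get dedicated instantiations whose inner
        // loop trip count is known at compile time (via GetKernelSize); any
        // other size falls back to the Dynamic instantiation.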
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX * numY);

        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
                                  m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
            switch (kernel_size_y) {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
                                  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel,
                                  numP, numX, maxX, numY, maxY, 4, 7, data);
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
                                  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel,
                                  numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
            switch (kernel_size_y) {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
                                  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel,
                                  numP, numX, maxX, numY, maxY, 7, 4, data);
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
                                  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel,
                                  numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>),
                              num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel,
                              numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX * numY * numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

                                  m_kernelImpl.dimensions()[idxY],
                                  m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
                          num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel,
                          numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
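        // Only 1D, 2D and 3D convolutions are supported on the GPU path;
        // higher-rank kernels hit the static assertion above.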
    return m_buf[index];

  template<int LoadMode>
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);

    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  bool m_local_kernel;

#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H