#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

namespace Eigen {

// ...

namespace internal {

template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {

    array<Index, NumDims> dimensions = input_dims;
    // Shrink each convolved dimension to its "valid" output size.
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }
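
    // Illustrative sketch (not part of the original header, guarded out with #if 0):
    // the loop above applies the "valid" convolution size rule,
    // result_dim = input_dim - kernel_dim + 1, along every convolved dimension.
#if 0
#include <cassert>
int main() {
  const long input_dims[2]  = {20, 30};
  const long kernel_dims[2] = {3, 4};
  long result_dims[2];
  for (int i = 0; i < 2; ++i) {
    // The kernel must fit entirely inside the input, so kernel_dim - 1
    // border positions are dropped from each convolved dimension.
    result_dims[i] = input_dims[i] - kernel_dims[i] + 1;
  }
  assert(result_dims[0] == 18 && result_dims[1] == 27);
  return 0;
}
#endif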

    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
        outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }
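
    // Illustrative sketch (not part of the original header, guarded out with #if 0):
    // the two branches above build the usual column-major / row-major stride tables.
    // For a 4x5x6 shape the column-major strides are {1, 4, 20} and the row-major
    // strides are {30, 6, 1}.
#if 0
#include <cassert>
int main() {
  const long dims[3] = {4, 5, 6};
  long col_major[3], row_major[3];
  col_major[0] = 1;  // first dimension is contiguous in column-major order
  for (int i = 1; i < 3; ++i) col_major[i] = col_major[i - 1] * dims[i - 1];
  row_major[2] = 1;  // last dimension is contiguous in row-major order
  for (int i = 1; i >= 0; --i) row_major[i] = row_major[i + 1] * dims[i + 1];
  assert(col_major[2] == 20 && row_major[0] == 30);
  return 0;
}
#endif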

    array<Index, NumDims> gpuInputDimensions;
    array<Index, NumDims> gpuOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      gpuInputDimensions[index] = input_dims[indices[i]];
      gpuOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        gpuInputDimensions[written] = input_dims[i];
        gpuOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
          m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (static_cast<size_t>(i + 1) < offset) {
          m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
          m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    }
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0 : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0 : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0 : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0 : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0 : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0 : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }

 private:
  static const int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_gpuInputStrides;
  array<Index, NumDims> m_gpuOutputStrides;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs
  // are different.
  // ...
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,
              typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

}  // end namespace internal

template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
  public:
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
    typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                    typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
    // ...

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
        : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename InputXprType::Nested>::type& inputExpression() const { return m_input_xpr; }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename KernelXprType::Nested>::type& kernelExpression() const { return m_kernel_xpr; }

  protected:
    typename InputXprType::Nested m_input_xpr;
    typename KernelXprType::Nested m_kernel_xpr;
    const Indices m_indices;
};
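
// Illustrative usage sketch (not part of the original header, guarded out with
// #if 0): a TensorConvolutionOp is normally created through the public
// TensorBase::convolve() entry point rather than constructed directly. The
// include path and tensor sizes below are assumptions for the example.
#if 0
#include <cassert>
#include <unsupported/Eigen/CXX11/Tensor>
int main() {
  Eigen::Tensor<float, 3> input(20, 30, 50);
  Eigen::Tensor<float, 2> kernel(3, 4);
  input.setRandom();
  kernel.setRandom();
  // Convolve the first two dimensions of the input with the 2D kernel.
  Eigen::array<ptrdiff_t, 2> dims({0, 1});
  Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
  // "Valid" convolution: (20-3+1) x (30-4+1) x 50.
  assert(output.dimension(0) == 18 && output.dimension(1) == 27 && output.dimension(2) == 50);
  return 0;
}
#endif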

template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

  enum {
    IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
    PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }

  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims-1, result);
    return result;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {
    Index indices[2] = {index, index+PacketSize-1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1]-startInputs[0] == PacketSize-1) {
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
      return result;
    } else {
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
      for (int i = 1; i < PacketSize-1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
      }
      data[PacketSize-1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
      return internal::pload<PacketReturnType>(data);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add instructions here.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
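
  // Illustrative sketch (not part of the original header, guarded out with #if 0):
  // per output coefficient the scalar path performs one multiply and one add for
  // every kernel entry, which is what the cost expression above charges, plus a
  // one-off cost for decomposing the output index in firstInput(). The relative
  // per-operation costs below are assumed values for illustration only.
#if 0
#include <cstdio>
int main() {
  const double kernel_size = 3 * 4;                        // e.g. a 3x4 kernel
  const double add_cost = 1, mul_cost = 1, div_cost = 8;   // assumed relative costs
  const int num_dims = 3;
  const double first_index_cost = num_dims * (2 * add_cost + 2 * mul_cost + div_cost);
  const double per_coeff = first_index_cost + kernel_size * (add_cost + mul_cost);
  std::printf("~%.0f scalar ops per output coefficient\n", per_coeff);
  return 0;
}
#endif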

 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }
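
  // Illustrative sketch (not part of the original header, guarded out with #if 0):
  // firstInput() converts a linear output index into the linear input index of the
  // first element of the convolution window by peeling off one output coordinate per
  // dimension and re-applying the input stride. Column-major 2D example below.
#if 0
#include <cassert>
int main() {
  // Output 18x27 produced from a 20x30 input with a 3x4 kernel.
  const long output_stride[2] = {1, 18};
  const long input_stride[2]  = {1, 20};
  long index = 5 + 7 * 18;                // linear index of output coordinate (5, 7)
  long start_input = 0;
  for (int i = 1; i > 0; --i) {
    const long idx = index / output_stride[i];
    start_input += idx * input_stride[i];
    index -= idx * output_stride[i];
  }
  start_input += index;                   // remaining innermost coordinate
  assert(start_input == 5 + 7 * 20);      // input coordinate (5, 7)
  return 0;
}
#endif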

  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex-1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }
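
  // Illustrative sketch (not part of the original header, guarded out with #if 0):
  // for a 2D kernel the recursion above unrolls into two nested loops over the
  // kernel extents, stepping through the input with the index strides and through
  // the kernel with the kernel strides. Hypothetical reference implementation:
#if 0
float convolve2d_reference(const float* input, const float* kernel,
                           long first_index,
                           const long kernel_dims[2],
                           const long index_stride[2],
                           const long kernel_stride[2]) {
  float accum = 0.0f;
  for (long j1 = 0; j1 < kernel_dims[1]; ++j1) {      // outer kernel dimension
    for (long j0 = 0; j0 < kernel_dims[0]; ++j0) {    // inner kernel dimension
      const long in  = first_index + j1 * index_stride[1] + j0 * index_stride[0];
      const long ker = j1 * kernel_stride[1] + j0 * kernel_stride[0];
      accum += input[in] * kernel[ker];
    }
  }
  return accum;
}
#endif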

  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex-1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
                                        internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to, i.e. unless it is
    // an expression that still needs to be evaluated.
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  // ... member declarations (stride arrays, input/kernel evaluators, kernel pointer, device) ...
};
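
// Illustrative note (not part of the original header, guarded out with #if 0):
// preloadKernel() only copies the kernel when the kernel evaluator has no backing
// buffer, i.e. when the kernel is itself an unevaluated expression. Materializing
// the kernel first lets the evaluator reuse its raw pointer. Hypothetical sketch:
#if 0
#include <unsupported/Eigen/CXX11/Tensor>
int main() {
  Eigen::Tensor<float, 3> input(20, 30, 50);
  Eigen::Tensor<float, 1> k1(3), k2(3);
  input.setRandom(); k1.setRandom(); k2.setRandom();
  Eigen::array<ptrdiff_t, 1> dims({0});
  // The kernel expression (k1 + k2) has no backing buffer, so the evaluator
  // materializes it into a temporary allocation before convolving.
  Eigen::Tensor<float, 3> a = input.convolve(k1 + k2, dims);
  // Evaluating the kernel up front avoids that extra allocation.
  Eigen::Tensor<float, 1> k = k1 + k2;
  Eigen::Tensor<float, 3> b = input.convolve(k, dims);
  return 0;
}
#endif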

// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC int operator() (const int /*kernelSize*/) const {
    return StaticKernelSize;
  }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC int operator() (const int kernelSize) const {
    return kernelSize;
  }
};

template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load the input tile (plus the kernel halo) into shared memory.
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution from shared memory.
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
      buffer[tensor_index] = result;
    }

    __syncthreads();
  }
};
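
// Illustrative sketch (not part of the original header, guarded out with #if 0):
// each block loads a tile of maxX outputs plus the kernel "halo" into shared
// memory, i.e. num_x_input = num_x_output + kernelSize - 1 floats per plane handled
// by the block. The sizes below are assumed example values.
#if 0
#include <cassert>
int main() {
  const int maxX = 128, kernelSize = 7, planes_per_block = 4;
  const int num_x_input = maxX + kernelSize - 1;                   // tile plus halo
  const unsigned long shared_mem = planes_per_block * num_x_input * sizeof(float);
  assert(num_x_input == 134 && shared_mem == 2144);
  return 0;
}
#endif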

template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load the 2D input tile (plus the kernel halo) into shared memory.
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Compute the convolution from shared memory.
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
};

template <typename InputEvaluator, typename Index, typename InputDims>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  // Load inputs to shared memory.
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y, k + first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Convolution.
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y, k + first_z);
          buffer[tensor_index] = result;
        }
      }
    }

    __syncthreads();
  }
};

template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
  // ...
  enum {
    // ...
    PacketAccess = false,
    // ...
    PreferBlockAccess = false,
    // ...
  };

  TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL),
        m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    // ...
    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  // ...
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to, i.e. unless it is
    // an expression that still needs to be evaluated.
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }
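
  // Illustrative sketch (not part of the original header, guarded out with #if 0):
  // this helper is the usual integer round-up division used below to size the
  // launch grid. The standalone copy below only exists for the example.
#if 0
#include <cassert>
static unsigned int ceil_div(unsigned int num, unsigned int denom) {
  const unsigned int rounded_toward_zero = num / denom;
  return num > rounded_toward_zero * denom ? rounded_toward_zero + 1 : rounded_toward_zero;
}
int main() {
  assert(ceil_div(10, 3) == 4);   // 10 elements in blocks of 3 -> 4 blocks
  assert(ceil_div(9, 3) == 3);
  return 0;
}
#endif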

  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse.
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        } else {
          // Read as much as possible along the innermost dimension, that is the plane.
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch(kernel_size) {
          case 4: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }

      case 2: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX*numY);

        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to the warp size.
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }

      case 3: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX*numY*numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add instructions here.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }

 private:
  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const GpuDevice& m_device;
};

#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H