#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

// Helper that maps between the natural tensor indexing and the flattened
// per-plane indexing used by the GPU convolution kernels further below.
template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims,
              const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {
    array<Index, NumDims> dimensions = input_dims;
    // A "valid" convolution shrinks every convolved dimension by
    // kernel_dim - 1 coefficients.
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }
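    // Worked example (added comment, not from the original source): an input
    // dimension of size 10 convolved with a kernel of size 3 produces
    // 10 - 3 + 1 = 8 output coefficients, one per position where the kernel
    // fits entirely inside the input.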
    // Precompute input and output strides in the tensor's storage order.
    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
        outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }
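    // Stride sketch (added comment, not from the original source): for
    // column-major input dimensions {5, 4, 3} the loop above yields
    // inputStrides = {1, 5, 20}, so the flat offset of coordinate (i, j, k)
    // is i + 5 * j + 20 * k.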
    // Reorder the dimensions so that the NumKernelDims convolved dimensions
    // come first (column-major) or last (row-major) in the flattened layout
    // used by the GPU kernels.
    array<Index, NumDims> cudaInputDimensions;
    array<Index, NumDims> cudaOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      cudaInputDimensions[index] = input_dims[indices[i]];
      cudaOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        cudaInputDimensions[written] = input_dims[i];
        cudaOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    // Strides of the reordered (GPU-side) input and output.
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_cudaInputStrides[i] = m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
          m_cudaOutputStrides[i] = m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (i + 1 < offset) {
          m_cudaInputStrides[i] = m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
          m_cudaOutputStrides[i] = m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    }
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }
  // The mapCudaInputKernelToTensorInputOffset() and
  // mapCudaOutputKernelToTensorOutputOffset() overloads (one-, two- and
  // three-coordinate variants) all start from the same layout-dependent
  // offset of the first convolved dimension and scale the coordinates by the
  // corresponding strides. The one-coordinate input variant is:
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }
  // ... (the (i, j) and (i, j, k) variants and the output-side counterparts
  //      follow the same pattern with m_outputStrides)

 private:
  static const int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_cudaInputStrides;
  array<Index, NumDims> m_cudaOutputStrides;
};
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs
  // are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar,
                                        typename KernelXprType::Scalar>::ret Scalar;
  // ... (StorageKind, Index, Nested and the remaining traits members)
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,
              typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

// Expression node representing the valid (non-padded) convolution of an input
// tensor with a kernel tensor along the dimensions listed in `Indices`.
template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
  public:
    // ... (Scalar, CoeffReturnType, StorageKind, Index and Nested typedefs)

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(
        const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
        : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Indices& indices() const { return m_indices; }

    // ... (inputExpression() / kernelExpression() accessors and the stored
    //      m_input_xpr, m_kernel_xpr and m_indices members)
};
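// Usage sketch (added illustration, not part of the original header), assuming
// the public Tensor API (Eigen::Tensor and TensorBase::convolve): a 2-D valid
// convolution of a rank-3 tensor over its last two dimensions.
//
//   Eigen::Tensor<float, 3> input(20, 30, 40);
//   Eigen::Tensor<float, 2> kernel(3, 3);
//   input.setRandom();
//   kernel.setRandom();
//   Eigen::array<ptrdiff_t, 2> dims({1, 2});
//   Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
//   // output dimensions: 20 x (30 - 3 + 1) x (40 - 3 + 1) = 20 x 28 x 38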
// Generic evaluator for TensorConvolutionOp, used for CPU-style devices.
template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  // ... (typedefs, NumDims/NumKernelDims constants and the evaluator enum)

  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device)
  {
    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    // Strides of the (unconvolved) input, in its storage order.
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    // Output dimensions, kernel strides and the per-kernel-dimension input
    // strides used by convolve()/convolvePacket().
    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }
  // ... (dimensions() accessor)

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims - 1, result);
    return result;
  }
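  // Added note (not from the original source): for a single convolved
  // dimension the call above reduces to the dot product
  //
  //   result = sum_j m_inputImpl.coeff(firstInput(index) + j * m_indexStride[0])
  //                  * m_kernel[j * m_kernelStride[0]]
  //
  // i.e. the kernel applied to the input window starting at firstInput(index).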
  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
  {
    Index indices[2] = {index, index + PacketSize - 1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1] - startInputs[0] == PacketSize - 1) {
      // The whole packet maps to a contiguous range of input coefficients:
      // evaluate it in a single vectorized pass.
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
      return result;
    } else {
      // Otherwise fall back to evaluating each coefficient separately.
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
      for (int i = 1; i < PacketSize - 1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
      }
      data[PacketSize - 1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
      return internal::pload<PacketReturnType>(data);
    }
  }
  EIGEN_DEVICE_FUNC TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add instructions here.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }
  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel,
                                  int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        // Recurse into the next (inner) convolved dimension.
        convolve(input, kernel, DimIndex - 1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }
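  // Added note (not from the original source): with a 2-D kernel the recursion
  // above unrolls into two nested loops,
  //
  //   for (j1 = 0; j1 < kernel_dims[1]; ++j1)
  //     for (j0 = 0; j0 < kernel_dims[0]; ++j0)
  //       accum += m_inputImpl.coeff(firstIndex + j1 * m_indexStride[1] + j0 * m_indexStride[0])
  //                * m_kernel[j1 * m_kernelStride[1] + j0 * m_kernelStride[0]];
  //
  // with one loop level per convolved dimension.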
  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel,
                                        int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex - 1, accum);
      } else {
        // Broadcast the kernel coefficient and fuse the multiply-add.
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
                                        internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }
  EIGEN_DEVICE_FUNC void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      // ... (run the evaluator to materialize the kernel into `local`)
      m_kernel = local;
      m_local_kernel = true;
    }
  }
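  // Design note (added comment, not from the original source): keeping a raw
  // m_kernel pointer lets the convolve()/convolvePacket() inner loops index
  // the kernel directly instead of going through the kernel evaluator for
  // every tap, at the cost of one temporary allocation when the kernel is a
  // non-materialized expression.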
  // ... (remaining private members: strides, dimensions, the input and kernel
  //      evaluators, m_kernel, m_local_kernel and the device)
};

// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

// Returns the kernel size known at compile time when available, and the
// runtime value otherwise.
template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC int operator()(const int /*kernelSize*/) const { return StaticKernelSize; }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC int operator()(const int kernelSize) const { return kernelSize; }
};
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
  extern __shared__ float s[];

  // Range of output coordinates handled by this block along the convolved
  // dimension, and the number of inputs to stage in shared memory.
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load the inputs for this plane into shared memory.
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution for this plane.
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i + first_x);
      buffer[tensor_index] = result;
    }

    __syncthreads();
  }
}
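// Added note (not from the original source): each plane processed by a block
// stages num_x_input = maxX + kernelSize - 1 floats in shared memory. For a
// 128-wide output tile and a kernel of size 7 that is 134 inputs, each of
// which is reused up to 7 times from shared memory.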
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
    // Load the 2-D input tile for this plane into shared memory.
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x, j + first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Convolve the tile.
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
}
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {
    // Load the 3-D input tile into shared memory.
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x, j + first_y, k + first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Convolve the tile.
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i + first_x, j + first_y, k + first_z);
          buffer[tensor_index] = result;
        }
      }
    }
    __syncthreads();
  }
}
// GPU-specific evaluator: evaluates the whole convolution into a temporary
// buffer using the CUDA kernels above, then serves coefficients from it.
template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  // ... (typedefs and NumDims/NumKernelDims constants)

  enum {
    // ...
    PacketAccess = false,
    // ...
  };

  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device)
  {
    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename InputArgType::Scalar Scalar;
  // ... (CoeffReturnType, PacketReturnType and PacketSize)

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Same logic as the generic evaluator: only materialize the kernel when it
    // is an expression rather than plain data.
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      // ... (run the evaluator to materialize the kernel into `local`)
      m_kernel = local;
      m_local_kernel = true;
    }
  }
  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }
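  // Example (added comment): ceil(10, 3) == 4 since 10 / 3 rounds down to 3
  // and 3 * 3 < 10, while ceil(9, 3) == 3 exactly.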
  void executeEval(Scalar* data) const {
    // Choose a launch configuration based on the device limits, then dispatch
    // to the 1-D, 2-D or 3-D convolution kernel.
    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // The convolved dimension has unit stride: maximize the tile width
          // along it.
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        } else {
          // The convolved dimension is not contiguous: favour processing more
          // planes per block instead.
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
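        // Worked example (added comment, not from the original source): with a
        // kernel of size 4, maxX = 512 and block_size.y = 2, the launch uses
        // shared_mem = 2 * (512 + 4 - 1) * sizeof(float) = 4120 bytes per block.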
        // ... (build the 1-D `indices` and `kernel_dims` arrays from
        //      m_indices[0] and the kernel dimensions)
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size) {
          case 4: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }
      case 2: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX * numY);

        // Scale the x and y tile sizes so that the shared-memory tile fits.
        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) /
                                           (sizeof(Scalar) * kernel_size_y * kernel_size_x));
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));

        // ... (build the 2-D `indices` array from m_indices[idxX] and m_indices[idxY])
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
          }
        }
        break;
      }
      case 3: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX * numY * numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);
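        // Worked example (added comment, not from the original source): with
        // 8x8x8 output tiles and a 3x3x3 kernel, the shared-memory tile is
        // (8+2) * (8+2) * (8+2) * sizeof(float) = 4000 bytes per block.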
        // ... (build the 3-D `indices` array from m_indices[idxX..idxZ])
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    // The result was fully evaluated into m_buf by executeEval().
    return m_buf[index];
  }
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
  {
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add instructions here.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }
 private:
  // Copy assignment is intentionally declared but not defined.
  TensorEvaluator& operator = (const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;

  const GpuDevice& m_device;
};

#endif  // EIGEN_USE_GPU && __CUDACC__

#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H