#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

namespace Eigen {

namespace internal {

template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims,
              const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {
    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }
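    // e.g. convolving a dimension of size 30 with a kernel of size 3 leaves
    // 30 - 3 + 1 = 28 valid output coefficients along that dimension.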
    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
        outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }
    array<Index, NumDims> cudaInputDimensions;
    array<Index, NumDims> cudaOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      cudaInputDimensions[index] = input_dims[indices[i]];
      cudaOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        cudaInputDimensions[written] = input_dims[i];
        cudaOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }
    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_cudaInputStrides[i] = m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
          m_cudaOutputStrides[i] = m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (i + 1 < offset) {
          m_cudaInputStrides[i] = m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
          m_cudaOutputStrides[i] = m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    }
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }

 private:
  static const int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_cudaInputStrides;
  array<Index, NumDims> m_cudaOutputStrides;
};
template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs
  // are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar,
                                        typename KernelXprType::Scalar>::ret Scalar;
};

template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,
              typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

}  // end namespace internal

template <typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
  public:
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                    typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(
        const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
        : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
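    // Illustrative usage via the public Tensor API (the sizes below are
    // arbitrary examples):
    //
    //   Eigen::Tensor<float, 3> input(20, 30, 50);
    //   Eigen::Tensor<float, 2> kernel(3, 3);
    //   Eigen::array<ptrdiff_t, 2> dims({1, 2});
    //   // Convolve over dimensions 1 and 2: each convolved dimension shrinks
    //   // by kernel_size - 1, so the result is 20 x 28 x 48.
    //   Eigen::Tensor<float, 3> result = input.convolve(kernel, dims);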
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; }

    /** \returns the nested expressions */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const typename internal::remove_all<typename InputXprType::Nested>::type&
    inputExpression() const { return m_input_xpr; }

    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const typename internal::remove_all<typename KernelXprType::Nested>::type&
    kernelExpression() const { return m_kernel_xpr; }

  protected:
    typename InputXprType::Nested m_input_xpr;
    typename KernelXprType::Nested m_kernel_xpr;
    const Indices m_indices;
};

template <typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }

  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
  {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims - 1, result);
    return result;
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
  {
    Index indices[2] = {index, index + PacketSize - 1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1] - startInputs[0] == PacketSize - 1) {
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
      return result;
    } else {
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
      for (int i = 1; i < PacketSize - 1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
      }
      data[PacketSize - 1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
      return internal::pload<PacketReturnType>(data);
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 private:
  EIGEN_DEVICE_FUNC Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }
  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex - 1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }
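  // For a 2D kernel the recursion above effectively expands into two nested
  // loops, so the value accumulated for one output coefficient is
  //
  //   for (j1 = 0; j1 < kernel_dim_1; ++j1)
  //     for (j0 = 0; j0 < kernel_dim_0; ++j0)
  //       accum += input(firstIndex + j1 * m_indexStride[1] + j0 * m_indexStride[0])
  //                * m_kernel[j1 * m_kernelStride[1] + j0 * m_kernelStride[0]];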
  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex - 1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
                                        internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }
  EIGEN_DEVICE_FUNC void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      internal::TensorExecutor<const EvalTo, Device>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
    return StaticKernelSize;
  }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
    return kernelSize;
  }
};
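// Shared-memory tiling strategy used by the kernels below: each CUDA block
// stages a strip of the input (the output extent plus kernel_size - 1
// coefficients per convolved dimension) into shared memory, synchronizes, and
// then each thread computes output coefficients as dot products against the
// kernel.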
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load the inputs into shared memory.
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution.
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i + first_x);
      buffer[tensor_index] = result;
    }

    __syncthreads();
  }
}
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
    // Load the inputs into shared memory.
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x, j + first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Compute the convolution.
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
}
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
  extern __shared__ float s[];

  // Load the inputs into shared memory.
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x, j + first_y, k + first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Compute the convolution.
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i + first_x, j + first_y, k + first_z);
          buffer[tensor_index] = result;
        }
      }
    }

    __syncthreads();
  }
}
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  enum {
    PacketAccess = false,
    Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
  };

  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL),
        m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      internal::TensorExecutor<const EvalTo, GpuDevice>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }
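  // ceil() is integer division rounded up, e.g. ceil(10, 4) == 3.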
  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse of the loaded inputs.
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        } else {
          // Read as much as possible along the innermost dimension, i.e. the plane.
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size) {
          case 4: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }
      case 2: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX * numY);

        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to the warp size.
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }
      case 3: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX * numY * numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }
      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // Same cost model as the CPU evaluator above.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }

 private:
  // Prevent assignment.
  TensorEvaluator& operator = (const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;

  const GpuDevice& m_device;
};

#endif  // defined(EIGEN_USE_GPU) && defined(__CUDACC__)

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H