15 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
16 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
29 template <
typename Evaluator,
typename CoeffReturnType,
typename KernelType,
typename Index,
typename InputDims,
30 typename Kernel_accessor,
typename Buffer_accessor,
convolution_type Conv_Dim>
32 template <
typename Evaluator,
typename CoeffReturnType,
typename KernelType,
typename Index,
typename InputDims,
33 typename Kernel_accessor,
typename Buffer_accessor>
36 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
46 Buffer_accessor buffer_acc_,
48 const size_t kernelSize_,
const cl::sycl::range<2> input_range_)
49 : local_acc(local_acc_),
50 device_evaluator(device_evaluator_),
51 kernel_filter(kernel_filter_),
52 buffer_acc(buffer_acc_),
53 indexMapper(indexMapper_),
54 kernelSize(kernelSize_),
55 input_range(input_range_) {}
57 template <
typename BooleanDim2>
59 return (boolean_check[0] && boolean_check[1]);
62 auto buffer_ptr = buffer_acc.get_pointer();
63 auto kernel_ptr = kernel_filter.get_pointer();
65 const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
66 const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
67 const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
70 for (
size_t i = itemID.get_local_id(0);
i < num_input;
i += itemID.get_local_range()[0]) {
71 const size_t local_index =
i + plane_kernel_offset;
72 const size_t tensor_index =
75 local_acc[local_index] =
76 (((
i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
77 ? device_evaluator.coeff(tensor_index)
81 itemID.barrier(cl::sycl::access::fence_space::local_space);
84 const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
85 if (boundary_check(itemID.get_global_id() < input_range)) {
86 CoeffReturnType
result =
static_cast<CoeffReturnType
>(0);
87 const size_t index = plane_kernel_offset + itemID.get_local_id(0);
88 for (
size_t k = 0; k < kernelSize; ++k) {
89 result += (local_acc[k + index] * kernel_ptr[k]);
91 const size_t tensor_index =
94 buffer_ptr[tensor_index] =
result;
99 template <
typename Evaluator,
typename CoeffReturnType,
typename KernelType,
typename Index,
typename InputDims,
100 typename Kernel_accessor,
typename Buffer_accessor>
103 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
113 Buffer_accessor buffer_acc_,
115 const cl::sycl::range<2> kernel_size_,
const cl::sycl::range<3> input_range_)
116 : local_acc(local_acc_),
117 device_evaluator(device_evaluator_),
118 kernel_filter(kernel_filter_),
119 buffer_acc(buffer_acc_),
120 indexMapper(indexMapper_),
121 kernel_size(kernel_size_),
122 input_range(input_range_) {}
123 template <
typename BooleanDim3>
125 return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
129 auto buffer_ptr = buffer_acc.get_pointer();
130 auto kernel_ptr = kernel_filter.get_pointer();
132 const auto num_input = cl::sycl::range<2>{
133 (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
136 const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
138 const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
139 itemID.get_group(1) * itemID.get_local_range()[1]};
142 bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
143 for (
size_t j = itemID.get_local_id(1);
j < num_input[1];
j += itemID.get_local_range()[1]) {
144 const size_t local_input_offset = num_input[0] * (
j + plane_kernel_offset);
145 bool in_range_dim1 = ((
j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
146 for (
size_t i = itemID.get_local_id(0);
i < num_input[0];
i += itemID.get_local_range()[0]) {
147 const size_t local_index =
i + local_input_offset;
149 i + input_offset[0],
j + input_offset[1]);
150 local_acc[local_index] = (((
i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
151 in_range_dim1 && in_range_dim2)
152 ? device_evaluator.coeff(tensor_index)
153 : CoeffReturnType(0);
157 itemID.barrier(cl::sycl::access::fence_space::local_space);
160 const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
161 itemID.get_group(1) * itemID.get_local_range()[1]};
163 if (boundary_check(itemID.get_global_id() < input_range)) {
164 CoeffReturnType
result =
static_cast<CoeffReturnType
>(0);
166 for (
size_t j = 0;
j < kernel_size[1];
j++) {
167 size_t kernel_offset = kernel_size[0] *
j;
169 (num_input[0] * (plane_kernel_offset +
j + itemID.get_local_id(1))) + itemID.get_local_id(0);
170 for (
size_t i = 0;
i < kernel_size[0];
i++) {
171 result += (local_acc[
i + index] * kernel_ptr[
i + kernel_offset]);
174 const size_t tensor_index =
177 itemID.get_local_id(1) + output_offset[1]);
179 buffer_ptr[tensor_index] =
result;
184 template <
typename Evaluator,
typename CoeffReturnType,
typename KernelType,
typename Index,
typename InputDims,
185 typename Kernel_accessor,
typename Buffer_accessor>
188 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
200 Buffer_accessor buffer_acc_,
202 const cl::sycl::range<3> kernel_size_,
const cl::sycl::range<3> input_range_,
204 : local_acc(local_acc_),
205 device_evaluator(device_evaluator_),
206 kernel_filter(kernel_filter_),
207 buffer_acc(buffer_acc_),
208 indexMapper(indexMapper_),
209 kernel_size(kernel_size_),
210 input_range(input_range_),
212 template <
typename BooleanDim3>
214 return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
217 auto buffer_ptr = buffer_acc.get_pointer();
218 auto kernel_ptr = kernel_filter.get_pointer();
219 const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
221 const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
223 const auto output_offset =
224 cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
226 for (
size_t p = 0;
p < numP;
p++) {
229 for (
size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
230 size_t local_index_dim2 = num_input[0] * num_input[1] * k;
231 bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
232 for (
size_t j = itemID.get_local_id(1);
j < num_input[1];
j += itemID.get_local_range()[1]) {
233 bool cond_j_dim = cond_k_dim && (
j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
234 size_t local_index_dim1 = (num_input[0] *
j) + local_index_dim2;
235 for (
size_t i = itemID.get_local_id(0);
i < num_input[0];
i += itemID.get_local_range()[0]) {
236 bool conds = cond_j_dim && (
i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
237 const size_t local_index = local_index_dim1 +
i;
238 const size_t tensor_index =
240 i + input_offset[0],
j + input_offset[1], k + input_offset[2]);
241 local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
245 itemID.barrier(cl::sycl::access::fence_space::local_space);
249 if (boundary_check(itemID.get_global_id() < input_range)) {
250 CoeffReturnType
result =
static_cast<CoeffReturnType
>(0);
251 for (
size_t k = 0; k < kernel_size[2]; k++) {
252 for (
size_t j = 0;
j < kernel_size[1];
j++) {
253 for (
size_t i = 0;
i < kernel_size[0];
i++) {
254 const size_t kernel_index =
i + kernel_size[0] * (
j + kernel_size[1] * k);
255 const size_t local_index =
256 ((
i + itemID.get_local_id(0)) +
257 num_input[0] * ((
j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
259 result += (local_acc[local_index] * kernel_ptr[kernel_index]);
263 const size_t tensor_index =
266 buffer_ptr[tensor_index] =
result;
269 itemID.barrier(cl::sycl::access::fence_space::local_space);
274 template <
typename Indices,
typename InputArgType,
typename KernelArgType>
278 static const int NumDims =
309 : m_inputImpl(op.inputExpression(), device),
310 m_kernelArg(op.kernelExpression()),
311 m_kernelImpl(op.kernelExpression(), device),
315 m_local_kernel(false),
319 YOU_MADE_A_PROGRAMMING_MISTAKE);
325 m_dimensions = m_inputImpl.dimensions();
326 for (
int i = 0;
i < NumKernelDims; ++
i) {
328 const Index input_dim = input_dims[index];
329 const Index kernel_dim = kernel_dims[
i];
330 const Index result_dim = input_dim - kernel_dim + 1;
331 m_dimensions[index] = result_dim;
339 m_inputImpl.evalSubExprsIfNeeded(
NULL);
352 m_inputImpl.cleanup();
357 if (m_local_kernel) {
359 m_local_kernel =
false;
374 m_local_kernel =
false;
376 ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() *
sizeof(
Scalar);
379 EvalTo evalToTmp(
m_device.get(local), m_kernelArg);
383 m_local_kernel =
true;
389 typedef typename InputEvaluator::Dimensions InputDims;
390 switch (NumKernelDims) {
392 const size_t numX =
dimensions()[m_indices[0]];
393 const size_t numP =
dimensions().TotalSize() / numX;
394 const auto input_dim = std::array<size_t, 2>{numX, numP};
395 auto global_range = cl::sycl::range<2>{};
396 auto local_range = cl::sycl::range<2>{};
397 const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
399 m_device.parallel_for_setup(input_dim, global_range, local_range);
400 const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
401 gpu_assert(
static_cast<unsigned long>(local_memory_size) <=
m_device.sharedMemPerBlock());
410 m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
411 m_inputImpl, m_kernel,
data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
412 indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
417 auto kernel_index = std::array<size_t, 2>{
static_cast<int>(
Layout) ==
static_cast<int>(
ColMajor) ? 0 : 1,
419 auto kernel_size = cl::sycl::range<2>{(
size_t)m_kernelImpl.dimensions()[kernel_index[0]],
420 (
size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
421 const size_t numX =
dimensions()[m_indices[kernel_index[0]]];
422 const size_t numY =
dimensions()[m_indices[kernel_index[1]]];
423 const size_t numP =
dimensions().TotalSize() / (numX * numY);
424 auto input_dim = std::array<size_t, 3>{numX, numY, numP};
426 auto global_range = cl::sycl::range<3>{};
427 auto local_range = cl::sycl::range<3>{};
429 m_device.parallel_for_setup(input_dim, global_range, local_range);
431 const size_t local_memory_size =
432 (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
433 gpu_assert(
static_cast<unsigned long>(local_memory_size) <=
m_device.sharedMemPerBlock());
436 {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
441 m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
442 m_inputImpl, m_kernel,
data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
443 indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
448 auto kernel_index = std::array<size_t, 3>{
static_cast<int>(
Layout) ==
static_cast<int>(
ColMajor) ? 0 : 2,
452 auto kernel_size = cl::sycl::range<3>{(
size_t)m_kernelImpl.dimensions()[kernel_index[0]],
453 (
size_t)m_kernelImpl.dimensions()[kernel_index[1]],
454 (
size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
456 const size_t numX =
dimensions()[m_indices[kernel_index[0]]];
457 const size_t numY =
dimensions()[m_indices[kernel_index[1]]];
458 const size_t numZ =
dimensions()[m_indices[kernel_index[2]]];
459 auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
460 const size_t numP =
dimensions().TotalSize() / (numX * numY * numZ);
463 {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
464 const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
465 m_kernelImpl.dimensions()[kernel_index[1]],
466 m_kernelImpl.dimensions()[kernel_index[2]]}};
470 auto global_range = cl::sycl::range<3>{};
471 auto local_range = cl::sycl::range<3>{};
473 m_device.parallel_for_setup(input_dim, global_range, local_range);
474 auto local_memory_range = (local_range + kernel_size - 1);
475 const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
477 gpu_assert(
static_cast<unsigned long>(local_memory_size) <=
m_device.sharedMemPerBlock());
481 m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
482 m_inputImpl, m_kernel,
data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
483 indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
489 THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
500 template <
int LoadMode>
504 return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
510 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
512 const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
513 const double firstIndex_compute_cost =
515 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
517 kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
522 m_kernelImpl.bind(cgh);
523 m_inputImpl.bind(cgh);
544 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H