#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

/// Generic recursive buffer reducer: launches a work-group tree reduction over
/// the input buffer and repeats it until a single value remains.
template <typename CoeffReturnType, typename KernelName>
struct syclGenericBufferReducer {
template<typename BufferTOut, typename BufferTIn>
static void run(BufferTOut* bufOut, BufferTIn& bufI,
                const Eigen::SyclDevice& dev, size_t length, size_t local) {
  do {
    auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
      cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                              cl::sycl::range<1>{std::min(length, local)}};
      // One read-write accessor to the buffer being reduced, one accessor to
      // the output buffer, and a local-memory accessor used as scratch space
      // for the per-work-group partial sums.
      auto aI = bufI.template get_access<cl::sycl::access::mode::read_write>(h);
      auto aOut = bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
      cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>
          scratch(cl::sycl::range<1>(local), h);
      // The nd_item variant of parallel_for is used because the kernel needs
      // work-group barriers for correctness.
      h.parallel_for<KernelName>(
          r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
            size_t globalid = id.get_global(0);
            size_t localid = id.get_local(0);
            // Each work-item loads one element from global into local memory;
            // the barrier makes the loads visible to the whole work-group.
            if (globalid < length) {
              scratch[localid] = aI[globalid];
            }
            id.barrier(cl::sycl::access::fence_space::local_space);
            if (globalid < length) {
              // Tree reduction in local memory: each pass folds the upper half
              // of the active range onto the lower half, with a barrier between
              // passes, until the group's partial sum ends up in scratch[0].
              int min = (length < local) ? length : local;
              for (size_t offset = min / 2; offset > 0; offset /= 2) {
                if (localid < offset) {
                  scratch[localid] += scratch[localid + offset];
                }
                id.barrier(cl::sycl::access::fence_space::local_space);
              }
              // Work-item 0 writes the group's partial result back for the next
              // pass, and straight to the output buffer on the final pass.
              if (localid == 0) {
                aI[id.get_group(0)] = scratch[localid];
                if ((length <= local) && globalid == 0) {
                  aOut[globalid] = scratch[localid];
                }
              }
            }
          });
    };
    dev.m_queue.submit(f);
    dev.m_queue.throw_asynchronous();

    // Each pass shrinks the problem by a factor of `local`; repeat until a
    // single element remains.
    length = length / local;
  } while (length > 1);
}
};
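
// Illustrative sketch (not part of the original header): the same halving
// scheme the kernel above applies to its local-memory scratch, written as a
// plain host function so the log2(n) convergence is easy to see. The name
// hostTreeReduce is hypothetical, and n is assumed to be a power of two,
// mirroring the power-of-two global range set up by the reducers below.
template <typename T>
T hostTreeReduce(T* scratch, size_t n) {
  // Each pass folds the upper half of the active range onto the lower half,
  // exactly like the `offset` loop over the local-memory accessor above.
  for (size_t offset = n / 2; offset > 0; offset /= 2)
    for (size_t i = 0; i < offset; ++i)
      scratch[i] += scratch[i + offset];
  return scratch[0];  // the result accumulates at index 0
}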

/// Full reduction: the reduction expression is treated as a leaf node, the
/// reducer is applied to its child, and the single result is written to output.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;
  static void run(const Self& self, Op& reducer,
                  const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr;  // the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    // red_factor is the number of consecutive input elements each work-item
    // folds together in the first pass; rng is the resulting number of partial
    // results and remaining is the tail that does not fill a full chunk.
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor;
    size_t remaining = inputSize % red_factor;
    size_t tileSize =
        dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max((size_t)1, rng);
    // Round the global range up to the next power of two so that the recursive
    // halving passes of syclGenericBufferReducer divide it evenly.
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
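    // Illustrative sketch (not part of the original header): the bit-twiddling
    // above is the classic "round up to the next power of two" trick. As a
    // standalone helper (hypothetical name, 64-bit size_t assumed) it would read:
    //
    //   size_t roundUpToPowerOfTwo(size_t n) {
    //     n--;
    //     n |= n >> 1;  n |= n >> 2;  n |= n >> 4;
    //     n |= n >> 8;  n |= n >> 16; n |= n >> 32;
    //     return n + 1;
    //   }
    //
    //   // e.g. roundUpToPowerOfTwo(300) == 512, roundUpToPowerOfTwo(1) == 1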
    size_t outTileSize = tileSize;
    // If the rounded global range is smaller than a work-group, shrink the
    // work-group so a single kernel launch reduces everything to one value.
    if (GRange < outTileSize) outTileSize = GRange;
    // Output buffer for the final result, plus a temporary global buffer that
    // collects the per-work-group partial results; since there is no global
    // barrier, those partials are reduced recursively afterwards.
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(
        self.dimensions().TotalSize(), output);
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // Build the tuple of device accessors from the evaluator and get an
      // accessor to the temporary global buffer.
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor =
          temp_global_buffer.template get_access<cl::sycl::access::mode::read_write,
                                                 cl::sycl::access::target::global_buffer>(cgh);
      cgh.parallel_for<PlaceHolderExpr>(
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
          [=](cl::sycl::nd_item<1> itemID) {
        // Rebuild the expression on the device; the reduction node itself
        // cannot be captured by the device-expression conversion, so it is
        // wrapped around the converted child expression explicitly.
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        const auto device_self_expr =
            TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        // Evaluator for device_self_expr: equivalent to `self`, but usable on
        // the device.
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();
        // Each work-item reduces red_factor consecutive inputs into its slot of
        // the temporary buffer; slots past the reduced range are zeroed.
        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] =
              InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(
                  device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);
        // When inputSize is not divisible by red_factor, work-item 0 folds the
        // remaining tail elements into the first slot.
        if (remaining != 0 && globalid == 0)
          tmp_global_accessor.get_pointer()[0] +=
              InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(
                  device_self_evaluator, red_factor * rng, remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    // Recursively reduce the per-work-group partials down to a single element.
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }
};
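
// Illustrative sketch (not part of the original header; hypothetical name,
// assumes <vector> is available): the overall shape of the full reduction
// above on the host. Stage one produces one partial sum per chunk (the role of
// temp_global_buffer); stage two folds the partials down to one value, which
// is what the recursive syclGenericBufferReducer passes do on the device.
template <typename T>
T hostTwoStageReduce(const T* in, size_t n, size_t chunk) {
  std::vector<T> partials((n + chunk - 1) / chunk, T(0));
  for (size_t i = 0; i < n; ++i)
    partials[i / chunk] += in[i];          // stage 1: per-chunk partial sums
  T result = T(0);
  for (size_t i = 0; i < partials.size(); ++i)
    result += partials[i];                 // stage 2: fold the partials
  return result;
}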

/// Partial reduction: one output coefficient is produced per preserved index,
/// each accumulated over the reduced dimensions.
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer,
                  const Eigen::SyclDevice& dev, CoeffReturnType* output,
                  typename Self::Index, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr;  // the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    size_t tileSize =
        dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) tileSize = GRange;
    else if (GRange > tileSize) {
      // Round the global range up to a multiple of the work-group size, since
      // an nd_range requires the global size to be divisible by the local size;
      // the surplus work-items are masked out in the kernel below.
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
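    // Worked example (illustrative): with tileSize == 256 and
    // num_coeffs_to_preserve == 1000, xMode == 1000 % 256 == 232, so GRange
    // becomes 1000 + (256 - 232) == 1024, the next multiple of the work-group
    // size; work-items 1000..1023 then fail the bounds check in the kernel.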
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // Build the tuple of device accessors from the evaluator and get a
      // write-only accessor to the output.
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);
      cgh.parallel_for<Self>(
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
          [=](cl::sycl::nd_item<1> itemID) {
        // Rebuild the expression on the device and wrap it in the reduction
        // expression explicitly, as in the full reducer above.
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        const auto device_self_expr =
            TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf;
        auto device_self_evaluator = DeiceSelf(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          // One output coefficient per work-item: initialise the accumulator,
          // fold the reduced dimensions into it, finalise, and store.
          typename DeiceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeiceSelf::NumReducedDims - 1, DeiceSelf, Op>::reduce(
              device_self_evaluator, device_self_evaluator.firstInput(globalid),
              const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};
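
// Illustrative sketch (not part of the original header; hypothetical name):
// what each work-item of the kernel above computes, written on the host for a
// simple 2-D sum over the inner dimension. Each preserved output coefficient
// gets the same initialize / accumulate / finalize sequence that is driven
// through `functor` above.
template <typename T>
void hostInnerReduce(const T* in, T* out, size_t preserved, size_t reduced) {
  for (size_t p = 0; p < preserved; ++p) {  // one output coefficient per "work-item"
    T accum = T(0);                         // functor.initialize()
    for (size_t r = 0; r < reduced; ++r)
      accum += in[p * reduced + r];         // accumulate the reduced dimension
    out[p] = accum;                         // functor.finalize() + store
  }
}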

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP