#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template <typename CoeffReturnType, typename KernelName>
struct syclGenericBufferReducer {
  template <typename BufferTOut, typename BufferTIn>
  static void run(BufferTOut* bufOut, BufferTIn& bufI,
                  const Eigen::SyclDevice& dev,
                  size_t length, size_t local) {
    do {
      auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
        cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                                cl::sycl::range<1>{std::min(length, local)}};
        /* Two accessors are used: one to the buffer being reduced and one to
         * local memory, used to store intermediate data. */
        auto aI =
            bufI.template get_access<cl::sycl::access::mode::read_write>(h);
        auto aOut =
            bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
        cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                           cl::sycl::access::target::local>
            scratch(cl::sycl::range<1>(local), h);
        /* The nd_item variant of parallel_for is used because the kernel
         * needs work-group barriers for correctness. */
        h.parallel_for<KernelName>(
            r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
              size_t globalid = id.get_global(0);
              size_t localid = id.get_local(0);
              /* Each work-item loads one element into local memory; the
               * barrier makes the loads visible to the whole work-group. */
              if (globalid < length) {
                scratch[localid] = aI[globalid];
              }
              id.barrier(cl::sycl::access::fence_space::local_space);
              /* Tree reduction in local memory: halve the active range each
               * step, accumulating pairs until one value remains. */
              if (globalid < length) {
                size_t min = (length < local) ? length : local;
                for (size_t offset = min / 2; offset > 0; offset /= 2) {
                  if (localid < offset) {
                    scratch[localid] += scratch[localid + offset];
                  }
                  id.barrier(cl::sycl::access::fence_space::local_space);
                }
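              /* Trace: with local = 8 the offsets run 4, 2, 1, and after the
               * last step scratch[0] holds the sum of all eight work-items'
               * loaded values. */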
                /* The group's partial result is left in scratch[0]; work-item
                 * 0 writes it back, and to the output on the final pass. */
                if (localid == 0) {
                  aI[id.get_group(0)] = scratch[localid];
                  if ((length <= local) && globalid == 0) {
                    aOut[globalid] = scratch[localid];
                  }
                }
              }
            });
      };
      dev.m_queue.submit(f);
      dev.m_queue.throw_asynchronous();

      /* Each pass shrinks the data by a factor of `local`; repeat until a
       * single element remains. */
      length = length / local;
    } while (length > 1);
  }
};
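/* Worked example: with length = 65536 and local = 256, the first pass leaves
 * 65536/256 = 256 partial sums and the second leaves 256/256 = 1, so the
 * do-while loop above submits exactly two kernels. */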
/// Full reduction on SYCL: the reduction node is treated as a leaf during
/// expression construction, so we take its child, build an expression around
/// it, and apply the full reducer to that.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;
  static void run(const Self& self, Op& reducer,
                  const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr;  // the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    /// Initial reduction factor; inputs smaller than this get a single work-item.
    size_t red_factor = 256;
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor;        // initial number of work-items
    size_t remaining = inputSize % red_factor;  // tail handled separately below
    if (rng == 0) red_factor = 1;
    size_t tileSize = dev.m_queue.get_device()
        .template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max((size_t)1, rng);

    // Round the global range up to the next power of two.
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
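    /* Example of the round-up: rng = 300 gives GRange = 299 (0b100101011)
     * after the decrement; the OR-shift cascade smears the top bit down to
     * 0b111111111 = 511, and the final increment yields 512, the next power
     * of two. The 32-place shift is guarded because shifting a 32-bit value
     * by its full width is undefined behaviour. */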
    size_t outTileSize = tileSize;
    /// If GRange is smaller than the work-group size, shrink the work-group
    /// so a single kernel pass reduces everything to one element.
    if (GRange < outTileSize) outTileSize = GRange;
    // Output buffer for the final reduced value.
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    /// Intermediate buffer collecting each work-group's partial result; there
    /// is no global barrier on the GPU, so the partials are reduced
    /// recursively afterwards.
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler& cgh) {
      // Create a tuple of accessors from the evaluator.
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// The reduction root cannot be captured by the device-expression
        /// conversion (it is only detected as a leaf node), so it is rebuilt
        /// here with MakeGlobalPointer.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// Evaluator for device_self_expr: equivalent to the `self` passed to
        /// run(), but recognisable on the device.
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();
        /// Work-items below rng each reduce red_factor consecutive inputs;
        /// padding items (from the power-of-two round-up) write the identity.
        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        /// Fold in the tail when the input size is not divisible by red_factor.
        if (remaining != 0 && globalid == 0)
          tmp_global_accessor.get_pointer()[0] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * rng, remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    /// Recursively reduce the per-group partials down to a single element.
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }
};
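/* Usage sketch (hypothetical host code; `q`, `gpu_in`, `gpu_out` and the
 * sizes are illustrative, not part of this file):
 *
 *   cl::sycl::queue q;
 *   Eigen::SyclDevice dev(q);
 *   Eigen::TensorMap<Eigen::Tensor<float, 1>> in(gpu_in, 4096);
 *   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(gpu_out);
 *   out.device(dev) = in.sum();  // full reduction: dispatches to FullReducer::run
 */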
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;
  static bool run(const Self& self, Op& reducer,
                  const Eigen::SyclDevice& dev, CoeffReturnType* output,
                  typename Self::Index,
                  typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr;  // the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device()
        .template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) {
      tileSize = GRange;
    } else if (GRange > tileSize) {
      // Pad the global range up to a multiple of the work-group size.
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
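    /* Example: num_coeffs_to_preserve = 1000 with tileSize = 256 gives
     * xMode = 232, so GRange becomes 1000 + (256 - 232) = 1024, i.e. four
     * full work-groups; the globalid check inside the kernel masks off the
     * 24 padding work-items. */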
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler& cgh) {
      // Create a tuple of accessors from the evaluator.
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for<Self>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// Rebuild the reduction root on the device, as in FullReducer above.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        typedef decltype(device_self_evaluator) DeviceSelf;
        auto globalid = itemID.get_global_linear_id();
        /// Each work-item accumulates the reduced dimensions for one
        /// preserved output coefficient.
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};
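/* Usage sketch (hypothetical; names illustrative): a partial reduction such as
 *
 *   Eigen::array<int, 1> red_dim{{0}};
 *   out.device(dev) = in.sum(red_dim);
 *
 * keeps one coefficient per non-reduced index; when Eigen selects this
 * specialisation, each preserved coefficient is produced by one work-item in
 * InnerReducer::run above. */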
}  // end namespace internal
}  // end namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP