22 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP 
   23 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP 
   28 template<
typename CoeffReturnType, 
typename KernelName> 
struct syclGenericBufferReducer{
 
   29 template<
typename BufferTOut, 
typename BufferTIn>
 
   30 static void run(BufferTOut* bufOut, BufferTIn& bufI, 
const Eigen::SyclDevice& dev, 
size_t length, 
size_t local){
 
   32           auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) 
mutable {
 
   33             cl::sycl::nd_range<1> r{cl::sycl::range<1>{
std::max(length, local)},
 
   34                                     cl::sycl::range<1>{
std::min(length, local)}};
 
   38                 bufI.template get_access<cl::sycl::access::mode::read_write>(h);
 
   40                 bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
 
   41             cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
 
   42                                cl::sycl::access::target::local>
 
   43                 scratch(cl::sycl::range<1>(local), h);
 
   47             h.parallel_for<KernelName>(
 
   48                 r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
 
   49                   size_t globalid = 
id.get_global(0);
 
   50                   size_t localid = 
id.get_local(0);
 
   56                   if (globalid < length) {
 
   57                     scratch[localid] = aI[globalid];
 
   59                   id.barrier(cl::sycl::access::fence_space::local_space);
 
   63                   if (globalid < length) {
 
   64                     int min = (length < local) ? length : local;
 
   65                     for (
size_t offset = 
min / 2; offset > 0; offset /= 2) {
 
   66                       if (localid < offset) {
 
   67                         scratch[localid] += scratch[localid + offset];
 
   69                       id.barrier(cl::sycl::access::fence_space::local_space);
 
   73                       aI[
id.get_group(0)] = scratch[localid];
 
   74                       if((length<=local) && globalid ==0){
 
   75                         aOut[globalid]=scratch[localid];
 
   81             dev.m_queue.submit(f);
 
   82             dev.m_queue.throw_asynchronous();
 
   87           length = length / local;
 
  102 template <
typename Self, 
typename Op, 
bool Vectorizable>
 
  103 struct FullReducer<Self, Op, const 
Eigen::SyclDevice, Vectorizable> {
 
  105   typedef typename Self::CoeffReturnType CoeffReturnType;
 
  108   static void run(
const Self& 
self, Op& reducer, 
const Eigen::SyclDevice& dev, CoeffReturnType* output) {
 
  109     typedef const typename Self::ChildType HostExpr; 
 
  113     size_t inputSize =
self.impl().dimensions().TotalSize();
 
  114     size_t rng = inputSize/red_factor; 
 
  115     size_t remaining = inputSize% red_factor;
 
  119     size_t tileSize =dev.m_queue.get_device(). 
template get_info<cl::sycl::info::device::max_work_group_size>()/2;
 
  124     GRange |= GRange >> 1;
 
  125     GRange |= GRange >> 2;
 
  126     GRange |= GRange >> 4;
 
  127     GRange |= GRange >> 8;
 
  128     GRange |= GRange >> 16;
 
  129 #if __x86_64__ || __ppc64__ || _WIN64 
  130     GRange |= GRange >> 32;
 
  133     size_t  outTileSize = tileSize;
 
  135     if (GRange < outTileSize) outTileSize=GRange;
 
  137     auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(
self.dimensions().TotalSize(), output);
 
  141     auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
 
  143     Dims dims= 
self.xprDims();
 
  144     Op functor = reducer;
 
  145     dev.m_queue.submit([&](cl::sycl::handler &cgh) {
 
  148       auto tmp_global_accessor = temp_global_buffer. 
template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
 
  150       cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
 
  152         auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
 
  156         const auto device_self_expr= 
TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
 
  161         auto globalid=itemID.get_global_linear_id();
 
  164           tmp_global_accessor.get_pointer()[globalid]=
InnerMostDimReducer<decltype(device_self_evaluator), Op, 
false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, 
const_cast<Op&
>(functor));
 
  166           tmp_global_accessor.get_pointer()[globalid]=
static_cast<CoeffReturnType>(0);
 
  168         if(remaining!=0 && globalid==0 )
 
  170           tmp_global_accessor.get_pointer()[globalid]+=
InnerMostDimReducer<decltype(device_self_evaluator), Op, 
false>::reduce(device_self_evaluator, red_factor*(rng), remaining, 
const_cast<Op&
>(functor));
 
  173   dev.m_queue.throw_asynchronous();
 
  181 template <
typename Self, 
typename Op>
 
  182 struct InnerReducer<Self, Op, const 
Eigen::SyclDevice> {
 
  184   typedef typename Self::CoeffReturnType CoeffReturnType;
 
  187   static bool run(
const Self& 
self, Op& reducer, 
const Eigen::SyclDevice& dev, CoeffReturnType* output, 
typename Self::Index , 
typename Self::Index num_coeffs_to_preserve) {
 
  188     typedef const typename Self::ChildType HostExpr; 
 
  192     size_t tileSize =dev.m_queue.get_device(). 
template get_info<cl::sycl::info::device::max_work_group_size>()/2;
 
  194     size_t GRange=num_coeffs_to_preserve;
 
  195     if (tileSize>GRange) tileSize=GRange;
 
  196     else if(GRange>tileSize){
 
  197       size_t xMode = GRange % tileSize;
 
  198       if (xMode != 0) GRange += (tileSize - xMode);
 
  205     Dims dims= 
self.xprDims();
 
  206     Op functor = reducer;
 
  208     dev.m_queue.submit([&](cl::sycl::handler &cgh) {
 
  211       auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
 
  213       cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
 
  215         auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
 
  219         const auto device_self_expr= 
TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
 
  225         auto globalid=itemID.get_global_linear_id();
 
  226         if (globalid< 
static_cast<size_t>(num_coeffs_to_preserve)) {
 
  227           typename DeiceSelf::CoeffReturnType accum = functor.initialize();
 
  229           functor.finalize(accum);
 
  230           output_accessor.get_pointer()[globalid]= accum;
 
  234   dev.m_queue.throw_asynchronous();
 
  242 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP