TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This file contains the SYCL-device specialisations of the tensor
 *  reduction operation (full and inner reductions).
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
  do {
    auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
      cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                              cl::sycl::range<1>{std::min(length, local)}};
      /* Two accessors are used: one to the buffer that is being reduced,
       * and a second to local memory, used to store intermediate data. */
      auto aI =
          bufI.template get_access<cl::sycl::access::mode::read_write>(h);
      auto aOut =
          bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
      cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>
          scratch(cl::sycl::range<1>(local), h);

      /* The parallel_for invocation chosen is the variant with an nd_item
       * parameter, since the code requires barriers for correctness. */
      h.parallel_for<KernelName>(
          r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
            size_t globalid = id.get_global(0);
            size_t localid = id.get_local(0);
            /* All threads collectively read from global memory into local.
             * The barrier ensures all threads' I/O is resolved before
             * execution continues (strictly speaking, all threads within
             * a single work-group - there is no co-ordination between
             * work-groups, only work-items). */
            if (globalid < length) {
              scratch[localid] = aI[globalid];
            }
            id.barrier(cl::sycl::access::fence_space::local_space);

            /* Apply the reduction operation between the current local
             * id and the one on the other half of the vector. */
            if (globalid < length) {
              size_t min = (length < local) ? length : local;
              for (size_t offset = min / 2; offset > 0; offset /= 2) {
                if (localid < offset) {
                  scratch[localid] += scratch[localid + offset];
                }
                id.barrier(cl::sycl::access::fence_space::local_space);
              }
              /* The final result will be stored in local id 0. */
              if (localid == 0) {
                aI[id.get_group(0)] = scratch[localid];
                if ((length <= local) && globalid == 0) {
                  aOut[globalid] = scratch[localid];
                }
              }
            }
          });
    };
    dev.m_queue.submit(f);
    dev.m_queue.throw_asynchronous();

    /* At this point, you could queue::wait_and_throw() to ensure that
     * errors are caught quickly. However, this would likely impact
     * performance negatively. */
    length = length / local;

  } while (length > 1);
}

};

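// A minimal host-side model of the recursion above, assuming `local` is a power of
// two (>= 2), which the callers below guarantee by rounding the range up. Each pass
// reduces one work-group's tile of `local` elements to a single partial sum via the
// same pairwise-halving loop as the kernel (which, note, hard-codes operator+=
// rather than using the reduction functor). The function name and setup are
// illustrative only; the sketch needs <vector> and <cstddef>.
#if 0
static double hostModelBufferReduce(std::vector<double> data, std::size_t local) {
  std::size_t length = data.size();
  do {
    std::size_t groups = (length + local - 1) / local;
    for (std::size_t g = 0; g < groups; ++g) {
      // Load this group's tile into "scratch" (models the local-memory accessor),
      // zero-padding the last ragged tile, which the kernel avoids by sizing the
      // global range to a power of two.
      std::vector<double> scratch(local, 0.0);
      for (std::size_t l = 0; l < local && g * local + l < length; ++l)
        scratch[l] = data[g * local + l];
      // Pairwise halving: each step sums element l with element l + offset,
      // mirroring the barrier-separated offset loop in the kernel.
      for (std::size_t offset = local / 2; offset > 0; offset /= 2)
        for (std::size_t l = 0; l < offset; ++l)
          scratch[l] += scratch[l + offset];
      data[g] = scratch[0];  // models aI[id.get_group(0)] = scratch[0]
    }
    length = groups;         // models length = length / local
  } while (length > 1);
  return data[0];
}
#endif
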
/// Full reducer specialisation for the SYCL device. During expression construction the
/// reduction is treated as a leaf node: we take the child of the reduction, build the
/// device expression from it, and apply the reduction operation to that child. Once this
/// is done, the reduction node itself is an empty shell and can be treated as a leaf node.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr; // the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256; // number of input coefficients folded by each work-item in the first pass
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // initially one work-item per red_factor input coefficients
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max((size_t)1, rng);

    // round the global range up to the next power of two, as required by the
    // halving loop in syclGenericBufferReducer (e.g. 3906 becomes 4096)
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
    size_t outTileSize = tileSize;
    if (GRange < outTileSize) outTileSize = GRange;
    // the final output buffer is created directly over the user's output pointer,
    // so no separate assignment back is needed
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    // temporary global buffer collecting the per-work-item partial results; since there
    // is no global barrier on the GPU, these partials are reduced recursively afterwards
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        // rebuild the reduction expression on the device with global pointers
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        // the evaluator for device_self_expr; it mirrors the `self` passed to run(),
        // but is usable on the device
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();

        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          // pad with 0, which assumes a sum-like reduction (syclGenericBufferReducer
          // above also hard-codes operator+=)
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        if (remaining != 0 && globalid == 0)
          // work-item 0 also folds in the tail when the input size is not divisible by red_factor
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * rng, remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    // recursively reduce the per-work-item partials down to a single element
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }

};
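
// A standalone restatement of the launch arithmetic used by FullReducer::run above,
// so the sizing can be checked in isolation. Worked example: an input of 1,000,000
// coefficients gives rng = 1,000,000 / 256 = 3906 work-items, remaining = 64 tail
// elements folded in by work-item 0, GRange rounded up to 4096, and outTileSize
// capped at half the device's maximum work-group size. The struct and function
// names are illustrative only; the sketch needs <cstddef> and <algorithm>.
#if 0
struct FullReduceLaunch {
  std::size_t red_factor, rng, remaining, GRange, outTileSize;
};
static FullReduceLaunch fullReduceLaunchParams(std::size_t inputSize,
                                               std::size_t maxWorkGroupSize) {
  FullReduceLaunch p;
  p.red_factor = 256;                      // coefficients folded per work-item
  p.rng = inputSize / p.red_factor;        // number of "full" work-items
  p.remaining = inputSize % p.red_factor;  // tail handled by work-item 0
  if (p.rng == 0) p.red_factor = 1;        // tiny input: everything is tail
  std::size_t g = std::max<std::size_t>(1, p.rng);
  g--;                                     // round up to the next power of two,
  g |= g >> 1;  g |= g >> 2;  g |= g >> 4; // as the halving loop in
  g |= g >> 8;  g |= g >> 16;              // syclGenericBufferReducer requires
#if __x86_64__ || __ppc64__ || _WIN64
  g |= g >> 32;
#endif
  g++;
  p.GRange = g;
  p.outTileSize = std::min(maxWorkGroupSize / 2, p.GRange);
  return p;
}
#endif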

template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr; // the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) tileSize = GRange;
    else if (GRange > tileSize) {
      // pad the global range up to the next multiple of the tile size
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
    // the output accessor is created directly over the user's output pointer,
    // so no separate assignment back is needed
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;

    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for<Self>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        // rebuild the reduction expression on the device with global pointers
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        // the evaluator for device_self_expr; it mirrors the `self` passed to run(),
        // but is usable on the device
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = DeviceSelf(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          // each work-item reduces the coefficients belonging to one preserved output
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          output_accessor.get_pointer()[globalid] = functor.finalize(accum);
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};
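
// A sequential model of what each work-item in InnerReducer::run computes, for the
// simple case where the coefficients reduced into one output are contiguous in the
// input (e.g. a 2-D row-major tensor reduced over its last dimension): output
// coefficient `globalid` folds `reduce_size` consecutive inputs using the
// initialize/reduce/finalize protocol of Eigen's reducer functors. The function
// name and layout assumption are illustrative only; the sketch needs <vector> and
// <cstddef>, and works with e.g. Eigen::internal::SumReducer<float>.
#if 0
template <typename Reducer>
static std::vector<float> hostModelInnerReduce(const std::vector<float>& in,
                                               std::size_t num_preserved,
                                               std::size_t reduce_size,
                                               const Reducer& reducer) {
  std::vector<float> out(num_preserved);
  for (std::size_t globalid = 0; globalid < num_preserved; ++globalid) {
    float accum = reducer.initialize();
    // firstInput(globalid) for this layout is globalid * reduce_size: the reduced
    // coefficients of one output form one contiguous run.
    for (std::size_t i = 0; i < reduce_size; ++i)
      reducer.reduce(in[globalid * reduce_size + i], &accum);
    out[globalid] = reducer.finalize(accum);
  }
  return out;
}
#endif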

} // end namespace internal
} // namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP