TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This file contains the SYCL-device specialisations of the tensor
 *  reduction operation (full and inner reductions).
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
  do {
    auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
      cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                              cl::sycl::range<1>{std::min(length, local)}};
      /* Two accessors are used: one to the buffer that is being reduced,
       * and a second to local memory, used to store intermediate data. */
      auto aI =
          bufI.template get_access<cl::sycl::access::mode::read_write>(h);
      auto aOut =
          bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
      cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>
          scratch(cl::sycl::range<1>(local), h);

      /* The parallel_for invocation chosen is the variant with an nd_item
       * parameter, since the code requires barriers for correctness. */
      h.parallel_for<KernelName>(
          r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
            size_t globalid = id.get_global(0);
            size_t localid = id.get_local(0);
            /* All threads collectively read from global memory into local.
             * The barrier ensures all threads' I/O is resolved before
             * execution continues (strictly speaking, all threads within
             * a single work-group - there is no co-ordination between
             * work-groups, only work-items). */
            if (globalid < length) {
              scratch[localid] = aI[globalid];
            }
            id.barrier(cl::sycl::access::fence_space::local_space);

            /* Apply the reduction operation between the current local
             * id and the one on the other half of the vector. */
            if (globalid < length) {
              size_t min = (length < local) ? length : local;
              for (size_t offset = min / 2; offset > 0; offset /= 2) {
                if (localid < offset) {
                  scratch[localid] += scratch[localid + offset];
                }
                id.barrier(cl::sycl::access::fence_space::local_space);
              }
              /* The final result will be stored in local id 0. */
              if (localid == 0) {
                aI[id.get_group(0)] = scratch[localid];
                if ((length <= local) && globalid == 0) {
                  aOut[globalid] = scratch[localid];
                }
              }
            }
          });
    };
    dev.m_queue.submit(f);
    dev.m_queue.throw_asynchronous();

    /* At this point, you could queue::wait_and_throw() to ensure that
     * errors are caught quickly. However, this would likely impact
     * performance negatively. */
    length = length / local;

  } while (length > 1);
}

};

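// A minimal host-side model of the recursion above, assuming `local` is a power of
// two (>= 2), which the callers below guarantee by rounding the range up. Each pass
// reduces one work-group's tile of `local` elements to a single partial sum via the
// same pairwise-halving loop as the kernel (which, note, hard-codes operator+=
// rather than using the reduction functor). The function name and setup are
// illustrative only; the sketch needs <vector> and <cstddef>.
#if 0
static double hostModelBufferReduce(std::vector<double> data, std::size_t local) {
  std::size_t length = data.size();
  do {
    std::size_t groups = (length + local - 1) / local;
    for (std::size_t g = 0; g < groups; ++g) {
      // Load this group's tile into "scratch" (models the local-memory accessor),
      // zero-padding the last ragged tile, which the kernel avoids by sizing the
      // global range to a power of two.
      std::vector<double> scratch(local, 0.0);
      for (std::size_t l = 0; l < local && g * local + l < length; ++l)
        scratch[l] = data[g * local + l];
      // Pairwise halving: each step sums element l with element l + offset,
      // mirroring the barrier-separated offset loop in the kernel.
      for (std::size_t offset = local / 2; offset > 0; offset /= 2)
        for (std::size_t l = 0; l < offset; ++l)
          scratch[l] += scratch[l + offset];
      data[g] = scratch[0];  // models aI[id.get_group(0)] = scratch[0]
    }
    length = groups;         // models length = length / local
  } while (length > 1);
  return data[0];
}
#endif
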
/// Full reducer specialisation for the SYCL device. During expression construction the
/// reduction is treated as a leaf node: we take the child of the reduction, build the
/// device expression from it, and apply the reduction operation to that child. Once this
/// is done, the reduction node itself is an empty shell and can be treated as a leaf node.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr; // the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256; // number of input coefficients folded by each work-item in the first pass
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // initially one work-item per red_factor input coefficients
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max((size_t)1, rng);

    // round the global range up to the next power of two, as required by the
    // halving loop in syclGenericBufferReducer (e.g. 3906 becomes 4096)
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
    size_t outTileSize = tileSize;
    if (GRange < outTileSize) outTileSize = GRange;
    // the final output buffer is created directly over the user's output pointer,
    // so no separate assignment back is needed
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    // temporary global buffer collecting the per-work-item partial results; since there
    // is no global barrier on the GPU, these partials are reduced recursively afterwards
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        // rebuild the reduction expression on the device with global pointers
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        // the evaluator for device_self_expr; it mirrors the `self` passed to run(),
        // but is usable on the device
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();

        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          // pad with 0, which assumes a sum-like reduction (syclGenericBufferReducer
          // above also hard-codes operator+=)
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        if (remaining != 0 && globalid == 0)
          // work-item 0 also folds in the tail when the input size is not divisible by red_factor
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * rng, remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    // recursively reduce the per-work-item partials down to a single element
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }

};
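
// A standalone restatement of the launch arithmetic used by FullReducer::run above,
// so the sizing can be checked in isolation. Worked example: an input of 1,000,000
// coefficients gives rng = 1,000,000 / 256 = 3906 work-items, remaining = 64 tail
// elements folded in by work-item 0, GRange rounded up to 4096, and outTileSize
// capped at half the device's maximum work-group size. The struct and function
// names are illustrative only; the sketch needs <cstddef> and <algorithm>.
#if 0
struct FullReduceLaunch {
  std::size_t red_factor, rng, remaining, GRange, outTileSize;
};
static FullReduceLaunch fullReduceLaunchParams(std::size_t inputSize,
                                               std::size_t maxWorkGroupSize) {
  FullReduceLaunch p;
  p.red_factor = 256;                      // coefficients folded per work-item
  p.rng = inputSize / p.red_factor;        // number of "full" work-items
  p.remaining = inputSize % p.red_factor;  // tail handled by work-item 0
  if (p.rng == 0) p.red_factor = 1;        // tiny input: everything is tail
  std::size_t g = std::max<std::size_t>(1, p.rng);
  g--;                                     // round up to the next power of two,
  g |= g >> 1;  g |= g >> 2;  g |= g >> 4; // as the halving loop in
  g |= g >> 8;  g |= g >> 16;              // syclGenericBufferReducer requires
#if __x86_64__ || __ppc64__ || _WIN64
  g |= g >> 32;
#endif
  g++;
  p.GRange = g;
  p.outTileSize = std::min(maxWorkGroupSize / 2, p.GRange);
  return p;
}
#endif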

template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr; // the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) tileSize = GRange;
    else if (GRange > tileSize) {
      // pad the global range up to the next multiple of the tile size
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
    // the output accessor is created directly over the user's output pointer,
    // so no separate assignment back is needed
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;

    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for<Self>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        // rebuild the reduction expression on the device with global pointers
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        // the evaluator for device_self_expr; it mirrors the `self` passed to run(),
        // but is usable on the device
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = DeviceSelf(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          // each work-item reduces the coefficients belonging to one preserved output
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          output_accessor.get_pointer()[globalid] = functor.finalize(accum);
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};
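
// A sequential model of what each work-item in InnerReducer::run computes, for the
// simple case where the coefficients reduced into one output are contiguous in the
// input (e.g. a 2-D row-major tensor reduced over its last dimension): output
// coefficient `globalid` folds `reduce_size` consecutive inputs using the
// initialize/reduce/finalize protocol of Eigen's reducer functors. The function
// name and layout assumption are illustrative only; the sketch needs <vector> and
// <cstddef>, and works with e.g. Eigen::internal::SumReducer<float>.
#if 0
template <typename Reducer>
static std::vector<float> hostModelInnerReduce(const std::vector<float>& in,
                                               std::size_t num_preserved,
                                               std::size_t reduce_size,
                                               const Reducer& reducer) {
  std::vector<float> out(num_preserved);
  for (std::size_t globalid = 0; globalid < num_preserved; ++globalid) {
    float accum = reducer.initialize();
    // firstInput(globalid) for this layout is globalid * reduce_size: the reduced
    // coefficients of one output form one contiguous run.
    for (std::size_t i = 0; i < reduce_size; ++i)
      reducer.reduce(in[globalid * reduce_size + i], &accum);
    out[globalid] = reducer.finalize(accum);
  }
  return out;
}
#endif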

} // end namespace internal
} // namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP