TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 * Specialisations of the tensor reduction evaluators (full and
 * inner-dimension reductions) for the SYCL device.
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
  do {
    auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
      cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                              cl::sycl::range<1>{std::min(length, local)}};
      /* Two accessors are used: one to the buffer that is being reduced,
       * and a second to local memory, used to store intermediate data. */
      auto aI =
          bufI.template get_access<cl::sycl::access::mode::read_write>(h);
      auto aOut =
          bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
      cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>
          scratch(cl::sycl::range<1>(local), h);

      /* The parallel_for invocation chosen is the variant with an nd_item
       * parameter, since the code requires barriers for correctness. */
      h.parallel_for<KernelName>(
          r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
            size_t globalid = id.get_global(0);
            size_t localid = id.get_local(0);
            /* All threads collectively read from global memory into local.
             * The barrier ensures all threads' IO is resolved before
             * execution continues (strictly speaking, all threads within
             * a single work-group - there is no co-ordination between
             * work-groups, only work-items). */
            if (globalid < length) {
              scratch[localid] = aI[globalid];
            }
            id.barrier(cl::sycl::access::fence_space::local_space);

            /* Apply the reduction operation between the current local
             * id and the one on the other half of the vector. */
            if (globalid < length) {
              int min = (length < local) ? length : local;
              for (size_t offset = min / 2; offset > 0; offset /= 2) {
                if (localid < offset) {
                  scratch[localid] += scratch[localid + offset];
                }
                id.barrier(cl::sycl::access::fence_space::local_space);
              }
              /* The final result will be stored in local id 0. */
              if (localid == 0) {
                aI[id.get_group(0)] = scratch[localid];
                if ((length <= local) && globalid == 0) {
                  aOut[globalid] = scratch[localid];
                }
              }
            }
          });
    };
    dev.m_queue.submit(f);
    dev.m_queue.throw_asynchronous();

    /* At this point, you could queue::wait_and_throw() to ensure that
     * errors are caught quickly. However, this would likely impact
     * performance negatively. */
    length = length / local;

  } while (length > 1);
}

};

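The kernel above is a standard work-group tree reduction: each work-group copies its slice into local scratch memory, halves the number of active work-items per pass until slot 0 holds the group's partial result, and run() keeps resubmitting the kernel until a single value is left. The host-side sketch below (not part of this header) only illustrates that indexing scheme: it uses no SYCL, the helper name is hypothetical, and it assumes power-of-two sizes, which the caller below arranges by rounding the global range up to a power of two.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Host-side illustration of the halving scheme used by the kernel above.
static float tree_reduce(std::vector<float> data, std::size_t local) {
  assert(!data.empty() && local > 0);
  std::size_t length = data.size();
  do {
    const std::size_t groups = (length + local - 1) / local;
    for (std::size_t g = 0; g < groups; ++g) {
      const std::size_t base = g * local;
      const std::size_t n = std::min(local, length - base);
      // Halving passes: pair element i with element i + offset, as in the scratch loop.
      for (std::size_t offset = n / 2; offset > 0; offset /= 2)
        for (std::size_t i = 0; i < offset; ++i)
          data[base + i] += data[base + i + offset];
      data[g] = data[base];  // group result lands at index g, mirroring aI[id.get_group(0)]
    }
    length = groups;  // one partial result per group survives to the next pass
  } while (length > 1);
  return data[0];
}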
/// For now, start with a full reducer: the whole tensor is reduced to a single
/// value, so the reduction expression is treated as a leaf node.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr; // the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256; // initial reduction factor; if the input is smaller, only one work-item is created
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // initially one work-item per red_factor input elements
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    };
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max((size_t)1, rng);

    // Round the global range up to the next power of two for the recursive reduction
    // (e.g. rng == 300 gives GRange == 512).
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
    size_t outTileSize = tileSize;
    if (GRange < outTileSize) outTileSize = GRange;
    // Get the final output buffer; it is created directly from the output pointer,
    // so no separate assignment kernel is needed.
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    // Temporary global buffer collecting the per-work-item partial results; it is
    // reduced recursively afterwards, since there is no global barrier on the device.
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from Evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        // Rebuild the reduction expression on the device; the reduction node itself cannot be
        // captured by the device-expression conversion, so it is recreated here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        // Evaluator for device_self_expr, equivalent to the `self` evaluator passed to run(),
        // but usable on the device.
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();

        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        if (remaining != 0 && globalid == 0)
          // Add the tail of the input buffer when the input size is not divisible by red_factor.
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * (rng), remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    // Recursively reduce the temporary buffer down to a single element written into out_buffer.
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }

};

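For orientation, the following is a rough sketch (not part of this header) of the user-level code that would end up in FullReducer::run above: a full sum() of a tensor evaluated on a SYCL device. It is illustrative only; it assumes the early SYCL backend in which Eigen::SyclDevice wraps a cl::sycl::queue (consistent with the dev.m_queue accesses in this file), and the header and selector names are placeholders.

#define EIGEN_USE_SYCL
#include <CL/sycl.hpp>                       // assumed SYCL header name
#include <unsupported/Eigen/CXX11/Tensor>

void full_reduction_example() {
  cl::sycl::queue q((cl::sycl::default_selector()));
  Eigen::SyclDevice sycl_device(q);          // assumption: queue-wrapping constructor

  Eigen::Tensor<float, 3> in(20, 30, 40);
  Eigen::Tensor<float, 0> out;
  in.setRandom();

  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in(in.data(), 20, 30, 40);
  Eigen::TensorMap<Eigen::Tensor<float, 0>> gpu_out(out.data());

  // Evaluating a full reduction on the SYCL device dispatches to
  // FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable>::run above.
  gpu_out.device(sycl_device) = gpu_in.sum();
}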
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr; // the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) tileSize = GRange;
    else if (GRange > tileSize) {
      // Pad the global range up to a multiple of the tile size.
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
    // The output buffer is created directly from the output pointer, so no separate
    // assignment kernel is needed.
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;

    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from Evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for<Self>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        // Rebuild the reduction expression on the device, as in FullReducer above.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        // Evaluator for device_self_expr, usable on the device.
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = DeviceSelf(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          // One work-item per preserved output coefficient: reduce all the reduced
          // dimensions for this output index.
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};
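A partial reduction that preserves some coefficients reaches InnerReducer::run instead. The sketch below (again illustrative, not part of the header) reuses the hypothetical device setup from the previous example; with the default column-major layout, summing the innermost dimension of a 20x30x40 tensor leaves 30*40 coefficients, which corresponds to num_coeffs_to_preserve above.

void inner_reduction_example(const Eigen::SyclDevice& sycl_device) {
  Eigen::Tensor<float, 3> in(20, 30, 40);
  Eigen::Tensor<float, 2> out(30, 40);
  in.setRandom();

  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in(in.data(), 20, 30, 40);
  Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_out(out.data(), 30, 40);

  // Reduce dimension 0 (the innermost one for column-major storage) and keep the rest.
  Eigen::array<int, 1> red_dim{{0}};
  gpu_out.device(sycl_device) = gpu_in.sum(red_dim);
}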

} // end namespace internal
} // namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP