TensorScanSycl.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Mehdi Goli Codeplay Software Ltd.
5 // Ralph Potter Codeplay Software Ltd.
6 // Luke Iwanski Codeplay Software Ltd.
7 // Contact: <eigen@codeplay.com>
8 //
9 // This Source Code Form is subject to the terms of the Mozilla
10 // Public License v. 2.0. If a copy of the MPL was not distributed
11 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
12 
13 /*****************************************************************
14  * TensorScanSycl.h
15  *
16  * \brief:
17  * Tensor Scan Sycl implement the extend version of
18  * "Efficient parallel scan algorithms for GPUs." .for Tensor operations.
19  * The algorithm requires up to 3 stage (consequently 3 kernels) depending on
20  * the size of the tensor. In the first kernel (ScanKernelFunctor), each
21  * threads within the work-group individually reduces the allocated elements per
22  * thread in order to reduces the total number of blocks. In the next step all
23  * thread within the work-group will reduce the associated blocks into the
24  * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary
25  * buffer is given as an input and all the threads within a work-group scan and
26  * reduces the boundaries between the blocks (generated from the previous
27  * kernel). and write the data on the temporary buffer. If the second kernel is
28  * required, the third and final kerenl (ScanAdjustmentKernelFunctor) will
29  * adjust the final result into the output buffer.
30  * The original algorithm for the parallel prefix sum can be found here:
31  *
32  * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
33  * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
34  *1, no. 1 (2008): 1-17.
35  *****************************************************************/
36 
37 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
38 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
39 
40 namespace Eigen {
41 namespace TensorSycl {
42 namespace internal {
43 
44 #ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
45 #define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
46 #endif
47 
48 template <typename index_t>
49 struct ScanParameters {
50  // must be power of 2
51  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
52  const index_t total_size;
53  const index_t non_scan_size;
54  const index_t scan_size;
55  const index_t non_scan_stride;
56  const index_t scan_stride;
57  const index_t panel_threads;
58  const index_t group_threads;
59  const index_t block_threads;
60  const index_t elements_per_group;
61  const index_t elements_per_block;
62  const index_t loop_range;
63 
64  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
65  index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
66  index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
67  : total_size(total_size_),
68  non_scan_size(non_scan_size_),
69  scan_size(scan_size_),
70  non_scan_stride(non_scan_stride_),
71  scan_stride(scan_stride_),
72  panel_threads(panel_threads_),
73  group_threads(group_threads_),
74  block_threads(block_threads_),
75  elements_per_group(elements_per_group_),
76  elements_per_block(elements_per_block_),
77  loop_range(loop_range_) {}
78 };
79 
80 enum class scan_step { first, second };
81 template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
82  scan_step stp>
83 struct ScanKernelFunctor {
84  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
87 
89  Evaluator dev_eval;
90  OutAccessor out_accessor;
91  OutAccessor temp_accessor;
92  const ScanParameters<Index> scanParameters;
93  Op accumulator;
94  const bool inclusive;
95  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
96  OutAccessor out_accessor_, OutAccessor temp_accessor_,
97  const ScanParameters<Index> scanParameters_, Op accumulator_,
98  const bool inclusive_)
99  : scratch(scratch_),
100  dev_eval(dev_eval_),
101  out_accessor(out_accessor_),
102  temp_accessor(temp_accessor_),
103  scanParameters(scanParameters_),
104  accumulator(accumulator_),
105  inclusive(inclusive_) {}
106 
107  template <scan_step sst = stp, typename Input>
110  read(const Input &inpt, Index global_id) {
111  return inpt.coeff(global_id);
112  }
113 
114  template <scan_step sst = stp, typename Input>
117  read(const Input &inpt, Index global_id) {
118  return inpt[global_id];
119  }
120 
121  template <scan_step sst = stp, typename InclusiveOp>
123  first_step_inclusive_Operation(InclusiveOp inclusive_op) {
124  inclusive_op();
125  }
126 
127  template <scan_step sst = stp, typename InclusiveOp>
129  first_step_inclusive_Operation(InclusiveOp) {}
130 
131  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
132  auto out_ptr = out_accessor.get_pointer();
133  auto tmp_ptr = temp_accessor.get_pointer();
134  auto scratch_ptr = scratch.get_pointer().get();
135 
136  for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
137  Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
138  Index tmp = data_offset % scanParameters.panel_threads;
139  const Index panel_id = data_offset / scanParameters.panel_threads;
140  const Index group_id = tmp / scanParameters.group_threads;
141  tmp = tmp % scanParameters.group_threads;
142  const Index block_id = tmp / scanParameters.block_threads;
143  const Index local_id = tmp % scanParameters.block_threads;
144  // we put one element per packet in scratch_mem
145  const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
146  const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
147  CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
148  CoeffReturnType inclusive_scan;
149  // the actual panel size is scan_size * non_scan_size.
150  // elements_per_panel is roundup to power of 2 for binary tree
151  const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
152  const Index group_offset = group_id * scanParameters.non_scan_stride;
153  // This will be effective when the size is bigger than elements_per_block
156  const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
157  Index next_elements = 0;
159  for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
160  Index global_id = global_offset + next_elements;
161  private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
163  (global_id < scanParameters.total_size))
164  ? read(dev_eval, global_id)
166  next_elements += scanParameters.scan_stride;
167  }
169  if (inclusive) {
170  inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
171  }
172  });
173  // This for loop must be 2
175  for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
176  Index private_offset = 1;
177  // build sum in place up the tree
179  for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
181  for (Index l = 0; l < d; l++) {
182  Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
183  Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
184  CoeffReturnType accum = accumulator.initialize();
185  accumulator.reduce(private_scan[ai], &accum);
186  accumulator.reduce(private_scan[bi], &accum);
187  private_scan[bi] = accumulator.finalize(accum);
188  }
189  private_offset *= 2;
190  }
191  scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
192  private_scan[PacketSize - 1 + packetIndex];
193  private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
194  // traverse down tree & build scan
196  for (Index d = 1; d < PacketSize; d *= 2) {
197  private_offset >>= 1;
199  for (Index l = 0; l < d; l++) {
200  Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
201  Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
202  CoeffReturnType accum = accumulator.initialize();
203  accumulator.reduce(private_scan[ai], &accum);
204  accumulator.reduce(private_scan[bi], &accum);
205  private_scan[ai] = private_scan[bi];
206  private_scan[bi] = accumulator.finalize(accum);
207  }
208  }
209  }
210 
211  Index offset = 1;
212  // build sum in place up the tree
213  for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
214  // Synchronise
215  itemID.barrier(cl::sycl::access::fence_space::local_space);
216  if (local_id < d) {
217  Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
218  Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
219  CoeffReturnType accum = accumulator.initialize();
220  accumulator.reduce(scratch_ptr[ai], &accum);
221  accumulator.reduce(scratch_ptr[bi], &accum);
222  scratch_ptr[bi] = accumulator.finalize(accum);
223  }
224  offset *= 2;
225  }
226  // Synchronise
227  itemID.barrier(cl::sycl::access::fence_space::local_space);
228  // next step optimisation
229  if (local_id == 0) {
234  block_id;
235  tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
236  }
237  // clear the last element
238  scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
239  }
240  // traverse down tree & build scan
241  for (Index d = 1; d < scratch_stride; d *= 2) {
242  offset >>= 1;
243  // Synchronise
244  itemID.barrier(cl::sycl::access::fence_space::local_space);
245  if (local_id < d) {
246  Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
247  Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
248  CoeffReturnType accum = accumulator.initialize();
249  accumulator.reduce(scratch_ptr[ai], &accum);
250  accumulator.reduce(scratch_ptr[bi], &accum);
251  scratch_ptr[ai] = scratch_ptr[bi];
252  scratch_ptr[bi] = accumulator.finalize(accum);
253  }
254  }
255  // Synchronise
256  itemID.barrier(cl::sycl::access::fence_space::local_space);
257  // This for loop must be 2
259  for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
261  for (Index i = 0; i < PacketSize; i++) {
262  CoeffReturnType accum = private_scan[packetIndex + i];
263  accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
264  private_scan[packetIndex + i] = accumulator.finalize(accum);
265  }
266  }
268  if (inclusive) {
269  accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
270  private_scan[0] = accumulator.finalize(inclusive_scan);
271  }
272  });
273  next_elements = 0;
274  // right the first set of private param
276  for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
277  Index global_id = global_offset + next_elements;
278  if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
280  (global_id < scanParameters.total_size)) {
281  Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
282  out_ptr[global_id] = private_scan[private_id];
283  }
284  next_elements += scanParameters.scan_stride;
285  }
286  } // end for loop
287  }
288 };
289 
290 template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
291 struct ScanAdjustmentKernelFunctor {
292  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
295  InAccessor in_accessor;
296  OutAccessor out_accessor;
297  const ScanParameters<Index> scanParameters;
298  Op accumulator;
300  OutAccessor out_accessor_,
301  const ScanParameters<Index> scanParameters_,
302  Op accumulator_)
303  : in_accessor(in_accessor_),
304  out_accessor(out_accessor_),
305  scanParameters(scanParameters_),
306  accumulator(accumulator_) {}
307 
308  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
309  auto in_ptr = in_accessor.get_pointer();
310  auto out_ptr = out_accessor.get_pointer();
311 
312  for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
313  Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
314  Index tmp = data_offset % scanParameters.panel_threads;
315  const Index panel_id = data_offset / scanParameters.panel_threads;
316  const Index group_id = tmp / scanParameters.group_threads;
317  tmp = tmp % scanParameters.group_threads;
318  const Index block_id = tmp / scanParameters.block_threads;
319  const Index local_id = tmp % scanParameters.block_threads;
320 
321  // the actual panel size is scan_size * non_scan_size.
322  // elements_per_panel is roundup to power of 2 for binary tree
323  const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
324  const Index group_offset = group_id * scanParameters.non_scan_stride;
325  // This will be effective when the size is bigger than elements_per_block
326  const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
327  const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
328 
329  const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
331  const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
332  CoeffReturnType adjust_val = in_ptr[in_id];
333 
334  Index next_elements = 0;
336  for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
337  Index global_id = global_offset + next_elements;
338  if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
340  (global_id < scanParameters.total_size)) {
341  CoeffReturnType accum = adjust_val;
342  accumulator.reduce(out_ptr[global_id], &accum);
343  out_ptr[global_id] = accumulator.finalize(accum);
344  }
345  next_elements += scanParameters.scan_stride;
346  }
347  }
348  }
349 };
350 
351 template <typename Index>
352 struct ScanInfo {
353  const Index &total_size;
354  const Index &scan_size;
355  const Index &panel_size;
356  const Index &non_scan_size;
358  const Index &non_scan_stride;
359 
370  const Eigen::SyclDevice &dev;
371  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
372  const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
373  const Eigen::SyclDevice &dev_)
374  : total_size(total_size_),
375  scan_size(scan_size_),
376  panel_size(panel_size_),
377  non_scan_size(non_scan_size_),
378  scan_stride(scan_stride_),
379  non_scan_stride(non_scan_stride_),
380  dev(dev_) {
381  // must be power of 2
382  local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
383  Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
384 
386 
388  dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
389  const Index elements_per_panel = elements_per_group * non_scan_size;
395 #ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
397 #else
398  const Index max_threads = panel_threads * panel_size;
399 #endif
400  global_range = roundUp(max_threads, local_range);
401  loop_range = Index(
402  std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
403  }
407  }
408  inline cl::sycl::nd_range<1> get_thread_range() {
409  return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
410  }
411 };
412 
413 template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
414 struct SYCLAdjustBlockOffset {
415  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
416  Reducer &accumulator, const Index total_size,
417  const Index scan_size, const Index panel_size,
418  const Index non_scan_size, const Index scan_stride,
419  const Index non_scan_stride, const Eigen::SyclDevice &dev) {
420  auto scan_info =
421  ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
422 
424  AdjustFuctor;
425  dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
426  scan_info.max_elements_per_block,
427  scan_info.get_scan_parameter(), accumulator);
428  }
429 };
430 
431 template <typename CoeffReturnType, scan_step stp>
433  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
434  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
435  const Index total_size, const Index scan_size, const Index panel_size,
436  const Index non_scan_size, const Index scan_stride,
437  const Index non_scan_stride, const bool inclusive,
438  const Eigen::SyclDevice &dev) {
439  auto scan_info =
440  ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
441  const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
442  const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
443  CoeffReturnType *temp_pointer =
444  static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
445  EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
446 
448  dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
449  in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
450  scan_info.get_scan_parameter(), accumulator, inclusive);
451 
452  if (scan_info.block_size > 1) {
454  tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
455  non_scan_size, Index(1), scan_info.block_size, false, dev);
456 
458  tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
459  non_scan_stride, dev);
460  }
461  dev.deallocate_temp(temp_pointer);
462  }
463 };
464 
465 } // namespace internal
466 } // namespace TensorSycl
467 namespace internal {
468 template <typename Self, typename Reducer, bool vectorize>
469 struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
470  typedef typename Self::Index Index;
471  typedef typename Self::CoeffReturnType CoeffReturnType;
472  typedef typename Self::Storage Storage;
473  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
474  void operator()(Self &self, EvaluatorPointerType data) {
475  const Index total_size = internal::array_prod(self.dimensions());
476  const Index scan_size = self.size();
477  const Index scan_stride = self.stride();
478  // this is the scan op (can be sum or ...)
479  auto accumulator = self.accumulator();
480  auto inclusive = !self.exclusive();
481  auto consume_dim = self.consume_dim();
482  auto dev = self.device();
483 
484  auto dims = self.inner().dimensions();
485 
486  Index non_scan_size = 1;
487  Index panel_size = 1;
488  if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
489  for (int i = 0; i < consume_dim; i++) {
490  non_scan_size *= dims[i];
491  }
492  for (int i = consume_dim + 1; i < Self::NumDims; i++) {
493  panel_size *= dims[i];
494  }
495  } else {
496  for (int i = Self::NumDims - 1; i > consume_dim; i--) {
497  non_scan_size *= dims[i];
498  }
499  for (int i = consume_dim - 1; i >= 0; i--) {
500  panel_size *= dims[i];
501  }
502  }
503  const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
504  auto eval_impl = self.inner();
506  eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
507  inclusive, dev);
508  }
509 };
510 } // namespace internal
511 } // namespace Eigen
512 
513 #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
Eigen::TensorSycl::internal::ScanParameters::ScanPerThread
static EIGEN_CONSTEXPR index_t ScanPerThread
Definition: TensorScanSycl.h:117
Eigen::TensorSycl::internal::ScanAdjustmentKernelFunctor::accumulator
Op accumulator
Definition: TensorScanSycl.h:364
Eigen::TensorSycl::internal::ScanInfo::global_range
Index global_range
Definition: TensorScanSycl.h:434
Eigen::TensorSycl::internal::SYCLAdjustBlockOffset::adjust_scan_block_offset
static EIGEN_STRONG_INLINE void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, const Index total_size, const Index scan_size, const Index panel_size, const Index non_scan_size, const Index scan_stride, const Index non_scan_stride, const Eigen::SyclDevice &dev)
Definition: TensorScanSycl.h:481
EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
Eigen
Namespace containing all symbols from the Eigen library.
Definition: jet.h:637
Eigen::TensorSycl::internal::ScanParameters::panel_threads
const index_t panel_threads
Definition: TensorScanSycl.h:123
dimensions
const std::vector< size_t > dimensions
Definition: testVerticalBlockMatrix.cpp:27
Eigen::TensorSycl::internal::ScanKernelFunctor::ScanKernelFunctor
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, OutAccessor out_accessor_, OutAccessor temp_accessor_, const ScanParameters< Index > scanParameters_, Op accumulator_, const bool inclusive_)
Definition: TensorScanSycl.h:161
Eigen::TensorSycl::internal::ScanParameters::block_threads
const index_t block_threads
Definition: TensorScanSycl.h:125
Eigen::TensorSycl::internal::ScanInfo::scan_stride
const Index & scan_stride
Definition: TensorScanSycl.h:423
gtsam.examples.DogLegOptimizerExample.type
type
Definition: DogLegOptimizerExample.py:111
Eigen::TensorSycl::internal::ScanKernelFunctor::accumulator
Op accumulator
Definition: TensorScanSycl.h:159
Eigen::TensorSycl::internal::ScanKernelFunctor::inclusive
const bool inclusive
Definition: TensorScanSycl.h:160
Eigen::TensorSycl::internal::ScanInfo
Definition: TensorScanSycl.h:418
d
static const double d[K][N]
Definition: igam.h:11
Eigen::TensorSycl::internal::ScanKernelFunctor::temp_accessor
OutAccessor temp_accessor
Definition: TensorScanSycl.h:157
Eigen::TensorSycl::internal::ScanInfo::block_size
Index block_size
Definition: TensorScanSycl.h:427
Eigen::TensorSycl::internal::ScanInfo::get_scan_parameter
ScanParameters< Index > get_scan_parameter()
Definition: TensorScanSycl.h:470
Eigen::TensorSycl::internal::ScanInfo::panel_threads
Index panel_threads
Definition: TensorScanSycl.h:428
Eigen::TensorSycl::internal::ScanInfo::non_scan_size
const Index & non_scan_size
Definition: TensorScanSycl.h:422
Eigen::internal::ScanLauncher::operator()
void operator()(Self &self, typename Self::CoeffReturnType *data)
Definition: TensorScan.h:192
Eigen::TensorSycl::internal::ScanParameters::loop_range
const index_t loop_range
Definition: TensorScanSycl.h:128
Eigen::TensorSycl::internal::ScanParameters::scan_stride
const index_t scan_stride
Definition: TensorScanSycl.h:122
Eigen::TensorSycl::internal::ScanAdjustmentKernelFunctor::operator()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
Definition: TensorScanSycl.h:374
Eigen::TensorSycl::internal::ScanParameters::group_threads
const index_t group_threads
Definition: TensorScanSycl.h:124
Eigen::TensorSycl::internal::ScanParameters::ScanParameters
ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
Definition: TensorScanSycl.h:130
EIGEN_CONSTEXPR
#define EIGEN_CONSTEXPR
Definition: Macros.h:787
Eigen::TensorSycl::internal::ScanInfo::loop_range
Index loop_range
Definition: TensorScanSycl.h:433
Eigen::TensorSycl::internal::ScanParameters::total_size
const index_t total_size
Definition: TensorScanSycl.h:118
Eigen::TensorSycl::internal::ScanInfo::get_thread_range
cl::sycl::nd_range< 1 > get_thread_range()
Definition: TensorScanSycl.h:474
Eigen::TensorSycl::internal::ScanAdjustmentKernelFunctor::scanParameters
const ScanParameters< Index > scanParameters
Definition: TensorScanSycl.h:363
Eigen::TensorSycl::internal::ScanLauncher_impl::scan_block
static EIGEN_STRONG_INLINE void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, const Index total_size, const Index scan_size, const Index panel_size, const Index non_scan_size, const Index scan_stride, const Index non_scan_stride, const bool inclusive, const Eigen::SyclDevice &dev)
Definition: TensorScanSycl.h:500
Eigen::TensorSycl::internal::ScanAdjustmentKernelFunctor::LocalAccessor
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition: TensorScanSycl.h:359
Eigen::TensorSycl::internal::ScanAdjustmentKernelFunctor::ScanAdjustmentKernelFunctor
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_, OutAccessor out_accessor_, const ScanParameters< Index > scanParameters_, Op accumulator_)
Definition: TensorScanSycl.h:365
Eigen::TensorSycl::internal::scan_step
scan_step
Definition: TensorScanSycl.h:146
Eigen::TensorSycl::internal::ScanKernelFunctor::LocalAccessor
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition: TensorScanSycl.h:151
Eigen::TensorSycl::internal::ScanKernelFunctor
Definition: TensorScanSycl.h:149
Eigen::TensorSycl::internal::ScanAdjustmentKernelFunctor::out_accessor
OutAccessor out_accessor
Definition: TensorScanSycl.h:362
Eigen::TensorSycl::internal::ScanParameters::scan_size
const index_t scan_size
Definition: TensorScanSycl.h:120
Eigen::TensorSycl::internal::ScanKernelFunctor::PacketSize
static EIGEN_CONSTEXPR int PacketSize
Definition: TensorScanSycl.h:152
data
int data[]
Definition: Map_placement_new.cpp:1
Eigen::TensorSycl::internal::scan_step::first
@ first
Eigen::TensorSycl::internal::ScanInfo::group_threads
Index group_threads
Definition: TensorScanSycl.h:429
Eigen::TensorSycl::internal::ScanKernelFunctor::scanParameters
const ScanParameters< Index > scanParameters
Definition: TensorScanSycl.h:158
Eigen::TensorSycl::internal::ScanParameters::non_scan_stride
const index_t non_scan_stride
Definition: TensorScanSycl.h:121
Eigen::TensorSycl::internal::ScanLauncher_impl
Definition: TensorScanSycl.h:498
Eigen::TensorSycl::internal::ScanParameters::non_scan_size
const index_t non_scan_size
Definition: TensorScanSycl.h:119
l
static const Line3 l(Rot3(), 1, 1)
EIGEN_STRONG_INLINE
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
EIGEN_UNROLL_LOOP
#define EIGEN_UNROLL_LOOP
Definition: Macros.h:1461
Eigen::TensorSycl::internal::ScanInfo::total_size
const Index & total_size
Definition: TensorScanSycl.h:419
Eigen::TensorSycl::internal::ScanInfo::scan_size
const Index & scan_size
Definition: TensorScanSycl.h:420
vectorize
detail::vectorize_helper< Return(*)(Args...), Return, Args... > vectorize(Return(*f)(Args...))
Definition: numpy.h:2096
Eigen::TensorSycl::internal::ScanInfo::elements_per_block
Index elements_per_block
Definition: TensorScanSycl.h:432
Eigen::TensorSycl::internal::ScanInfo::block_threads
Index block_threads
Definition: TensorScanSycl.h:430
Eigen::TensorSycl::internal::ScanParameters::elements_per_block
const index_t elements_per_block
Definition: TensorScanSycl.h:127
Eigen::TensorSycl::internal::ScanKernelFunctor::read
typename ::Eigen::internal::enable_if< sst==scan_step::first, CoeffReturnType >::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(const Input &inpt, Index global_id)
Definition: TensorScanSycl.h:176
Eigen::TensorSycl::internal::ScanKernelFunctor::first_step_inclusive_Operation
typename ::Eigen::internal::enable_if< sst==scan_step::first >::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(InclusiveOp inclusive_op)
Definition: TensorScanSycl.h:189
Eigen::TensorSycl::internal::ScanKernelFunctor::out_accessor
OutAccessor out_accessor
Definition: TensorScanSycl.h:156
Eigen::TensorSycl::internal::ScanParameters
Definition: TensorScanSycl.h:115
Eigen::internal::array_prod
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes< Indices... > &)
Definition: TensorDimensions.h:140
offset
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Definition: gnuplot_common_settings.hh:64
Eigen::TensorSycl::internal::ScanKernelFunctor::dev_eval
Evaluator dev_eval
Definition: TensorScanSycl.h:155
Eigen::TensorSycl::internal::ScanInfo::max_elements_per_block
Index max_elements_per_block
Definition: TensorScanSycl.h:426
Eigen::TensorSycl::internal::ScanInfo::panel_size
const Index & panel_size
Definition: TensorScanSycl.h:421
Eigen::TensorSycl::internal::ScanInfo::dev
const Eigen::SyclDevice & dev
Definition: TensorScanSycl.h:436
min
#define min(a, b)
Definition: datatypes.h:19
Eigen::TensorSycl::internal::ScanKernelFunctor::scratch
LocalAccessor scratch
Definition: TensorScanSycl.h:154
Eigen::TensorSycl::internal::ScanInfo::elements_per_group
Index elements_per_group
Definition: TensorScanSycl.h:431
ceil
const EIGEN_DEVICE_FUNC CeilReturnType ceil() const
Definition: ArrayCwiseUnaryOps.h:495
Eigen::TensorSycl::internal::scan_step::second
@ second
Eigen::TensorSycl::internal::ScanInfo::ScanInfo
EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, const Eigen::SyclDevice &dev_)
Definition: TensorScanSycl.h:437
internal
Definition: BandTriangularSolver.h:13
Eigen::TensorSycl::internal::ScanInfo::local_range
Index local_range
Definition: TensorScanSycl.h:435
Eigen::ColMajor
@ ColMajor
Definition: Constants.h:319
Eigen::TensorSycl::internal::ScanAdjustmentKernelFunctor
Definition: TensorScanSycl.h:357
Eigen::TensorSycl::internal::ScanAdjustmentKernelFunctor::PacketSize
static EIGEN_CONSTEXPR int PacketSize
Definition: TensorScanSycl.h:360
Eigen::TensorSycl::internal::ScanParameters::elements_per_group
const index_t elements_per_group
Definition: TensorScanSycl.h:126
Eigen::TensorSycl::internal::ScanKernelFunctor::operator()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
Definition: TensorScanSycl.h:197
Eigen::TensorSycl::internal::ScanAdjustmentKernelFunctor::in_accessor
InAccessor in_accessor
Definition: TensorScanSycl.h:361
EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE
Definition: TensorScanSycl.h:111
i
int i
Definition: BiCGSTAB_step_by_step.cpp:9
gtsam::lago::initialize
Values initialize(const NonlinearFactorGraph &graph, bool useOdometricPath)
Definition: lago.cpp:375
Eigen::TensorSycl::internal::ScanInfo::non_scan_stride
const Index & non_scan_stride
Definition: TensorScanSycl.h:424
Eigen::Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74


gtsam
Author(s):
autogenerated on Sat Jan 4 2025 04:05:30