TensorExecutor.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
12 
13 namespace Eigen {
14 
29 namespace internal {
30 
39 // TODO(ezhulenev): Add specializations for all other types of Tensor ops.
40 
41 template<typename Expression>
43  enum { value = false };
44 };
45 
46 template<typename LhsXprType, typename RhsXprType>
48  const TensorAssignOp<LhsXprType, RhsXprType> > {
50 };
51 
52 template<typename UnaryOp, typename XprType>
54  const TensorCwiseUnaryOp<UnaryOp, XprType> > {
56 };
57 
58 template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
60  const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
61  enum {
64  };
65 };
66 
67 template<typename Broadcast, typename XprType>
69  const TensorBroadcastingOp<Broadcast, XprType> > {
70  enum { value = true };
71 };
72 
73 // -------------------------------------------------------------------------- //
74 
79 template <typename Expression, typename Device, bool Vectorizable,
80  TiledEvaluation Tiling>
82  public:
83  typedef typename Expression::Index StorageIndex;
84 
85  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
86  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
87  // violation. If this template is instantiated with a non-default device, it
88  // means that this header file was included without defining
89  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
91  "Default executor instantiated with non-default device. "
92  "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
93  "EIGEN_USE_SYCL before including Eigen headers.");
94 
96  static EIGEN_STRONG_INLINE void run(const Expression& expr,
97  const Device& device = Device()) {
99  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
100  if (needs_assign) {
101  const StorageIndex size = array_prod(evaluator.dimensions());
102  for (StorageIndex i = 0; i < size; ++i) {
103  evaluator.evalScalar(i);
104  }
105  }
106  evaluator.cleanup();
107  }
108 };
109 
114 template <typename Expression, typename Device, typename DoneCallback,
115  bool Vectorizable, TiledEvaluation Tiling>
117 
121 template <typename Expression>
122 class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
123  /*Tiling=*/TiledEvaluation::Off> {
124  public:
125  typedef typename Expression::Index StorageIndex;
126 
129  const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
131  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
132  if (needs_assign) {
133  const StorageIndex size = array_prod(evaluator.dimensions());
134  const int PacketSize = unpacket_traits<typename TensorEvaluator<
135  Expression, DefaultDevice>::PacketReturnType>::size;
136 
137  // Give compiler a strong possibility to unroll the loop. But don't insist
138  // on unrolling, because if the function is expensive compiler should not
139  // unroll the loop at the expense of inlining.
140  const StorageIndex UnrolledSize =
141  (size / (4 * PacketSize)) * 4 * PacketSize;
142  for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
143  for (StorageIndex j = 0; j < 4; j++) {
144  evaluator.evalPacket(i + j * PacketSize);
145  }
146  }
147  const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
148  for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
149  evaluator.evalPacket(i);
150  }
151  for (StorageIndex i = VectorizedSize; i < size; ++i) {
152  evaluator.evalScalar(i);
153  }
154  }
155  evaluator.cleanup();
156  }
157 };
158 
163 template <typename Expression, bool Vectorizable>
164 class TensorExecutor<Expression, DefaultDevice, Vectorizable,
165  /*Tiling=*/TiledEvaluation::On> {
166  public:
169 
172 
173  static const int NumDims = traits<Expression>::NumDimensions;
174 
176  static EIGEN_STRONG_INLINE void run(const Expression& expr,
177  const DefaultDevice& device = DefaultDevice()) {
180 
182  TensorBlockDesc;
184  TensorBlockScratch;
185 
186  Evaluator evaluator(expr, device);
187 
188  // TODO(ezhulenev): Do not use tiling for small tensors?
189  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
190 
191  if (needs_assign) {
192  // Query expression tree for desired block size/shape.
193  const TensorBlockResourceRequirements requirements =
194  evaluator.getResourceRequirements();
195 
196  const TensorBlockMapper block_mapper(
197  typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
198  requirements);
199 
200  // Share scratch memory allocator between all blocks.
201  TensorBlockScratch scratch(device);
202 
203  const StorageIndex total_block_count = block_mapper.blockCount();
204  for (StorageIndex i = 0; i < total_block_count; ++i) {
205  TensorBlockDesc desc = block_mapper.blockDescriptor(i);
206  evaluator.evalBlock(desc, scratch);
207  scratch.reset();
208  }
209  }
210  evaluator.cleanup();
211  }
212 };
213 
225 #ifdef EIGEN_USE_THREADS
226 
227 template <typename TensorBlockMapper>
228 struct TensorExecutorTilingContext {
229  TensorExecutorTilingContext() = default;
230  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
231  const TensorOpCost& b_cost, size_t b_aligned_size)
232  : block_mapper(b_mapper),
233  cost(b_cost),
234  aligned_blocksize(b_aligned_size) {}
235 
236  TensorBlockMapper block_mapper; // navigate through blocks
237  TensorOpCost cost; // cost of computing a single block
238  size_t aligned_blocksize; // block size after memory alignment
239 };
240 
241 // Computes a block evaluation parameters, and allocates temporary memory buffer
242 // for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
243 template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
244 TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
245  const Evaluator& evaluator) {
246  // Query expression tree for desired block size/shape.
247  TensorBlockResourceRequirements requirements =
248  evaluator.getResourceRequirements();
249 
250  // Update target block size based on cost model.
252  1, requirements.cost_per_coeff);
253  requirements.size = static_cast<size_t>(1.0 / taskSize);
254 
255  TensorBlockMapper block_mapper(
256  typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
257  requirements);
258 
259  size_t block_size = block_mapper.blockTotalSize();
260  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
261  const size_t aligned_blocksize =
262  align *
263  divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
264 
265  return {block_mapper, requirements.cost_per_coeff * block_size,
266  aligned_blocksize};
267 }
268 
269 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
270 struct EvalRange {
271  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
272  const StorageIndex lastIdx) {
273  Evaluator evaluator = *evaluator_in;
274  eigen_assert(lastIdx >= firstIdx);
275  for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
276  evaluator.evalScalar(i);
277  }
278  }
279 
280  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
281 };
282 
283 template <typename Evaluator, typename StorageIndex>
284 struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
285  static const int PacketSize =
287 
288  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
289  const StorageIndex lastIdx) {
290  Evaluator evaluator = *evaluator_in;
291  eigen_assert(lastIdx >= firstIdx);
292  StorageIndex i = firstIdx;
293  if (lastIdx - firstIdx >= PacketSize) {
294  eigen_assert(firstIdx % PacketSize == 0);
295  StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
296  // Give compiler a strong possibility to unroll the loop. But don't insist
297  // on unrolling, because if the function is expensive compiler should not
298  // unroll the loop at the expense of inlining.
299  for (; i <= last_chunk_offset; i += 4 * PacketSize) {
300  for (StorageIndex j = 0; j < 4; j++) {
301  evaluator.evalPacket(i + j * PacketSize);
302  }
303  }
304  last_chunk_offset = lastIdx - PacketSize;
305  for (; i <= last_chunk_offset; i += PacketSize) {
306  evaluator.evalPacket(i);
307  }
308  }
309  for (; i < lastIdx; ++i) {
310  evaluator.evalScalar(i);
311  }
312  }
313 
314  static StorageIndex alignBlockSize(StorageIndex size) {
315  // Align block size to packet size and account for unrolling in run above.
316  if (size >= 16 * PacketSize) {
317  return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
318  }
319  // Aligning to 4 * PacketSize would increase block size by more than 25%.
320  return (size + PacketSize - 1) & ~(PacketSize - 1);
321  }
322 };
323 
324 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
325 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
326  public:
327  typedef typename Expression::Index StorageIndex;
328 
329  static EIGEN_STRONG_INLINE void run(const Expression& expr,
330  const ThreadPoolDevice& device) {
331  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
332  typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
333 
334  Evaluator evaluator(expr, device);
335  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
336  if (needs_assign) {
337  const StorageIndex size = array_prod(evaluator.dimensions());
338  device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
339  EvalRange::alignBlockSize,
340  [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
341  EvalRange::run(&evaluator, firstIdx, lastIdx);
342  });
343  }
344  evaluator.cleanup();
345  }
346 };
347 
348 template <typename Expression, bool Vectorizable>
349 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
350  /*Tiling=*/TiledEvaluation::On> {
351  public:
352  typedef typename traits<Expression>::Index IndexType;
353  typedef typename traits<Expression>::Scalar Scalar;
354  typedef typename remove_const<Scalar>::type ScalarNoConst;
355 
356  static const int NumDims = traits<Expression>::NumDimensions;
357 
358  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
359  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
360  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
361 
362  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
363  TensorBlockDesc;
364  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
365  TensorBlockScratch;
366 
367  static EIGEN_STRONG_INLINE void run(const Expression& expr,
368  const ThreadPoolDevice& device) {
369  Evaluator evaluator(expr, device);
370 
371  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
372  if (needs_assign) {
373  const TilingContext tiling =
374  internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
375  Vectorizable>(evaluator);
376 
377  auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
378  IndexType lastBlockIdx) {
379  TensorBlockScratch scratch(device);
380 
381  for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
382  ++block_idx) {
383  TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
384  evaluator.evalBlock(desc, scratch);
385  scratch.reset();
386  }
387  };
388 
389  // Evaluate small expressions directly as a single block.
390  if (tiling.block_mapper.blockCount() == 1) {
391  TensorBlockScratch scratch(device);
392  TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
393  evaluator.evalBlock(desc, scratch);
394  } else {
395  device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
396  eval_block);
397  }
398  }
399  evaluator.cleanup();
400  }
401 };
402 
403 template <typename Expression, typename DoneCallback, bool Vectorizable,
404  TiledEvaluation Tiling>
405 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
406  Vectorizable, Tiling> {
407  public:
408  typedef typename Expression::Index StorageIndex;
409  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
410 
411  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
412  const ThreadPoolDevice& device,
413  DoneCallback done) {
414  TensorAsyncExecutorContext* const ctx =
415  new TensorAsyncExecutorContext(expr, device, std::move(done));
416 
417  const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
418  if (!need_assign) {
419  delete ctx;
420  return;
421  }
422 
423  typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
424  const StorageIndex size = array_prod(ctx->evaluator.dimensions());
425  device.parallelForAsync(
426  size, ctx->evaluator.costPerCoeff(Vectorizable),
427  EvalRange::alignBlockSize,
428  [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
429  EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
430  },
431  [ctx]() { delete ctx; });
432  };
433 
434  ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
435  }
436 
437  private:
438  struct TensorAsyncExecutorContext {
439  TensorAsyncExecutorContext(const Expression& expr,
440  const ThreadPoolDevice& thread_pool,
441  DoneCallback done)
442  : evaluator(expr, thread_pool), on_done(std::move(done)) {}
443 
444  ~TensorAsyncExecutorContext() {
445  evaluator.cleanup();
446  on_done();
447  }
448 
449  Evaluator evaluator;
450 
451  private:
452  DoneCallback on_done;
453  };
454 };
455 
456 template <typename Expression, typename DoneCallback, bool Vectorizable>
457 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
458  Vectorizable, /*Tileable*/ TiledEvaluation::On> {
459  public:
460  typedef typename traits<Expression>::Index IndexType;
461  typedef typename traits<Expression>::Scalar Scalar;
462  typedef typename remove_const<Scalar>::type ScalarNoConst;
463 
464  static const int NumDims = traits<Expression>::NumDimensions;
465 
466  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
467  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
468  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
469 
470  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
471  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
472  TensorBlockScratch;
473 
474  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
475  const ThreadPoolDevice& device,
476  DoneCallback done) {
477 
478  TensorAsyncExecutorContext* const ctx =
479  new TensorAsyncExecutorContext(expr, device, std::move(done));
480 
481  const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
482  if (!need_assign) {
483  delete ctx;
484  return;
485  }
486 
487  ctx->tiling = internal::GetTensorExecutorTilingContext<
488  Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
489 
490  auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
491  TensorBlockScratch scratch(ctx->device);
492 
493  for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
494  ++block_idx) {
495  TensorBlockDesc desc =
496  ctx->tiling.block_mapper.blockDescriptor(block_idx);
497  ctx->evaluator.evalBlock(desc, scratch);
498  scratch.reset();
499  }
500  };
501 
502  // Evaluate small expressions directly as a single block.
503  if (ctx->tiling.block_mapper.blockCount() == 1) {
504  TensorBlockScratch scratch(ctx->device);
505  TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
506  ctx->evaluator.evalBlock(desc, scratch);
507  delete ctx;
508  } else {
509  ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
510  ctx->tiling.cost, eval_block,
511  [ctx]() { delete ctx; });
512  }
513  };
514 
515  ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
516  }
517 
518  private:
519  struct TensorAsyncExecutorContext {
520  TensorAsyncExecutorContext(const Expression& expr,
521  const ThreadPoolDevice& thread_pool,
522  DoneCallback done)
523  : device(thread_pool),
524  evaluator(expr, thread_pool),
525  on_done(std::move(done)) {}
526 
527  ~TensorAsyncExecutorContext() {
528  evaluator.cleanup();
529  on_done();
530  }
531 
532  const ThreadPoolDevice& device;
533  Evaluator evaluator;
534  TilingContext tiling;
535 
536  private:
537  DoneCallback on_done;
538  };
539 };
540 
541 #endif // EIGEN_USE_THREADS
542 
543 // GPU: the evaluation of the expression is offloaded to a GPU.
544 #if defined(EIGEN_USE_GPU)
545 
546 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
547 class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
548  public:
549  typedef typename Expression::Index StorageIndex;
550  static void run(const Expression& expr, const GpuDevice& device);
551 };
552 
553 #if defined(EIGEN_GPUCC)
554 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
555 struct EigenMetaKernelEval {
557  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
558  for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
559  eval.evalScalar(i);
560  }
561  }
562 };
563 
564 template <typename Evaluator, typename StorageIndex>
565 struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
567  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
568  const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
569  const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
570  const StorageIndex vectorized_step_size = step_size * PacketSize;
571 
572  // Use the vector path
573  for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
574  i += vectorized_step_size) {
575  eval.evalPacket(i);
576  }
577  for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
578  eval.evalScalar(i);
579  }
580  }
581 };
582 
583 template <typename Evaluator, typename StorageIndex>
584 __global__ void
585 __launch_bounds__(1024)
586 EigenMetaKernel(Evaluator eval, StorageIndex size) {
587 
588  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
589  const StorageIndex step_size = blockDim.x * gridDim.x;
590 
591  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
593 }
594 
595 /*static*/
596 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
598  const Expression& expr, const GpuDevice& device) {
599  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
600  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
601  if (needs_assign) {
602 
603  const int block_size = device.maxGpuThreadsPerBlock();
604  const int max_blocks = device.getNumGpuMultiProcessors() *
605  device.maxGpuThreadsPerMultiProcessor() / block_size;
606  const StorageIndex size = array_prod(evaluator.dimensions());
607  // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
608  const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
609 
610  LAUNCH_GPU_KERNEL(
611  (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
612  num_blocks, block_size, 0, device, evaluator, size);
613  }
614  evaluator.cleanup();
615 }
616 
617 #endif // EIGEN_GPUCC
618 #endif // EIGEN_USE_GPU
619 
620 // SYCL Executor policy
621 #ifdef EIGEN_USE_SYCL
622 
623 template <typename Evaluator>
624 struct ExecExprFunctorKernel {
625  typedef typename Evaluator::Index Index;
626  Evaluator evaluator;
627  const Index range;
628  template <typename Scratch>
629  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
630  const Scratch, Evaluator evaluator_, const Index range_)
631  : evaluator(evaluator_), range(range_) {}
632 
634  cl::sycl::nd_item<1> itemID) {
635  compute(itemID);
636  }
637  template <bool is_vec = Evaluator::PacketAccess>
639  compute(const cl::sycl::nd_item<1>& itemID) {
640  Index gId = static_cast<Index>(itemID.get_global_linear_id());
641  Index total_threads = itemID.get_global_range(0);
642 
643  for (Index i = gId; i < range; i += total_threads) {
644  evaluator.evalScalar(i);
645  }
646  }
647  template <bool is_vec = Evaluator::PacketAccess>
649  compute(const cl::sycl::nd_item<1>& itemID) {
650  const Index vectorizedRange =
651  (range / Evaluator::PacketSize) * Evaluator::PacketSize;
652  Index gId = static_cast<Index>(itemID.get_global_linear_id());
653  const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
654  const Index start = Evaluator::PacketSize * gId;
655  for (Index i = start; i < vectorizedRange; i += step) {
656  evaluator.evalPacket(i);
657  }
658  gId += vectorizedRange;
659  for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
660  evaluator.evalScalar(i);
661  }
662  }
663 };
664 
665 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
666 class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
667  public:
668  typedef typename Expression::Index Index;
669  static EIGEN_STRONG_INLINE void run(const Expression& expr,
670  const Eigen::SyclDevice& dev) {
672  Evaluator evaluator(expr, dev);
673  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
674  if (needs_assign) {
675  Index range, GRange, tileSize;
676  Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
677  total_size = (total_size == 0) ? 1 : total_size;
678  const int PacketSize =
679  Eigen::PacketType<typename Evaluator::CoeffReturnType,
680  Eigen::SyclDevice>::size;
681  Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
682  dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
683  range = total_size;
684 
685  dev.template nullary_kernel_launcher<
686  typename Evaluator::CoeffReturnType,
687  ExecExprFunctorKernel<Evaluator> >(
688  evaluator,
689  cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
690  cl::sycl::range<1>(tileSize)),
691  Index(1), range);
692  }
693  evaluator.cleanup();
694  }
695 };
696 
697 #endif
698 
699 } // end namespace internal
700 
701 } // end namespace Eigen
702 
703 #endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
Eigen::internal::TensorBlockScratchAllocator
Definition: TensorBlock.h:525
Eigen::internal::ExpressionHasTensorBroadcastingOp::value
@ value
Definition: TensorExecutor.h:43
EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
Eigen
Namespace containing all symbols from the Eigen library.
Definition: jet.h:637
Eigen::internal::TiledEvaluation
TiledEvaluation
Definition: TensorForwardDeclarations.h:158
gtsam.examples.DogLegOptimizerExample.type
type
Definition: DogLegOptimizerExample.py:111
Eigen::CwiseBinaryOp
Generic expression where a coefficient-wise binary operator is applied to two expressions.
Definition: CwiseBinaryOp.h:77
Eigen::TensorCwiseUnaryOp
Definition: TensorExpr.h:115
Eigen::internal::TensorAsyncExecutor
Definition: TensorExecutor.h:116
Eigen::internal::TensorBlockMapper::blockDescriptor
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const
Definition: TensorBlock.h:375
Eigen::internal::TensorExecutor< Expression, DefaultDevice, Vectorizable, TiledEvaluation::On >::StorageIndex
traits< Expression >::Index StorageIndex
Definition: TensorExecutor.h:171
blockDim
dim3 blockDim
Definition: gpu_common.h:19
eigen_assert
#define eigen_assert(x)
Definition: Macros.h:1037
gtsam::internal::align
static Similarity3 align(const Point3Pairs &d_abPointPairs, const Rot3 &aRb, const Point3Pair &centroids)
This method estimates the similarity transform from differences point pairs,.
Definition: Similarity3.cpp:69
make_changelog.desc
desc
Definition: make_changelog.py:71
TensorExecutor
The tensor executor class.
Eigen::DefaultDevice
Definition: TensorDeviceDefault.h:17
gtsam::range
Double_ range(const Point2_ &p, const Point2_ &q)
Definition: slam/expressions.h:30
Eigen::PacketType
Definition: TensorMeta.h:50
Eigen::internal::TensorExecutor< Expression, DefaultDevice, Vectorizable, TiledEvaluation::On >::ScalarNoConst
remove_const< Scalar >::type ScalarNoConst
Definition: TensorExecutor.h:168
Eigen::internal::TensorBlockDescriptor
Definition: TensorBlock.h:188
Eigen::internal::unpacket_traits::size
@ size
Definition: GenericPacketMath.h:138
j
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2
Eigen::internal::TensorBlockMapper
Definition: TensorBlock.h:347
Eigen::internal::TensorExecutor< Expression, DefaultDevice, true, TiledEvaluation::Off >::StorageIndex
Expression::Index StorageIndex
Definition: TensorExecutor.h:125
Eigen::internal::On
@ On
Definition: TensorForwardDeclarations.h:160
operator()
internal::enable_if< internal::valid_indexed_view_overload< RowIndices, ColIndices >::value &&internal::traits< typename EIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::ReturnAsIndexedView, typename EIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::type operator()(const RowIndices &rowIndices, const ColIndices &colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
Definition: IndexedViewMethods.h:73
Eigen::internal::unpacket_traits
Definition: GenericPacketMath.h:132
EIGEN_STRONG_INLINE
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
gtsam::utils.visual_isam.step
def step(data, isam, result, truth, currPoseIndex, isamArgs=())
Definition: visual_isam.py:82
blockIdx
dim3 blockIdx
Definition: gpu_common.h:19
Eigen::TensorCwiseBinaryOp
Definition: TensorExpr.h:196
EIGEN_MAX_ALIGN_BYTES
#define EIGEN_MAX_ALIGN_BYTES
Definition: ConfigureVectorization.h:175
gtsam.examples.DogLegOptimizerExample.run
def run(args)
Definition: DogLegOptimizerExample.py:21
compute
EIGEN_DONT_INLINE void compute(Solver &solver, const MatrixType &A)
Definition: dense_solvers.cpp:25
EIGEN_ALWAYS_INLINE
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:932
Eigen::Triplet< double >
Eigen::internal::evaluator
Definition: CoreEvaluators.h:90
Eigen::internal::array_prod
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes< Indices... > &)
Definition: TensorDimensions.h:140
Eigen::internal::remove_const::type
T type
Definition: Meta.h:121
Eigen::internal::TensorExecutor
Definition: TensorExecutor.h:81
move
detail::enable_if_t<!detail::move_never< T >::value, T > move(object &&obj)
Definition: cast.h:1243
Eigen::internal::TensorExecutor< Expression, DefaultDevice, Vectorizable, TiledEvaluation::On >::run
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Expression &expr, const DefaultDevice &device=DefaultDevice())
Definition: TensorExecutor.h:176
threadIdx
dim3 threadIdx
Definition: gpu_common.h:19
Eigen::internal::traits
Definition: ForwardDeclarations.h:17
Eigen::internal::Off
@ Off
Definition: TensorForwardDeclarations.h:159
Eigen::internal::TensorBlockMapper::Dimensions
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:351
Eigen::internal::TensorExecutor< Expression, DefaultDevice, true, TiledEvaluation::Off >::run
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Expression &expr, const DefaultDevice &device=DefaultDevice())
Definition: TensorExecutor.h:128
std
Definition: BFloat16.h:88
Eigen::TensorCostModel::taskSize
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost &cost_per_coeff)
Definition: TensorCostModel.h:187
Eigen::internal::TensorBlockResourceRequirements
Definition: TensorBlock.h:75
Eigen::TensorEvaluator
A cost model used to limit the number of threads used for evaluating tensor expression.
Definition: TensorEvaluator.h:28
Eigen::internal::TensorExecutor< Expression, DefaultDevice, Vectorizable, TiledEvaluation::On >::Evaluator
TensorEvaluator< Expression, DefaultDevice > Evaluator
Definition: TensorExecutor.h:170
Eigen::TensorBroadcastingOp
Definition: TensorBroadcasting.h:69
internal
Definition: BandTriangularSolver.h:13
Eigen::numext::maxi
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
Definition: Eigen/src/Core/MathFunctions.h:1093
NULL
#define NULL
Definition: ccolamd.c:609
Eigen::internal::size
EIGEN_CONSTEXPR Index size(const T &x)
Definition: Meta.h:479
Eigen::internal::TensorBlockMapper::blockCount
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const
Definition: TensorBlock.h:361
Eigen::TensorAssignOp
Definition: TensorAssign.h:61
Eigen::TensorOpCost
Definition: TensorCostModel.h:25
Eigen::internal::TensorExecutor::StorageIndex
Expression::Index StorageIndex
Definition: TensorExecutor.h:83
test_callbacks.value
value
Definition: test_callbacks.py:160
eval
internal::nested_eval< T, 1 >::type eval(const T &xpr)
Definition: sparse_permutations.cpp:38
Eigen::internal::ExpressionHasTensorBroadcastingOp
Definition: TensorExecutor.h:42
i
int i
Definition: BiCGSTAB_step_by_step.cpp:9
Eigen::internal::TensorExecutor< Expression, DefaultDevice, Vectorizable, TiledEvaluation::On >::Scalar
traits< Expression >::Scalar Scalar
Definition: TensorExecutor.h:167
Scalar
SCALAR Scalar
Definition: bench_gemm.cpp:46
Eigen::Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
Eigen::internal::TensorExecutor::run
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Expression &expr, const Device &device=Device())
Definition: TensorExecutor.h:96


gtsam
Author(s):
autogenerated on Wed Jan 1 2025 04:04:49