TensorExecutor.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
12 
13 namespace Eigen {
14 
29 namespace internal {
30 
39 // TODO(ezhulenev): Add specializations for all other types of Tensor ops.
40 
41 template<typename Expression>
43  enum { value = false };
44 };
45 
46 template<typename LhsXprType, typename RhsXprType>
48  const TensorAssignOp<LhsXprType, RhsXprType> > {
50 };
51 
52 template<typename UnaryOp, typename XprType>
54  const TensorCwiseUnaryOp<UnaryOp, XprType> > {
56 };
57 
58 template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
60  const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
61  enum {
64  };
65 };
66 
67 template<typename Broadcast, typename XprType>
69  const TensorBroadcastingOp<Broadcast, XprType> > {
70  enum { value = true };
71 };
72 
73 // -------------------------------------------------------------------------- //
74 
79 template <typename Expression, typename Device, bool Vectorizable,
80  TiledEvaluation Tiling>
82  public:
83  typedef typename Expression::Index StorageIndex;
84 
85  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
86  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
87  // violation. If this template is instantiated with a non-default device, it
88  // means that this header file was included without defining
89  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
91  "Default executor instantiated with non-default device. "
92  "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
93  "EIGEN_USE_SYCL before including Eigen headers.");
94 
96  static EIGEN_STRONG_INLINE void run(const Expression& expr,
97  const Device& device = Device()) {
99  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
100  if (needs_assign) {
101  const StorageIndex size = array_prod(evaluator.dimensions());
102  for (StorageIndex i = 0; i < size; ++i) {
103  evaluator.evalScalar(i);
104  }
105  }
106  evaluator.cleanup();
107  }
108 };
109 
114 template <typename Expression, typename Device, typename DoneCallback,
115  bool Vectorizable, TiledEvaluation Tiling>
117 
121 template <typename Expression>
122 class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
123  /*Tiling=*/TiledEvaluation::Off> {
124  public:
125  typedef typename Expression::Index StorageIndex;
126 
129  const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
131  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
132  if (needs_assign) {
133  const StorageIndex size = array_prod(evaluator.dimensions());
134  const int PacketSize = unpacket_traits<typename TensorEvaluator<
135  Expression, DefaultDevice>::PacketReturnType>::size;
136 
137  // Give compiler a strong possibility to unroll the loop. But don't insist
138  // on unrolling, because if the function is expensive compiler should not
139  // unroll the loop at the expense of inlining.
140  const StorageIndex UnrolledSize =
141  (size / (4 * PacketSize)) * 4 * PacketSize;
142  for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
143  for (StorageIndex j = 0; j < 4; j++) {
144  evaluator.evalPacket(i + j * PacketSize);
145  }
146  }
147  const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
148  for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
149  evaluator.evalPacket(i);
150  }
151  for (StorageIndex i = VectorizedSize; i < size; ++i) {
152  evaluator.evalScalar(i);
153  }
154  }
155  evaluator.cleanup();
156  }
157 };
158 
163 template <typename Expression, bool Vectorizable>
164 class TensorExecutor<Expression, DefaultDevice, Vectorizable,
165  /*Tiling=*/TiledEvaluation::On> {
166  public:
169 
172 
173  static const int NumDims = traits<Expression>::NumDimensions;
174 
176  static EIGEN_STRONG_INLINE void run(const Expression& expr,
177  const DefaultDevice& device = DefaultDevice()) {
180 
182  TensorBlockDesc;
184  TensorBlockScratch;
185 
186  Evaluator evaluator(expr, device);
187 
188  // TODO(ezhulenev): Do not use tiling for small tensors?
189  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
190 
191  if (needs_assign) {
192  // Query expression tree for desired block size/shape.
193  const TensorBlockResourceRequirements requirements =
194  evaluator.getResourceRequirements();
195 
196  const TensorBlockMapper block_mapper(
197  typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
198  requirements);
199 
200  // Share scratch memory allocator between all blocks.
201  TensorBlockScratch scratch(device);
202 
203  const StorageIndex total_block_count = block_mapper.blockCount();
204  for (StorageIndex i = 0; i < total_block_count; ++i) {
205  TensorBlockDesc desc = block_mapper.blockDescriptor(i);
206  evaluator.evalBlock(desc, scratch);
207  scratch.reset();
208  }
209  }
210  evaluator.cleanup();
211  }
212 };
213 
225 #ifdef EIGEN_USE_THREADS
226 
227 template <typename TensorBlockMapper>
228 struct TensorExecutorTilingContext {
229  TensorExecutorTilingContext() = default;
230  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
231  const TensorOpCost& b_cost, size_t b_aligned_size)
232  : block_mapper(b_mapper),
233  cost(b_cost),
234  aligned_blocksize(b_aligned_size) {}
235 
236  TensorBlockMapper block_mapper; // navigate through blocks
237  TensorOpCost cost; // cost of computing a single block
238  size_t aligned_blocksize; // block size after memory alignment
239 };
240 
241 // Computes a block evaluation parameters, and allocates temporary memory buffer
242 // for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
243 template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
244 TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
245  const Evaluator& evaluator) {
246  // Query expression tree for desired block size/shape.
247  TensorBlockResourceRequirements requirements =
248  evaluator.getResourceRequirements();
249 
250  // Update target block size based on cost model.
252  1, requirements.cost_per_coeff);
253  requirements.size = static_cast<size_t>(1.0 / taskSize);
254 
255  TensorBlockMapper block_mapper(
256  typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
257  requirements);
258 
259  size_t block_size = block_mapper.blockTotalSize();
260  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
261  const size_t aligned_blocksize =
262  align *
263  divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
264 
265  return {block_mapper, requirements.cost_per_coeff * block_size,
266  aligned_blocksize};
267 }
268 
269 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
270 struct EvalRange {
271  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
272  const StorageIndex lastIdx) {
273  Evaluator evaluator = *evaluator_in;
274  eigen_assert(lastIdx >= firstIdx);
275  for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
276  evaluator.evalScalar(i);
277  }
278  }
279 
280  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
281 };
282 
283 template <typename Evaluator, typename StorageIndex>
284 struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
285  static const int PacketSize =
287 
288  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
289  const StorageIndex lastIdx) {
290  Evaluator evaluator = *evaluator_in;
291  eigen_assert(lastIdx >= firstIdx);
292  StorageIndex i = firstIdx;
293  if (lastIdx - firstIdx >= PacketSize) {
294  eigen_assert(firstIdx % PacketSize == 0);
295  StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
296  // Give compiler a strong possibility to unroll the loop. But don't insist
297  // on unrolling, because if the function is expensive compiler should not
298  // unroll the loop at the expense of inlining.
299  for (; i <= last_chunk_offset; i += 4 * PacketSize) {
300  for (StorageIndex j = 0; j < 4; j++) {
301  evaluator.evalPacket(i + j * PacketSize);
302  }
303  }
304  last_chunk_offset = lastIdx - PacketSize;
305  for (; i <= last_chunk_offset; i += PacketSize) {
306  evaluator.evalPacket(i);
307  }
308  }
309  for (; i < lastIdx; ++i) {
310  evaluator.evalScalar(i);
311  }
312  }
313 
314  static StorageIndex alignBlockSize(StorageIndex size) {
315  // Align block size to packet size and account for unrolling in run above.
316  if (size >= 16 * PacketSize) {
317  return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
318  }
319  // Aligning to 4 * PacketSize would increase block size by more than 25%.
320  return (size + PacketSize - 1) & ~(PacketSize - 1);
321  }
322 };
323 
324 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
325 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
326  public:
327  typedef typename Expression::Index StorageIndex;
328 
329  static EIGEN_STRONG_INLINE void run(const Expression& expr,
330  const ThreadPoolDevice& device) {
332  typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
333 
334  Evaluator evaluator(expr, device);
335  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
336  if (needs_assign) {
337  const StorageIndex size = array_prod(evaluator.dimensions());
338  device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
339  EvalRange::alignBlockSize,
340  [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
341  EvalRange::run(&evaluator, firstIdx, lastIdx);
342  });
343  }
344  evaluator.cleanup();
345  }
346 };
347 
348 template <typename Expression, bool Vectorizable>
349 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
350  /*Tiling=*/TiledEvaluation::On> {
351  public:
352  typedef typename traits<Expression>::Index IndexType;
353  typedef typename traits<Expression>::Scalar Scalar;
354  typedef typename remove_const<Scalar>::type ScalarNoConst;
355 
356  static const int NumDims = traits<Expression>::NumDimensions;
357 
360  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
361 
363  TensorBlockDesc;
365  TensorBlockScratch;
366 
367  static EIGEN_STRONG_INLINE void run(const Expression& expr,
368  const ThreadPoolDevice& device) {
369  Evaluator evaluator(expr, device);
370 
371  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
372  if (needs_assign) {
373  const TilingContext tiling =
374  internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
375  Vectorizable>(evaluator);
376 
377  auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
378  IndexType lastBlockIdx) {
379  TensorBlockScratch scratch(device);
380 
381  for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
382  ++block_idx) {
383  TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
384  evaluator.evalBlock(desc, scratch);
385  scratch.reset();
386  }
387  };
388 
389  // Evaluate small expressions directly as a single block.
390  if (tiling.block_mapper.blockCount() == 1) {
391  TensorBlockScratch scratch(device);
392  TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
393  evaluator.evalBlock(desc, scratch);
394  } else {
395  device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
396  eval_block);
397  }
398  }
399  evaluator.cleanup();
400  }
401 };
402 
403 template <typename Expression, typename DoneCallback, bool Vectorizable,
404  TiledEvaluation Tiling>
405 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
406  Vectorizable, Tiling> {
407  public:
408  typedef typename Expression::Index StorageIndex;
410 
411  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
412  const ThreadPoolDevice& device,
413  DoneCallback done) {
414  TensorAsyncExecutorContext* const ctx =
415  new TensorAsyncExecutorContext(expr, device, std::move(done));
416 
417  const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
418  if (!need_assign) {
419  delete ctx;
420  return;
421  }
422 
423  typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
424  const StorageIndex size = array_prod(ctx->evaluator.dimensions());
425  device.parallelForAsync(
426  size, ctx->evaluator.costPerCoeff(Vectorizable),
427  EvalRange::alignBlockSize,
428  [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
429  EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
430  },
431  [ctx]() { delete ctx; });
432  };
433 
434  ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
435  }
436 
437  private:
438  struct TensorAsyncExecutorContext {
439  TensorAsyncExecutorContext(const Expression& expr,
440  const ThreadPoolDevice& thread_pool,
441  DoneCallback done)
442  : evaluator(expr, thread_pool), on_done(std::move(done)) {}
443 
444  ~TensorAsyncExecutorContext() {
445  evaluator.cleanup();
446  on_done();
447  }
448 
449  Evaluator evaluator;
450 
451  private:
452  DoneCallback on_done;
453  };
454 };
455 
456 template <typename Expression, typename DoneCallback, bool Vectorizable>
457 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
458  Vectorizable, /*Tileable*/ TiledEvaluation::On> {
459  public:
460  typedef typename traits<Expression>::Index IndexType;
461  typedef typename traits<Expression>::Scalar Scalar;
462  typedef typename remove_const<Scalar>::type ScalarNoConst;
463 
464  static const int NumDims = traits<Expression>::NumDimensions;
465 
468  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
469 
472  TensorBlockScratch;
473 
474  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
475  const ThreadPoolDevice& device,
476  DoneCallback done) {
477 
478  TensorAsyncExecutorContext* const ctx =
479  new TensorAsyncExecutorContext(expr, device, std::move(done));
480 
481  const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
482  if (!need_assign) {
483  delete ctx;
484  return;
485  }
486 
487  ctx->tiling = internal::GetTensorExecutorTilingContext<
488  Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
489 
490  auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
491  TensorBlockScratch scratch(ctx->device);
492 
493  for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
494  ++block_idx) {
495  TensorBlockDesc desc =
496  ctx->tiling.block_mapper.blockDescriptor(block_idx);
497  ctx->evaluator.evalBlock(desc, scratch);
498  scratch.reset();
499  }
500  };
501 
502  // Evaluate small expressions directly as a single block.
503  if (ctx->tiling.block_mapper.blockCount() == 1) {
504  TensorBlockScratch scratch(ctx->device);
505  TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
506  ctx->evaluator.evalBlock(desc, scratch);
507  delete ctx;
508  } else {
509  ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
510  ctx->tiling.cost, eval_block,
511  [ctx]() { delete ctx; });
512  }
513  };
514 
515  ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
516  }
517 
518  private:
519  struct TensorAsyncExecutorContext {
520  TensorAsyncExecutorContext(const Expression& expr,
521  const ThreadPoolDevice& thread_pool,
522  DoneCallback done)
523  : device(thread_pool),
524  evaluator(expr, thread_pool),
525  on_done(std::move(done)) {}
526 
527  ~TensorAsyncExecutorContext() {
528  evaluator.cleanup();
529  on_done();
530  }
531 
532  const ThreadPoolDevice& device;
533  Evaluator evaluator;
534  TilingContext tiling;
535 
536  private:
537  DoneCallback on_done;
538  };
539 };
540 
541 #endif // EIGEN_USE_THREADS
542 
543 // GPU: the evaluation of the expression is offloaded to a GPU.
544 #if defined(EIGEN_USE_GPU)
545 
546 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
547 class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
548  public:
549  typedef typename Expression::Index StorageIndex;
550  static void run(const Expression& expr, const GpuDevice& device);
551 };
552 
553 #if defined(EIGEN_GPUCC)
554 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
555 struct EigenMetaKernelEval {
557  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
558  for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
559  eval.evalScalar(i);
560  }
561  }
562 };
563 
564 template <typename Evaluator, typename StorageIndex>
565 struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
567  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
568  const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
569  const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
570  const StorageIndex vectorized_step_size = step_size * PacketSize;
571 
572  // Use the vector path
573  for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
574  i += vectorized_step_size) {
575  eval.evalPacket(i);
576  }
577  for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
578  eval.evalScalar(i);
579  }
580  }
581 };
582 
583 template <typename Evaluator, typename StorageIndex>
584 __global__ void
585 __launch_bounds__(1024)
586 EigenMetaKernel(Evaluator eval, StorageIndex size) {
587 
588  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
589  const StorageIndex step_size = blockDim.x * gridDim.x;
590 
591  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
592  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
593 }
594 
595 /*static*/
596 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
598  const Expression& expr, const GpuDevice& device) {
599  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
600  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
601  if (needs_assign) {
602 
603  const int block_size = device.maxGpuThreadsPerBlock();
604  const int max_blocks = device.getNumGpuMultiProcessors() *
605  device.maxGpuThreadsPerMultiProcessor() / block_size;
606  const StorageIndex size = array_prod(evaluator.dimensions());
607  // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
608  const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
609 
610  LAUNCH_GPU_KERNEL(
611  (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
612  num_blocks, block_size, 0, device, evaluator, size);
613  }
614  evaluator.cleanup();
615 }
616 
617 #endif // EIGEN_GPUCC
618 #endif // EIGEN_USE_GPU
619 
620 // SYCL Executor policy
621 #ifdef EIGEN_USE_SYCL
622 
623 template <typename Evaluator>
624 struct ExecExprFunctorKernel {
625  typedef typename Evaluator::Index Index;
626  Evaluator evaluator;
627  const Index range;
628  template <typename Scratch>
629  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
630  const Scratch, Evaluator evaluator_, const Index range_)
631  : evaluator(evaluator_), range(range_) {}
632 
634  cl::sycl::nd_item<1> itemID) {
635  compute(itemID);
636  }
637  template <bool is_vec = Evaluator::PacketAccess>
639  compute(const cl::sycl::nd_item<1>& itemID) {
640  Index gId = static_cast<Index>(itemID.get_global_linear_id());
641  Index total_threads = itemID.get_global_range(0);
642 
643  for (Index i = gId; i < range; i += total_threads) {
644  evaluator.evalScalar(i);
645  }
646  }
647  template <bool is_vec = Evaluator::PacketAccess>
649  compute(const cl::sycl::nd_item<1>& itemID) {
650  const Index vectorizedRange =
651  (range / Evaluator::PacketSize) * Evaluator::PacketSize;
652  Index gId = static_cast<Index>(itemID.get_global_linear_id());
653  const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
654  const Index start = Evaluator::PacketSize * gId;
655  for (Index i = start; i < vectorizedRange; i += step) {
656  evaluator.evalPacket(i);
657  }
658  gId += vectorizedRange;
659  for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
660  evaluator.evalScalar(i);
661  }
662  }
663 };
664 
665 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
666 class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
667  public:
668  typedef typename Expression::Index Index;
669  static EIGEN_STRONG_INLINE void run(const Expression& expr,
670  const Eigen::SyclDevice& dev) {
672  Evaluator evaluator(expr, dev);
673  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
674  if (needs_assign) {
675  Index range, GRange, tileSize;
676  Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
677  total_size = (total_size == 0) ? 1 : total_size;
678  const int PacketSize =
680  Eigen::SyclDevice>::size;
681  Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
682  dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
683  range = total_size;
684 
685  dev.template nullary_kernel_launcher<
686  typename Evaluator::CoeffReturnType,
687  ExecExprFunctorKernel<Evaluator> >(
688  evaluator,
689  cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
690  cl::sycl::range<1>(tileSize)),
691  Index(1), range);
692  }
693  evaluator.cleanup();
694  }
695 };
696 
697 #endif
698 
699 } // end namespace internal
700 
701 } // end namespace Eigen
702 
703 #endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const
Definition: TensorBlock.h:365
def step(data, isam, result, truth, currPoseIndex)
Definition: visual_isam.py:82
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:932
SCALAR Scalar
Definition: bench_gemm.cpp:46
EIGEN_STRONG_INLINE void cleanup()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes< Indices... > &)
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
Derived::Scalar CoeffReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions & dimensions() const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const
Definition: TensorBlock.h:375
Namespace containing all symbols from the Eigen library.
Definition: jet.h:637
dim3 threadIdx
Definition: gpu_common.h:19
A cost model used to limit the number of threads used for evaluating tensor expression.
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
static Similarity3 align(const Point3Pairs &d_abPointPairs, const Rot3 &aRb, const Point3Pair &centroids)
This method estimates the similarity transform from differences point pairs,.
Definition: Similarity3.cpp:69
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Expression &expr, const DefaultDevice &device=DefaultDevice())
Generic expression where a coefficient-wise binary operator is applied to two expressions.
Definition: CwiseBinaryOp.h:77
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
#define eigen_assert(x)
Definition: Macros.h:1037
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const
#define NULL
Definition: ccolamd.c:609
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Expression &expr, const DefaultDevice &device=DefaultDevice())
EIGEN_CONSTEXPR Index size(const T &x)
Definition: Meta.h:479
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const
Definition: TensorBlock.h:361
Double_ range(const Point2_ &p, const Point2_ &q)
#define EIGEN_MAX_ALIGN_BYTES
EIGEN_DONT_INLINE void compute(Solver &solver, const MatrixType &A)
internal::enable_if< internal::valid_indexed_view_overload< RowIndices, ColIndices >::value &&internal::traits< typename EIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::ReturnAsIndexedView, typename EIGEN_INDEXED_VIEW_METHOD_TYPE< RowIndices, ColIndices >::type >::type operator()(const RowIndices &rowIndices, const ColIndices &colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
dim3 blockIdx
Definition: gpu_common.h:19
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost &cost_per_coeff)
std::ptrdiff_t j
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Expression &expr, const Device &device=Device())
dim3 blockDim
Definition: gpu_common.h:19


gtsam
Author(s):
autogenerated on Tue Jul 4 2023 02:36:54