TensorBlock.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // This Source Code Form is subject to the terms of the Mozilla
5 // Public License v. 2.0. If a copy of the MPL was not distributed
6 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 
8 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
9 #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
10 
11 namespace Eigen {
12 namespace internal {
13 
14 // -------------------------------------------------------------------------- //
15 // Forward declarations for templates defined below.
16 template <typename Scalar, typename IndexType, int NumDims, int Layout>
17 class TensorBlockIO;
18 
19 // -------------------------------------------------------------------------- //
20 // Helper function to compute strides for densely stored buffer of given
21 // dimensions.
22 
23 // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
24 // this function instead everywhere.
25 template <int Layout, typename IndexType, int NumDims>
26 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
27     const DSizes<IndexType, NumDims>& dimensions) {
28   DSizes<IndexType, NumDims> strides;
29   if (NumDims == 0) return strides;
30 
31  // TODO(ezhulenev): Use templates to unroll this loop (similar to
32  // h_array_reduce in CXX11meta.h)? Benchmark it.
33  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
34  strides[0] = 1;
35  for (int i = 1; i < NumDims; ++i) {
36  strides[i] = strides[i - 1] * dimensions[i - 1];
37  }
38  } else {
39  strides[NumDims - 1] = 1;
40  for (int i = NumDims - 2; i >= 0; --i) {
41  strides[i] = strides[i + 1] * dimensions[i + 1];
42  }
43  }
44 
45  return strides;
46 }
47 
48 template <int Layout, typename IndexType, size_t NumDims>
49 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
50     const Eigen::array<IndexType, NumDims>& dimensions) {
51   return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
52 }
53 
54 template <int Layout, std::ptrdiff_t... Indices>
55 EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
56  const Sizes<Indices...>& sizes) {
57  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
58 }
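// Usage sketch: computing dense strides for a 3x4 buffer with the helpers
// defined above.
//
//   DSizes<Eigen::Index, 2> dims(3, 4);
//   DSizes<Eigen::Index, 2> col = strides<ColMajor>(dims);  // col == [1, 3]
//   DSizes<Eigen::Index, 2> row = strides<RowMajor>(dims);  // row == [4, 1]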
59 
60 // -------------------------------------------------------------------------- //
61 
62 // Tensor block shape type defines the shape preference for blocks extracted
63 // from the larger tensor.
64 //
65 // Example: blocks of 100 elements from the large 100x100 tensor:
66 // - tensor: 100x100
67 // - target_block_size: 100
68 //
69 // TensorBlockShapeType:
70 // - kUniformAllDims: 100 blocks of size 10x10
71 // - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
72 // or row major layout)
73 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
74 
75 struct TensorBlockResourceRequirements {
76  TensorBlockShapeType shape_type; // target block shape
77  size_t size; // target block size
78  TensorOpCost cost_per_coeff; // cost of computing a single block element
79 
80 #ifdef EIGEN_HIPCC
81  // For HIPCC we need to explicitly declare the constructor that is implicitly
82  // invoked in the "merge" / "any" routines as a "device function"; otherwise
83  // HIPCC errors out complaining about the lack of a matching constructor.
84  EIGEN_DEVICE_FUNC
85  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
86  TensorOpCost cost_)
87  : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
88  {}
89 #endif
90 
91  template <typename Scalar>
92  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
93  TensorBlockShapeType shape_type, size_t size_in_bytes,
94  TensorOpCost cost) {
95  const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
96  return {shape_type, size, cost};
97  }
98 
99  template <typename Scalar>
100  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
101  TensorBlockShapeType shape_type, size_t size_in_bytes) {
102  // This default cost per coefficient is valid for most materialized tensor
103  // block evaluation implementations, because they typically just read
104  // coefficients from the underlying tensor storage, and write to the tensor
105  // block buffer (scratch or destination memory, reads and writes have linear
106  // access pattern). We ignore the fixed cost of block evaluation, because in
107  // practice it should be negligible.
108  //
109  // Lazy block evaluation adds the cost of calling a functor for each
110  // coefficient.
111  //
112  // All non-trivial block evaluation implementations must provide their own
113  // cost approximation (e.g. shuffling inner dimension has a much higher cost
114  // because it reads memory randomly, although the total number of moved
115  // bytes is the same).
116  return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
117  {/*bytes_loaded=*/sizeof(Scalar),
118  /*bytes_stored=*/sizeof(Scalar),
119  /*compute_cycles=*/0});
120  }
121 
122  template <typename Scalar>
123  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
124  size_t size_in_bytes) {
125  return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
126  size_in_bytes);
127  }
128 
129  template <typename Scalar>
130  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
131  size_t size_in_bytes) {
132  return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
133  size_in_bytes);
134  }
135 
136  EIGEN_DEVICE_FUNC
137  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
138  merge(const TensorBlockResourceRequirements& lhs,
139  const TensorBlockResourceRequirements& rhs) {
140  return {merge(lhs.shape_type, rhs.shape_type), // shape_type
141  merge(lhs.size, rhs.size), // size
142  merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
143  }
144 
145  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
146  TensorOpCost cost) {
147  cost_per_coeff += cost;
148  return *this;
149  }
150 
151  // This is a resource requirement that should be returned from expressions
152  // that do not have any block evaluation preference (e.g. default tensor
153  // expression with raw buffer access).
154  EIGEN_DEVICE_FUNC
155  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
156  return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
157  }
158 
159  private:
161 
162  EIGEN_DEVICE_FUNC
163  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
164  return numext::maxi(lhs_size, rhs_size);
165  }
166 
167  EIGEN_DEVICE_FUNC
168  static EIGEN_STRONG_INLINE TensorBlockShapeType
169  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
170  return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
171  rhs == TensorBlockShapeType::kSkewedInnerDims)
172  ? TensorBlockShapeType::kSkewedInnerDims
173  : TensorBlockShapeType::kUniformAllDims;
174  }
175 
176  EIGEN_DEVICE_FUNC
177  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
178  TensorOpCost rhs_cost) {
179  return lhs_cost + rhs_cost;
180  }
181 };
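// Usage sketch: requesting ~48KB blocks skewed towards the inner dimension for
// `float` coefficients, then merging with a uniform 64KB request; `merge` keeps
// the larger of the two element budgets.
//
//   TensorBlockResourceRequirements lhs =
//       TensorBlockResourceRequirements::skewed<float>(48 * 1024);
//   TensorBlockResourceRequirements rhs =
//       TensorBlockResourceRequirements::uniform<float>(64 * 1024);
//   TensorBlockResourceRequirements req =
//       TensorBlockResourceRequirements::merge(lhs, rhs);
//   // req.size == (64 * 1024) / sizeof(float)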
182 
183 // -------------------------------------------------------------------------- //
184 // TensorBlockDescriptor specifies a block offset within a tensor and the block
185 // sizes along each of the tensor dimensions.
186 
187 template <int NumDims, typename IndexType = Eigen::Index>
188 class TensorBlockDescriptor {
189  public:
190  typedef DSizes<IndexType, NumDims> Dimensions;
191 
192  // If we evaluate a Tensor assignment and the expression on the left already
193  // has a memory buffer, then we might do a performance optimization and
194  // evaluate the root expression directly into the final output memory.
195  // Sometimes it's possible to reuse that buffer for materializing
196  // subexpressions inside an expression tree, to avoid dynamic memory allocation.
197  //
198  // The pointer type of the underlying storage is erased, because passing
199  // Scalar type through all the expression evaluation layers is way too many
200  // templates. In practice destination buffer type should always match the
201  // evaluated expression scalar type.
202  class DestinationBuffer {
203  public:
204  enum DestinationBufferKind : int {
205  // The above explicit specification of "int" as the enum basetype is
206  // needed to get around a HIPCC link error ("the field type is not
207  // amp-compatible")
208  // which is issued for class members with the enum type.
209  // TODO(rocm):
210  // remove the "int" basetype once HIPCC has been fixed to not error out
211  // in the above scenario.
212 
213  // Destination buffer is not defined (`m_data` == nullptr).
214  kEmpty,
215 
216  // Tensor block defined by an owning tensor block descriptor can fit
217  // contiguously into the destination buffer. In this case it's safe to
218  // materialize tensor block in the destination buffer, wrap it in a
219  // TensorMap, and use it to build an Eigen expression on top of it.
220  kContiguous,
221 
222  // Destination buffer strides do not match strides of the contiguously
223  // stored block, and it's impossible to define a TensorMap over this
224  // buffer. However if we are evaluating a root of an expression tree, we
225  // can still materialize an output into this destination, because we can
226  // guarantee that no one will ever access it through the block API.
227  //
228  // In theory it is possible to build valid TensorStriding<TensorMap>
229  // expression on top of this destination buffer, however it has
230  // inefficient coeff/packet access, and defeats the purpose of fast block
231  // evaluation API.
232  kStrided
233  };
234 
235  template <typename Scalar>
236  Scalar* data() const {
237  eigen_assert(m_data_type_size == sizeof(Scalar));
238  return static_cast<Scalar*>(m_data);
239  }
240 
241  const Dimensions& strides() const { return m_strides; }
242  const DestinationBufferKind& kind() const { return m_kind; }
243 
244  private:
245  friend class TensorBlockDescriptor;
246 
247  DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
248 
249  template <typename Scalar>
250  DestinationBuffer(Scalar* data, const Dimensions& strides,
251  DestinationBufferKind kind)
252  : m_data(static_cast<void*>(data)),
253  m_data_type_size(sizeof(Scalar)),
254  m_strides(strides),
255  m_kind(kind) {}
256 
257  template <int Layout, typename Scalar>
258  static DestinationBuffer make(const TensorBlockDescriptor& desc,
259  Scalar* data, const Dimensions& strides) {
260  return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
261  }
262 
263  template <int Layout>
264  static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
265  const Dimensions& strides) {
266  const Dimensions& desc_dims = desc.dimensions();
267  const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
268  for (int i = 0; i < NumDims; ++i) {
269  if (desc_dims[i] == 1) continue;
270  if (desc_strides[i] != strides[i]) return kStrided;
271  }
272  return kContiguous;
273  }
274 
275  // Storage pointer is type-erased to reduce template bloat, but we still
276  // keep the size of the underlying element type for error checking.
277  void* m_data;
278  size_t m_data_type_size;
279 
280  // Destination buffer dimensions always match the dimensions of a tensor
281  // block descriptor it belongs to, however strides might be different.
282  Dimensions m_strides;
283 
284  DestinationBufferKind m_kind;
285  };
286 
287  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
288  const DestinationBuffer& destination)
289  : m_offset(offset),
290  m_dimensions(dimensions),
291  m_destination(destination) {}
292 
293  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
294  : m_offset(offset),
295  m_dimensions(dimensions),
296  m_destination(DestinationBuffer()) {}
297 
298  IndexType offset() const { return m_offset; }
299  const Dimensions& dimensions() const { return m_dimensions; }
300  IndexType dimension(int index) const { return m_dimensions[index]; }
301  IndexType size() const { return array_prod<IndexType>(m_dimensions); }
302 
303  const DestinationBuffer& destination() const { return m_destination; }
304 
305  template <int Layout, typename Scalar>
306  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
307  eigen_assert(dst_base != NULL);
308  m_destination =
309  DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
310  }
311 
312  template <int Layout, typename Scalar, typename DstStridesIndexType>
313  void AddDestinationBuffer(
314  Scalar* dst_base,
315  const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
316  // DSizes constructor will do index type promotion if it's safe.
317  AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
318  }
319 
320  TensorBlockDescriptor& DropDestinationBuffer() {
321  m_destination.m_data = NULL;
322  m_destination.m_kind = DestinationBuffer::kEmpty;
323  return *this;
324  }
325 
326  bool HasDestinationBuffer() const {
327  return m_destination.kind() != DestinationBuffer::kEmpty;
328  }
329 
330  // Returns a copy of `*this` with updated offset.
331  TensorBlockDescriptor WithOffset(IndexType offset) const {
332  return TensorBlockDescriptor(offset, m_dimensions, m_destination);
333  }
334 
335  private:
336  // Offset and dimensions are immutable after construction. Block descriptor
337  // can only be mutated by adding or dropping destination.
338  const IndexType m_offset;
339  const Dimensions m_dimensions;
340  DestinationBuffer m_destination;
341 };
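// Usage sketch: a 2x3 block at linear offset 10 inside a larger tensor, with a
// caller-owned destination buffer attached (`dst_base` is a hypothetical
// pointer into the output tensor storage).
//
//   TensorBlockDescriptor<2> desc(/*offset=*/10,
//                                 TensorBlockDescriptor<2>::Dimensions(2, 3));
//   desc.AddDestinationBuffer<ColMajor>(
//       dst_base, internal::strides<ColMajor>(desc.dimensions()));
//   eigen_assert(desc.HasDestinationBuffer());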
342 
343 // -------------------------------------------------------------------------- //
344 // TensorBlockMapper is responsible for iterating over the blocks of a tensor.
345 
346 template <int NumDims, int Layout, typename IndexType = Eigen::Index>
347 class TensorBlockMapper {
348  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
349 
350  public:
351  typedef DSizes<IndexType, NumDims> Dimensions;
352 
353  TensorBlockMapper() = default;
354  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
355  const TensorBlockResourceRequirements& requirements)
356  : m_tensor_dimensions(dimensions), m_requirements(requirements) {
357  // Compute block dimensions and the total number of blocks.
358  InitializeBlockDimensions();
359  }
360 
361  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
362  return m_total_block_count;
363  }
364 
365  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
366  return m_block_dimensions.TotalSize();
367  }
368 
369  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
370  blockDimensions() const {
371  return m_block_dimensions;
372  }
373 
374  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
375  blockDescriptor(IndexType block_index) const {
376  static const bool isColMajor = Layout == static_cast<int>(ColMajor);
377 
378  IndexType offset = 0;
379  DSizes<IndexType, NumDims> dimensions = m_block_dimensions;
380 
381  if (NumDims == 0) return BlockDescriptor(offset, dimensions);
382 
383  // Iterate outer -> inner dimensions.
384  for (int i = NumDims - 1; i >= 0; --i) {
385  const int dim = isColMajor ? i : NumDims - i - 1;
386 
387  const IndexType idx = block_index / m_block_strides[dim];
388  block_index -= idx * m_block_strides[dim];
389 
390  const IndexType coord = idx * m_block_dimensions[dim];
391  dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
392  m_block_dimensions[dim]);
393  offset += coord * m_tensor_strides[dim];
394  }
395 
396  return {offset, dimensions};
397  }
398 
399  private:
400  void InitializeBlockDimensions() {
401  // Requested block shape and size.
402  const TensorBlockShapeType shape_type = m_requirements.shape_type;
403  IndexType target_block_size =
404  numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
405 
406  IndexType tensor_size = m_tensor_dimensions.TotalSize();
407 
408  // Corner case: one of the dimensions is zero. Logic below is too complex
409  // to handle this case on a general basis, just use unit block size.
410  // Note: we must not yield blocks with zero dimensions (recipe for
411  // overflows/underflows, divisions by zero and NaNs later).
412  if (tensor_size == 0) {
413  for (int i = 0; i < NumDims; ++i) {
414  m_block_dimensions[i] = 1;
415  }
416  m_total_block_count = 0;
417  return;
418  }
419 
420  // If tensor fits into a target block size, evaluate it as a single block.
421  if (tensor_size <= target_block_size) {
422  m_block_dimensions = m_tensor_dimensions;
423  m_total_block_count = 1;
424  // The only valid block index is `0`, and in this case we do not need
425  // to compute real strides for tensor or blocks (see blockDescriptor).
426  for (int i = 0; i < NumDims; ++i) {
427  m_tensor_strides[i] = 0;
428  m_block_strides[i] = 1;
429  }
430  return;
431  }
432 
433  static const bool isColMajor = Layout == static_cast<int>(ColMajor);
434 
435  // Block shape skewed towards inner dimension.
436  if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
437  IndexType coeff_to_allocate = target_block_size;
438 
439  for (int i = 0; i < NumDims; ++i) {
440  const int dim = isColMajor ? i : NumDims - i - 1;
441  m_block_dimensions[dim] =
442  numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
443  coeff_to_allocate = divup(
444  coeff_to_allocate,
445  numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
446  }
447  eigen_assert(coeff_to_allocate == 1);
448 
449  } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
450  // Tensor will not fit within 'target_block_size' budget: calculate tensor
451  // block dimension sizes based on "square" dimension size target.
452  const IndexType dim_size_target = convert_index<IndexType>(
453  std::pow(static_cast<float>(target_block_size),
454  1.0f / static_cast<float>(m_block_dimensions.rank())));
455 
456  for (int i = 0; i < NumDims; ++i) {
457  // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
458  // a multiple of the packet size. Note that reducing
459  // 'block_dim_size' in this manner can increase the number of
460  // blocks, and so will amplify any per-block overhead.
461  m_block_dimensions[i] =
462  numext::mini(dim_size_target, m_tensor_dimensions[i]);
463  }
464 
465  // Add any un-allocated coefficients to inner dimension(s).
466  IndexType total_size = m_block_dimensions.TotalSize();
467  for (int i = 0; i < NumDims; ++i) {
468  const int dim = isColMajor ? i : NumDims - i - 1;
469 
470  if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
471  const IndexType total_size_other_dims =
472  total_size / m_block_dimensions[dim];
473  const IndexType alloc_avail =
474  divup<IndexType>(target_block_size, total_size_other_dims);
475  if (alloc_avail == m_block_dimensions[dim]) {
476  // Insufficient excess coefficients to allocate.
477  break;
478  }
479  m_block_dimensions[dim] =
480  numext::mini(m_tensor_dimensions[dim], alloc_avail);
481  total_size = total_size_other_dims * m_block_dimensions[dim];
482  }
483  }
484 
485  } else {
486  eigen_assert(false); // unknown block shape
487  }
488 
489  eigen_assert(m_block_dimensions.TotalSize() >=
490  numext::mini<IndexType>(target_block_size,
491  m_tensor_dimensions.TotalSize()));
492 
493  // Calculate block counts by dimension and total block count.
494  DSizes<IndexType, NumDims> block_count;
495  for (int i = 0; i < NumDims; ++i) {
496  block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
497  }
498  m_total_block_count = array_prod(block_count);
499 
500  // Calculate block strides (used for enumerating blocks).
501  m_tensor_strides = strides<Layout>(m_tensor_dimensions);
502  m_block_strides = strides<Layout>(block_count);
503  }
504 
505  DSizes<IndexType, NumDims> m_tensor_dimensions;
506  TensorBlockResourceRequirements m_requirements;
507 
508  DSizes<IndexType, NumDims> m_block_dimensions;
509  IndexType m_total_block_count;
510 
511  DSizes<IndexType, NumDims> m_tensor_strides;
512  DSizes<IndexType, NumDims> m_block_strides;
513 };
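// Usage sketch: enumerating the blocks of a 100x100 tensor with a target block
// size of 100 coefficients (matching the TensorBlockShapeType example above).
//
//   DSizes<Eigen::Index, 2> tensor_dims(100, 100);
//   TensorBlockMapper<2, ColMajor> mapper(
//       tensor_dims,
//       TensorBlockResourceRequirements::uniform<float>(100 * sizeof(float)));
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     TensorBlockDescriptor<2> desc = mapper.blockDescriptor(i);
//     // desc.offset() and desc.dimensions() describe block `i`.
//   }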
514 
515 // -------------------------------------------------------------------------- //
516 // TensorBlockScratchAllocator is responsible for allocating temporary buffers
517 // for block evaluation (output or input block materialization). Given that
518 // Eigen expression traversal order is deterministic, all temporary allocations
519 // happen in the same order, and usually have exactly the same size. The
520 // scratch allocator keeps track of all dynamic allocations, and after the
521 // first block evaluation is completed, we should be able to reuse all the
522 // temporary buffers for the next block evaluation.
523 
524 template <typename Device>
525 class TensorBlockScratchAllocator {
526  public:
527  explicit TensorBlockScratchAllocator(const Device& device)
528  : m_device(device), m_allocation_index(0) {}
529 
530  ~TensorBlockScratchAllocator() {
531  for (size_t i = 0; i < m_allocations.size(); ++i) {
532  m_device.deallocate(m_allocations[i].ptr);
533  }
534  }
535 
536  void* allocate(size_t size) {
537  // TODO(ezhulenev): Remove when replaced with inlined vector.
538  if (m_allocations.capacity() == 0) m_allocations.reserve(8);
539 
540  // Check if we already have an existing allocation at the current index.
541  const int num_allocations = static_cast<int>(m_allocations.size());
542  const bool has_allocation = m_allocation_index < num_allocations;
543 
544  // Allocation index can't be larger than the number of allocations.
545  eigen_assert(m_allocation_index <= num_allocations);
546 
547  // If we have an existing allocation and its size is larger than or equal
548  // to the requested size, we do nothing.
549 
550  // If the current allocation can't fit the requested size, we deallocate it
551  // and replace it with a larger allocation.
552  if (has_allocation && m_allocations[m_allocation_index].size < size) {
553  m_device.deallocate(m_allocations[m_allocation_index].ptr);
554  m_allocations[m_allocation_index].ptr = m_device.allocate(size);
555  m_allocations[m_allocation_index].size = size;
556  }
557 
558  // Make a new allocation if we don't have an existing one.
559  if (!has_allocation) {
560  Allocation allocation;
561  allocation.ptr = m_device.allocate(size);
562  allocation.size = size;
563  m_allocations.push_back(allocation);
564  }
565 
566  eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
567  eigen_assert(m_allocations[m_allocation_index].size >= size);
568 
569  return m_allocations[m_allocation_index++].ptr;
570  }
571 
572  void reset() { m_allocation_index = 0; }
573 
574  private:
575  struct Allocation {
576  void* ptr;
577  size_t size;
578  };
579 
580  const Device& m_device;
581  int m_allocation_index;
582  // TODO(ezhulenev): This should be an inlined vector.
583  std::vector<Allocation> m_allocations;
584 };
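// Usage sketch: the allocation sequence repeats for every block, so after
// `reset()` the buffers handed out by `allocate()` are reused instead of being
// reallocated.
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   void* a0 = scratch.allocate(1024);
//   void* a1 = scratch.allocate(4096);
//   scratch.reset();
//   void* b0 = scratch.allocate(1024);  // same buffer as `a0`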
585 
586 // -------------------------------------------------------------------------- //
587 // TensorBlockKind represents all possible block kinds that can be produced by
588 // the TensorEvaluator::evalBlock function.
589 enum TensorBlockKind {
590  // Tensor block that is a lazy expression that must be assigned to a
591  // destination using TensorBlockAssign.
592  kExpr,
593 
594  // Tensor block that is a view into a memory buffer owned by an underlying
595  // Tensor expression (e.g. it can be a view into a Tensor buffer).
596  kView,
597 
598  // Tensor block that was materialized in a scratch memory buffer, allocated
599  // with TensorBlockScratchAllocator. This block must be copied to a
600  // destination, similar to a block of `kExpr` type.
601  kMaterializedInScratch,
602 
603  // Tensor block that was materialized directly into the final output memory
604  // buffer. For example if the left side of an assignment is a Tensor, we can
605  // directly materialize the block in the destination memory.
606  //
607  // If strides in the output buffer do not match tensor block strides, the
608  // Tensor expression will be invalid, and should not be used by
609  // TensorBlockAssign or for constructing another block expression.
610  kMaterializedInOutput
611 };
612 
613 // -------------------------------------------------------------------------- //
614 // TensorBlockNotImplemented should be used to define the TensorBlock typedef in
615 // TensorEvaluators that do not support block evaluation.
616 
617 class TensorBlockNotImplemented {
618  public:
619  typedef void XprType;
620 };
621 
622 // -------------------------------------------------------------------------- //
623 // XprScalar extracts the Scalar type from Eigen expressions (if the expression
624 // type is not void). It's required to be able to define a lazy block expression
625 // for argument types that do not support block evaluation.
626 
627 template <typename XprType>
628 struct XprScalar {
629  typedef typename XprType::Scalar type;
630 };
631 template <>
632 struct XprScalar<void> {
633  typedef void type;
634 };
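// Usage sketch: XprScalar forwards the nested Scalar type and maps `void` to
// `void`.
//
//   typedef TensorMap<Tensor<float, 2> > Map;
//   static_assert(std::is_same<XprScalar<Map>::type, float>::value, "");
//   static_assert(std::is_same<XprScalar<void>::type, void>::value, "");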
635 
636 // -------------------------------------------------------------------------- //
637 // TensorMaterializedBlock is a fully evaluated block of the original tensor,
638 // and XprType is just a TensorMap over the data. This block type is typically
639 // used to materialize blocks of tensor expressions that can't be efficiently
640 // represented as lazy Tensor expressions with fast coeff/packet operations,
641 // e.g. we materialize all broadcasts into evaluated blocks.
642 //
643 // TensorMaterializedBlock does not own its memory buffer: it's either a memory
644 // buffer that backs the original expression (e.g. block is just a view into a
645 // Tensor), or a memory buffer allocated with scratch allocator, and in this
646 // case the scratch allocator will deallocate it at the end of block-based
647 // expression execution.
648 //
649 // If the block was evaluated directly into the output buffer, and strides in
650 // the output buffer do not match block strides, the TensorMap expression will
651 // be invalid, and should never be used in block assignment or any other tensor
652 // expression.
653 
654 template <typename Scalar, int NumDims, int Layout,
655  typename IndexType = Eigen::Index>
656 class TensorMaterializedBlock {
657  public:
658  typedef DSizes<IndexType, NumDims> Dimensions;
659  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
660 
661  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
662  const Dimensions& dimensions, bool valid_expr = true)
663  : m_kind(kind),
664  m_data(data),
665  m_dimensions(dimensions),
666  m_expr(m_data, m_dimensions),
667  m_valid_expr(valid_expr) {
671  }
672 
673  TensorBlockKind kind() const { return m_kind; }
674  // NOTE(ezhulenev): Returning XprType by value like in other block types
675  // causes asan failures. The theory is that XprType::Nested doesn't work
676  // properly for TensorMap.
677  const XprType& expr() const {
678  eigen_assert(m_valid_expr);
679  return m_expr;
680  }
681  const Scalar* data() const { return m_data; }
682  void cleanup() {}
683 
684  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
685 
686  // TensorMaterializedBlock can be backed by different types of storage:
687  //
688  // (1) Contiguous block of memory allocated with scratch allocator.
689  // (2) Contiguous block of memory reused from tensor block descriptor
690  // destination buffer.
691  // (3) Strided block of memory reused from tensor block descriptor
692  // destination buffer.
693  //
694  class Storage {
695  public:
696  Scalar* data() const { return m_data; }
697  const Dimensions& dimensions() const { return m_dimensions; }
698  const Dimensions& strides() const { return m_strides; }
699 
700  TensorMaterializedBlock AsTensorMaterializedBlock() const {
701  return TensorMaterializedBlock(
702  m_materialized_in_output
703  ? internal::TensorBlockKind::kMaterializedInOutput
704  : internal::TensorBlockKind::kMaterializedInScratch,
705  m_data, m_dimensions, !m_strided_storage);
706  }
707 
708  private:
710 
711  Storage(Scalar* data, const Dimensions& dimensions,
712  const Dimensions& strides, bool materialized_in_output,
713  bool strided_storage)
714  : m_data(data),
715  m_dimensions(dimensions),
716  m_strides(strides),
717  m_materialized_in_output(materialized_in_output),
718  m_strided_storage(strided_storage) {}
719 
721  Dimensions m_dimensions;
722  Dimensions m_strides;
725  };
726 
727  // Creates a storage for materialized block either from the block descriptor
728  // destination buffer, or allocates a new buffer with scratch allocator.
729  template <typename TensorBlockScratch>
730  static EIGEN_STRONG_INLINE Storage prepareStorage(
731  TensorBlockDesc& desc, TensorBlockScratch& scratch,
732  bool allow_strided_storage = false) {
733  // Try to reuse destination as an output block buffer.
734  typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
735 
736  if (desc.destination().kind() == DestinationBuffer::kContiguous) {
737  Scalar* buffer = desc.destination().template data<Scalar>();
738  desc.DropDestinationBuffer();
739  return Storage(buffer, desc.dimensions(),
740  internal::strides<Layout>(desc.dimensions()),
741  /*materialized_in_output=*/true,
742  /*strided_storage=*/false);
743 
744  } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
745  allow_strided_storage) {
746  Scalar* buffer = desc.destination().template data<Scalar>();
747  desc.DropDestinationBuffer();
748  return Storage(buffer, desc.dimensions(), desc.destination().strides(),
749  /*materialized_in_output=*/true, /*strided_storage=*/true);
750 
751  } else {
752  void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
753  return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
754  internal::strides<Layout>(desc.dimensions()),
755  /*materialized_in_output=*/false,
756  /*strided_storage=*/false);
757  }
758  }
759 
760  // Creates a materialized block for the given descriptor from a memory buffer.
761  template <typename DataDimensions, typename TensorBlockScratch>
762  static EIGEN_STRONG_INLINE TensorMaterializedBlock materialize(
763  const Scalar* data, const DataDimensions& data_dims,
764  TensorBlockDesc& desc, TensorBlockScratch& scratch) {
766 
767  // If the tensor block dimensions cover a contiguous block of the underlying
768  // memory, we can skip block buffer memory allocation, and construct a block
769  // from existing `data` memory buffer.
770  //
771  // Example: (RowMajor layout)
772  // data_dims: [11, 12, 13, 14]
773  // desc.dimensions(): [1, 1, 3, 14]
774  //
775  // In this case we can construct a TensorBlock starting at
776  // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
777  static const bool is_col_major = Layout == ColMajor;
778 
779  // Find out how many inner dimensions have a matching size.
780  int num_matching_inner_dims = 0;
781  for (int i = 0; i < NumDims; ++i) {
782  int dim = is_col_major ? i : NumDims - i - 1;
783  if (data_dims[dim] != desc.dimensions()[dim]) break;
784  ++num_matching_inner_dims;
785  }
786 
787  // All the outer dimensions must be of size `1`, except a single dimension
788  // before the matching inner dimension (`3` in the example above).
789  bool can_use_direct_access = true;
790  for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
791  int dim = is_col_major ? i : NumDims - i - 1;
792  if (desc.dimension(dim) != 1) {
793  can_use_direct_access = false;
794  break;
795  }
796  }
797 
798  if (can_use_direct_access) {
799  const Scalar* block_start = data + desc.offset();
800  return TensorMaterializedBlock(internal::TensorBlockKind::kView,
801  block_start, desc.dimensions());
802 
803  } else {
804  // Reuse destination buffer or allocate new buffer with scratch allocator.
805  const Storage storage = prepareStorage(desc, scratch);
806 
807  typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
808  TensorBlockIO;
809  typedef typename TensorBlockIO::Dst TensorBlockIODst;
810  typedef typename TensorBlockIO::Src TensorBlockIOSrc;
811 
812  TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
813  data, desc.offset());
814  TensorBlockIODst dst(storage.dimensions(), storage.strides(),
815  storage.data());
816 
817  TensorBlockIO::Copy(dst, src);
818  return storage.AsTensorMaterializedBlock();
819  }
820  }
821 
822  private:
823  TensorBlockKind m_kind;
824  const Scalar* m_data;
825  Dimensions m_dimensions;
826  XprType m_expr;
827  bool m_valid_expr;
828 };
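// Usage sketch: materializing the block described by `desc` from an existing
// tensor buffer (`tensor_data`, `tensor_dims`, `desc` and `scratch` are
// hypothetical). If the block covers a contiguous range of `tensor_data` the
// result is a `kView` block; otherwise the data is copied into the descriptor's
// destination buffer or into `scratch`.
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> TensorBlock;
//   TensorBlock block =
//       TensorBlock::materialize(tensor_data, tensor_dims, desc, scratch);
//   const TensorBlock::XprType& expr = block.expr();  // TensorMap over data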
829 
830 // -------------------------------------------------------------------------- //
831 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
832 // functor to the blocks produced by the underlying Tensor expression.
833 
834 template <typename UnaryOp, typename ArgTensorBlock>
835 class TensorCwiseUnaryBlock {
836  static const bool NoArgBlockAccess =
837  internal::is_void<typename ArgTensorBlock::XprType>::value;
838 
839  public:
840  typedef typename conditional<
841  NoArgBlockAccess, void,
842  TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
843  type XprType;
844 
845  typedef typename XprScalar<XprType>::type Scalar;
846 
847  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
848  : m_arg_block(arg_block), m_functor(functor) {}
849 
850  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
851 
852  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
853  const Scalar* data() const { return NULL; }
854  void cleanup() { m_arg_block.cleanup(); }
855 
856  private:
857  ArgTensorBlock m_arg_block;
858  UnaryOp m_functor;
859 };
860 
861 // -------------------------------------------------------------------------- //
862 // TensorCwiseBinaryBlock is a lazy tensor expression block that applies BinaryOp
863 // functor to the blocks produced by the underlying Tensor expression.
864 
865 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
866 class TensorCwiseBinaryBlock {
867  static const bool NoArgBlockAccess =
868  internal::is_void<typename LhsTensorBlock::XprType>::value ||
869  internal::is_void<typename RhsTensorBlock::XprType>::value;
870 
871  public:
872  typedef typename conditional<
873  NoArgBlockAccess, void,
874  TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
875  const typename RhsTensorBlock::XprType> >::type
876  XprType;
877 
878  typedef typename XprScalar<XprType>::type Scalar;
879 
880  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
881  const RhsTensorBlock& right_block,
882  const BinaryOp& functor)
883  : m_left_block(left_block),
884  m_right_block(right_block),
885  m_functor(functor) {}
886 
887  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
888 
889  XprType expr() const {
890  return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
891  }
892 
893  const Scalar* data() const { return NULL; }
894 
895  void cleanup() {
896  m_left_block.cleanup();
897  m_right_block.cleanup();
898  }
899 
900  private:
901  LhsTensorBlock m_left_block;
902  RhsTensorBlock m_right_block;
903  BinaryOp m_functor;
904 };
905 
906 // -------------------------------------------------------------------------- //
907 // TensorUnaryExprBlock is a lazy tensor expression block that can construct
908 // an arbitrary tensor expression from a block of the underlying type (this is a
909 // generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
910 
911 template <typename BlockFactory, typename ArgTensorBlock>
912 class TensorUnaryExprBlock {
913  typedef typename ArgTensorBlock::XprType ArgXprType;
914  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
915 
916  public:
917  typedef typename conditional<
918  NoArgBlockAccess, void,
919  typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
920 
921  typedef typename XprScalar<XprType>::type Scalar;
922 
923  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
924  const BlockFactory& factory)
925  : m_arg_block(arg_block), m_factory(factory) {}
926 
927  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
928  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
929  const Scalar* data() const { return NULL; }
930  void cleanup() { m_arg_block.cleanup(); }
931 
932  private:
933  ArgTensorBlock m_arg_block;
934  BlockFactory m_factory;
935 };
936 
937 // -------------------------------------------------------------------------- //
938 // TensorTernaryExprBlock is a lazy tensor expression block that can construct
939 // an arbitrary tensor expression from three blocks of the underlying type.
940 
941 template <typename BlockFactory, typename Arg1TensorBlock,
942  typename Arg2TensorBlock, typename Arg3TensorBlock>
943 class TensorTernaryExprBlock {
944  typedef typename Arg1TensorBlock::XprType Arg1XprType;
945  typedef typename Arg2TensorBlock::XprType Arg2XprType;
946  typedef typename Arg3TensorBlock::XprType Arg3XprType;
947 
948  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
949  internal::is_void<Arg2XprType>::value ||
950  internal::is_void<Arg3XprType>::value;
951 
952  public:
953  typedef typename conditional<
954  NoArgBlockAccess, void,
955  typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
956  Arg3XprType>::type>::type XprType;
957 
958  typedef typename XprScalar<XprType>::type Scalar;
959 
960  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
961  const Arg2TensorBlock& arg2_block,
962  const Arg3TensorBlock& arg3_block,
963  const BlockFactory& factory)
964  : m_arg1_block(arg1_block),
965  m_arg2_block(arg2_block),
966  m_arg3_block(arg3_block),
967  m_factory(factory) {}
968 
969  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
970  XprType expr() const {
971  return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
972  m_arg3_block.expr());
973  }
974  const Scalar* data() const { return NULL; }
975  void cleanup() {
976  m_arg1_block.cleanup();
977  m_arg2_block.cleanup();
978  m_arg3_block.cleanup();
979  }
980 
981  private:
982  Arg1TensorBlock m_arg1_block;
983  Arg2TensorBlock m_arg2_block;
984  Arg3TensorBlock m_arg3_block;
985  BlockFactory m_factory;
986 };
987 
988 // -------------------------------------------------------------------------- //
989 // StridedLinearBufferCopy provides a method to copy data between two linear
990 // buffers with different strides, with optimized paths for scatter/gather.
991 
992 template <typename Scalar, typename IndexType>
993 class StridedLinearBufferCopy {
994  typedef typename packet_traits<Scalar>::type Packet;
995  enum {
996  Vectorizable = packet_traits<Scalar>::Vectorizable,
997  PacketSize = packet_traits<Scalar>::size
998  };
999 
1000  public:
1001  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
1002  enum class Kind {
1003  Linear = 0, // src_stride == 1 && dst_stride == 1
1004  Scatter = 1, // src_stride == 1 && dst_stride != 1
1005  FillLinear = 2, // src_stride == 0 && dst_stride == 1
1006  FillScatter = 3, // src_stride == 0 && dst_stride != 1
1007  Gather = 4, // dst_stride == 1
1008  Random = 5 // everything else
1009  };
1010 
1011  struct Dst {
1012  Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
1013 
1014  IndexType offset;
1015  IndexType stride;
1016  Scalar* data;
1017  };
1018 
1019  struct Src {
1020  Src(IndexType o, IndexType s, const Scalar* d)
1021  : offset(o), stride(s), data(d) {}
1022 
1023  IndexType offset;
1024  IndexType stride;
1025  const Scalar* data;
1026  };
1027 
1028  template <typename StridedLinearBufferCopy::Kind kind>
1029  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
1030  const Src& src,
1031  const size_t count) {
1032  Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
1033  src.data);
1034  }
1035 
1036  private:
1037  template <typename StridedLinearBufferCopy::Kind kind>
1038  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1039  const IndexType count, const IndexType dst_offset,
1040  const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
1041  const IndexType src_offset, const IndexType src_stride,
1042  const Scalar* EIGEN_RESTRICT src_data) {
1043  const Scalar* src = &src_data[src_offset];
1044  Scalar* dst = &dst_data[dst_offset];
1045 
1046  if (!Vectorizable) {
1047  for (Index i = 0; i < count; ++i) {
1048  dst[i * dst_stride] = src[i * src_stride];
1049  }
1050  return;
1051  }
1052 
1053  const IndexType vectorized_size = count - PacketSize;
1054  IndexType i = 0;
1055 
1056  if (kind == StridedLinearBufferCopy::Kind::Linear) {
1057  // ******************************************************************** //
1058  // Linear copy from `src` to `dst`.
1059  const IndexType unrolled_size = count - 4 * PacketSize;
1060  eigen_assert(src_stride == 1 && dst_stride == 1);
1061  for (; i <= unrolled_size; i += 4 * PacketSize) {
1062  for (int j = 0; j < 4; ++j) {
1063  Packet p = ploadu<Packet>(src + i + j * PacketSize);
1064  pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1065  }
1066  }
1067  for (; i <= vectorized_size; i += PacketSize) {
1068  Packet p = ploadu<Packet>(src + i);
1069  pstoreu<Scalar, Packet>(dst + i, p);
1070  }
1071  for (; i < count; ++i) {
1072  dst[i] = src[i];
1073  }
1074  // ******************************************************************** //
1075  } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
1076  // Scatter from `src` to `dst`.
1077  eigen_assert(src_stride == 1 && dst_stride != 1);
1078  for (; i <= vectorized_size; i += PacketSize) {
1079  Packet p = ploadu<Packet>(src + i);
1080  pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1081  }
1082  for (; i < count; ++i) {
1083  dst[i * dst_stride] = src[i];
1084  }
1085  // ******************************************************************** //
1086  } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
1087  // Fill `dst` with value at `*src`.
1088  eigen_assert(src_stride == 0 && dst_stride == 1);
1089  const IndexType unrolled_size = count - 4 * PacketSize;
1090  Packet p = pload1<Packet>(src);
1091  for (; i <= unrolled_size; i += 4 * PacketSize) {
1092  for (int j = 0; j < 4; ++j) {
1093  pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1094  }
1095  }
1096  for (; i <= vectorized_size; i += PacketSize) {
1097  pstoreu<Scalar, Packet>(dst + i, p);
1098  }
1099  for (; i < count; ++i) {
1100  dst[i] = *src;
1101  }
1102  // ******************************************************************** //
1103  } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
1104  // Scatter `*src` into `dst`.
1105  eigen_assert(src_stride == 0 && dst_stride != 1);
1106  Packet p = pload1<Packet>(src);
1107  for (; i <= vectorized_size; i += PacketSize) {
1108  pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1109  }
1110  for (; i < count; ++i) {
1111  dst[i * dst_stride] = *src;
1112  }
1113  // ******************************************************************** //
1114  } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
1115  // Gather from `src` into `dst`.
1116  eigen_assert(dst_stride == 1);
1117  for (; i <= vectorized_size; i += PacketSize) {
1118  Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
1119  pstoreu<Scalar, Packet>(dst + i, p);
1120  }
1121  for (; i < count; ++i) {
1122  dst[i] = src[i * src_stride];
1123  }
1124  // ******************************************************************** //
1125  } else if (kind == StridedLinearBufferCopy::Kind::Random) {
1126  // Random.
1127  for (; i < count; ++i) {
1128  dst[i * dst_stride] = src[i * src_stride];
1129  }
1130  } else {
1131  eigen_assert(false);
1132  }
1133  }
1134 };
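// Usage sketch: scattering 8 contiguous floats from `src_buf` into every 3rd
// element of `dst_buf` (both buffers are hypothetical and suitably sized).
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> LinCopy;
//   LinCopy::Run<LinCopy::Kind::Scatter>(
//       LinCopy::Dst(/*offset=*/0, /*stride=*/3, dst_buf),
//       LinCopy::Src(/*offset=*/0, /*stride=*/1, src_buf),
//       /*count=*/8);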
1135 
1136 // -------------------------------------------------------------------------- //
1137 // TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
1138 // block. It's possible to specify a src->dst dimension mapping for the copy.
1139 // Dimensions of `dst` specify how many elements have to be copied; for the
1140 // `src` we only need to know the strides to navigate the source memory buffer.
1141 
1142 template <typename Scalar, typename IndexType, int NumDims, int Layout>
1143 class TensorBlockIO {
1144  static const bool IsColMajor = (Layout == ColMajor);
1145 
1146  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
1147 
1148  public:
1149  typedef DSizes<IndexType, NumDims> Dimensions;
1150  typedef DSizes<int, NumDims> DimensionsMap;
1151 
1152  struct Dst {
1153  Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
1154  IndexType dst_offset = 0)
1155  : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
1156 
1157  Dimensions dims;
1158  Dimensions strides;
1159  Scalar* data;
1160  IndexType offset;
1161  };
1162 
1163  struct Src {
1164  Src(const Dimensions& src_strides, const Scalar* src,
1165  IndexType src_offset = 0)
1166  : strides(src_strides), data(src), offset(src_offset) {}
1167 
1168  Dimensions strides;
1169  const Scalar* data;
1170  IndexType offset;
1171  };
1172 
1173  // Copies data to `dst` from `src`, using provided dimensions mapping:
1174  //
1175  // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
1176  //
1177  // Returns the number of copied elements.
1178  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
1179  const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
1180  // Copy single scalar value from `src` to `dst`.
1181  if (NumDims == 0) {
1182  *(dst.data + dst.offset) = *(src.data + src.offset);
1183  return 1;
1184  }
1185 
1186  // Both `dst` and `src` must have contiguous innermost dimension. We also
1187  // accept the special case with stride '0', because it's used as a trick to
1188  // implement broadcasting.
1189  {
1190  int inner_dim = IsColMajor ? 0 : NumDims - 1;
1191  EIGEN_UNUSED_VARIABLE(inner_dim);
1192  eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
1193  eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
1194  }
1195 
1196  // Give a shorter name to `dst_to_src_dim_map`.
1197  const DimensionsMap& dim_map = dst_to_src_dim_map;
1198 
1199  // Do not squeeze reordered inner dimensions.
1200  int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
1201 
1202  // NOTE: We find the innermost dimension (contiguous in memory) in the dst
1203  // block, and we write data linearly into that dimension, reading it from
1204  // the src. If dimensions are reordered, we might end up reading data from
1205  // the src with `stride != 1`.
1206  //
1207  // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
1208  // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
1209 
1210  // Find the innermost dimension in the dst whose size is not 1. This is the
1211  // effective inner dim.
1212  int num_size_one_inner_dims = 0;
1213  for (int i = 0; i < num_squeezable_dims; ++i) {
1214  const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1215  if (dst.dims[dst_dim] != 1) break;
1216  num_size_one_inner_dims++;
1217  }
1218 
1219  // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
1220  if (num_size_one_inner_dims == NumDims) {
1221  *(dst.data + dst.offset) = *(src.data + src.offset);
1222  return 1;
1223  }
1224 
1225  // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
1226  const int dst_stride1_dim = IsColMajor
1227  ? num_size_one_inner_dims
1228  : NumDims - num_size_one_inner_dims - 1;
1229 
1230  // Dimension in the src that corresponds to the dst innermost dimension.
1231  const int src_dim_for_dst_stride1_dim =
1232  NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
1233 
1234  // Size of the innermost dimension (length of contiguous blocks of memory).
1235  IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
1236 
1237  // Squeeze multiple inner dims into one if they are contiguous in `dst` and
1238  // `src` memory, so we can do less linear copy calls.
1239  for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
1240  const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1241  const IndexType dst_stride = dst.strides[dst_dim];
1242  const IndexType src_stride = src.strides[dim_map[dst_dim]];
1243  if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
1244  dst_inner_dim_size *= dst.dims[dst_dim];
1245  ++num_size_one_inner_dims;
1246  } else {
1247  break;
1248  }
1249  }
1250 
1251  // Setup strides to read data from `src` and write to `dst`.
1252  IndexType input_offset = src.offset;
1253  IndexType output_offset = dst.offset;
1254  IndexType input_stride =
1255  NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
1256  IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
1257 
1258  const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
1259  array<BlockIteratorState, at_least_1_dim> it;
1260 
1261  // Initialize block iterator state. Squeeze away any dimension of size 1.
1262  int idx = 0; // currently initialized iterator state index
1263  for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
1264  const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
1265  if (dst.dims[dst_dim] == 1) continue;
1266 
1267  it[idx].size = dst.dims[dst_dim];
1268  it[idx].input_stride = src.strides[dim_map[dst_dim]];
1269  it[idx].output_stride = dst.strides[dst_dim];
1270 
1271  it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
1272  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1273 
1274  idx++;
1275  }
1276 
1277  // Iterate copying data from src to dst.
1278  const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
1279 
1280 #define COPY_INNER_DIM(KIND) \
1281  IndexType num_copied = 0; \
1282  for (num_copied = 0; num_copied < block_total_size; \
1283  num_copied += dst_inner_dim_size) { \
1284  LinCopy::template Run<KIND>( \
1285  typename LinCopy::Dst(output_offset, output_stride, dst.data), \
1286  typename LinCopy::Src(input_offset, input_stride, src.data), \
1287  dst_inner_dim_size); \
1288  \
1289  for (int j = 0; j < idx; ++j) { \
1290  if (++it[j].count < it[j].size) { \
1291  input_offset += it[j].input_stride; \
1292  output_offset += it[j].output_stride; \
1293  break; \
1294  } \
1295  it[j].count = 0; \
1296  input_offset -= it[j].input_span; \
1297  output_offset -= it[j].output_span; \
1298  } \
1299  } \
1300  return num_copied;
1301 
1302  if (input_stride == 1 && output_stride == 1) {
1303  COPY_INNER_DIM(LinCopy::Kind::Linear);
1304  } else if (input_stride == 1 && output_stride != 1) {
1305  COPY_INNER_DIM(LinCopy::Kind::Scatter);
1306  } else if (input_stride == 0 && output_stride == 1) {
1307  COPY_INNER_DIM(LinCopy::Kind::FillLinear);
1308  } else if (input_stride == 0 && output_stride != 1) {
1309  COPY_INNER_DIM(LinCopy::Kind::FillScatter);
1310  } else if (output_stride == 1) {
1311  COPY_INNER_DIM(LinCopy::Kind::Gather);
1312  } else {
1313  COPY_INNER_DIM(LinCopy::Kind::Random);
1314  }
1315 
1316 #undef COPY_INNER_DIM
1317  }
1318 
1319  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
1320  // the number of copied elements.
1321  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
1322  const Src& src) {
1323  DimensionsMap dst_to_src_map;
1324  for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
1325  return Copy(dst, src, dst_to_src_map);
1326  }
1327 
1328  private:
1329  struct BlockIteratorState {
1330  BlockIteratorState()
1331  : size(0),
1332  count(0),
1333  input_stride(0),
1334  output_stride(0),
1335  input_span(0),
1336  output_span(0) {}
1337 
1338  IndexType size;
1339  IndexType count;
1340  IndexType input_stride;
1341  IndexType output_stride;
1342  IndexType input_span;
1343  IndexType output_span;
1344  };
1345 
1346  // Compute how many inner dimensions it's allowed to squeeze when doing IO
1347  // between two tensor blocks. It's safe to squeeze inner dimensions only
1348  // if they are not reordered.
1349  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
1350  int num_squeezable_dims = 0;
1351  for (int i = 0; i < NumDims; ++i) {
1352  const int dim = IsColMajor ? i : NumDims - i - 1;
1353  if (dim_map[dim] != dim) break;
1354  num_squeezable_dims++;
1355  }
1356  return num_squeezable_dims;
1357  }
1358 };
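// Usage sketch: copying a 2x3 column-major block from a source stored with its
// two dimensions swapped (`dst_buf` and `src_buf` are hypothetical buffers).
//
//   typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> IO;
//   IO::Dimensions dst_dims(2, 3);
//   IO::Dst dst(dst_dims, internal::strides<ColMajor>(dst_dims), dst_buf);
//   IO::Src src(internal::strides<ColMajor>(IO::Dimensions(3, 2)), src_buf);
//   IO::DimensionsMap map;
//   map[0] = 1;  // dst dimension 0 reads from src dimension 1
//   map[1] = 0;
//   IO::Copy(dst, src, map);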
1359 
1360 // -------------------------------------------------------------------------- //
1361 // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
1362 // a Tensor block defined by `desc`, backed by a memory buffer at `target`.
1363 //
1364 // Currently there is no way to write from a Tensor expression to a block of
1365 // memory, if dimensions are reordered. If you need to do that, you should
1366 // materialize a Tensor block expression into a memory buffer, and then use
1367 // TensorBlockIO to copy data between two memory buffers with a custom
1368 // `target->src` dimension map (see definition above).
1369 //
1370 // Also currently the innermost dimension of `target` must have a stride '1'
1371 // (contiguous in memory). This restriction could be lifted with a `pscatter`,
1372 // but in practice it's never needed, and there is a similar TensorBlockIO
1373 // workaround for that.
1374 //
1375 // TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
1376 // where `src` is a tensor expression. Explore if it is possible to rewrite IO
1377 // to use expressions instead of pointers, and after that TensorBlockAssignment
1378 // will become an alias to IO.
1379 template <typename Scalar, int NumDims, typename TensorBlockExpr,
1380  typename IndexType = Eigen::Index>
1381 class TensorBlockAssignment {
1382  // We will use coeff/packet path to evaluate block expressions.
1383  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
1384  TensorBlockEvaluator;
1385 
1386  typedef DSizes<IndexType, NumDims> Dimensions;
1387 
1388  enum {
1389  Vectorizable = packet_traits<Scalar>::Vectorizable,
1390  PacketSize = packet_traits<Scalar>::size
1391  };
1392 
1393  template <bool Vectorizable, typename Evaluator>
1394  struct InnerDimAssign {
1395  EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1396  const Evaluator& eval,
1397  IndexType eval_offset) {
1398  for (IndexType i = 0; i < count; ++i) {
1399  target[i] = eval.coeff(eval_offset + i);
1400  }
1401  }
1402  };
1403 
1404  template <typename Evaluator>
1405  struct InnerDimAssign<true, Evaluator> {
1406  EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1407  const Evaluator& eval,
1408  IndexType eval_offset) {
1409  typedef typename packet_traits<Scalar>::type Packet;
1410 
1411  const IndexType unrolled_size = count - 4 * PacketSize;
1412  const IndexType vectorized_size = count - PacketSize;
1413  IndexType i = 0;
1414 
1415  for (; i <= unrolled_size; i += 4 * PacketSize) {
1416  for (int j = 0; j < 4; ++j) {
1417  const IndexType idx = eval_offset + i + j * PacketSize;
1418  Packet p = eval.template packet<Unaligned>(idx);
1419  pstoreu<Scalar>(target + i + j * PacketSize, p);
1420  }
1421  }
1422 
1423  for (; i <= vectorized_size; i += PacketSize) {
1424  Packet p = eval.template packet<Unaligned>(eval_offset + i);
1425  pstoreu<Scalar>(target + i, p);
1426  }
1427 
1428  for (; i < count; ++i) {
1429  target[i] = eval.coeff(eval_offset + i);
1430  }
1431  }
1432  };
1433 
1434  public:
1435  struct Target {
1436  Target(const Dimensions& target_dims, const Dimensions& target_strides,
1437  Scalar* target_data, IndexType target_offset = 0)
1438  : dims(target_dims),
1439  strides(target_strides),
1440  data(target_data),
1441  offset(target_offset) {}
1442 
1443  Dimensions dims;
1444  Dimensions strides;
1445  Scalar* data;
1446  IndexType offset;
1447  };
1448 
1449  static Target target(const Dimensions& target_dims,
1450  const Dimensions& target_strides, Scalar* target_data,
1451  IndexType target_offset = 0) {
1452  return Target(target_dims, target_strides, target_data, target_offset);
1453  }
1454 
1455  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
1456  static Target target(
1457  const DSizes<TargetDimsIndexType, NumDims>& target_dims,
1458  const DSizes<TargetStridesIndexType, NumDims>& target_strides,
1459  Scalar* target_data, IndexType target_offset = 0) {
1460  // DSizes constructor will do index type promotion if it's safe.
1461  return Target(Dimensions(target_dims), Dimensions(target_strides),
1462  target_data, target_offset);
1463  }
1464 
1465  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1466  const Target& target, const TensorBlockExpr& expr) {
1467  // Prepare evaluator for block expression.
1468  DefaultDevice default_device;
1469  TensorBlockEvaluator eval(expr, default_device);
1470 
1471  // Tensor block expression dimension should match destination dimensions.
1472  eigen_assert(dimensions_match(target.dims, eval.dimensions()));
1473 
1474  static const int Layout = TensorBlockEvaluator::Layout;
1475  static const bool is_col_major = Layout == ColMajor;
1476 
1477  // Initialize output inner dimension size based on a layout.
1478  const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
1479  const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
1480  IndexType output_inner_dim_size = target.dims[inner_dim_idx];
1481 
1482  // Target inner dimension stride must be '1'.
1483  eigen_assert(target.strides[inner_dim_idx] == 1);
1484 
1485  // Squeeze multiple inner dims into one if they are contiguous in `target`.
1486  IndexType num_squeezed_dims = 0;
1487  for (Index i = 1; i < NumDims; ++i) {
1488  const Index dim = is_col_major ? i : NumDims - i - 1;
1489  const IndexType target_stride = target.strides[dim];
1490 
1491  if (output_inner_dim_size == target_stride) {
1492  output_inner_dim_size *= target.dims[dim];
1493  num_squeezed_dims++;
1494  } else {
1495  break;
1496  }
1497  }
1498 
1499  // Initialize output block iterator state. Dimensions in this array are
1500  // always in inner_most -> outer_most order (col major layout).
1502 
1503  int idx = 0; // currently initialized iterator state index
1504  for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
1505  const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
1506 
1507  it[idx].count = 0;
1508  it[idx].size = target.dims[dim];
1509  it[idx].output_stride = target.strides[dim];
1510  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1511  idx++;
1512  }
1513 
1514  // We read block expression from the beginning, and start writing data to
1515  // `target` at given offset.
1516  IndexType input_offset = 0;
1517  IndexType output_offset = target.offset;
1518 
1519  // Iterate copying data from `eval` to `target`.
1520  for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
1521  // Assign to `target` at current offset.
1522  InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
1523  TensorBlockEvaluator>::Run(target.data + output_offset,
1524  output_inner_dim_size, eval,
1525  input_offset);
1526 
1527  // Move input offset forward by the number of assigned coefficients.
1528  input_offset += output_inner_dim_size;
1529 
1530  // Update index.
1531  for (int j = 0; j < idx; ++j) {
1532  if (++it[j].count < it[j].size) {
1533  output_offset += it[j].output_stride;
1534  break;
1535  }
1536  it[j].count = 0;
1537  output_offset -= it[j].output_span;
1538  }
1539  }
1540  }
1541 
1542  private:
1543  struct BlockIteratorState {
1544  BlockIteratorState()
1545  : count(0), size(0), output_stride(0), output_span(0) {}
1546 
1547  IndexType count;
1548  IndexType size;
1549  IndexType output_stride;
1550  IndexType output_span;
1551  };
1552 };
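// Usage sketch: evaluating a lazy block expression `expr` (e.g. the XprType of
// one of the lazy block types above) into a contiguous column-major 2x3 buffer
// (`Expression`, `expr` and `target_buf` are hypothetical).
//
//   typedef TensorBlockAssignment<float, 2, Expression> Assign;
//   DSizes<Eigen::Index, 2> dims(2, 3);
//   Assign::Run(
//       Assign::target(dims, internal::strides<ColMajor>(dims), target_buf),
//       expr);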
1553 
1554 // -------------------------------------------------------------------------- //
1555 
1556 } // namespace internal
1557 } // namespace Eigen
1558 
1559 #endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
Definition: TensorBlock.h:1178
conditional< NoArgBlockAccess, void, typename BlockFactory::template XprType< Arg1XprType, Arg2XprType, Arg3XprType >::type >::type XprType
Definition: TensorBlock.h:956
DSizes< IndexType, NumDims > m_block_dimensions
Definition: TensorBlock.h:508
int data[]
TensorBlockDescriptor WithOffset(IndexType offset) const
Definition: TensorBlock.h:331
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
RealScalar s
TensorEvaluator< const TensorBlockExpr, DefaultDevice > TensorBlockEvaluator
Definition: TensorBlock.h:1384
TensorMaterializedBlock(TensorBlockKind kind, const Scalar *data, const Dimensions &dimensions, bool valid_expr=true)
Definition: TensorBlock.h:661
Dst(const Dimensions &dst_dims, const Dimensions &dst_strides, Scalar *dst, IndexType dst_offset=0)
Definition: TensorBlock.h:1153
static EIGEN_ALWAYS_INLINE void Run(Scalar *target, IndexType count, const Evaluator &eval, IndexType eval_offset)
Definition: TensorBlock.h:1395
#define EIGEN_RESTRICT
Definition: Macros.h:1160
#define NULL
Definition: ccolamd.c:609
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:1386
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:921
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
TensorCwiseUnaryBlock(const ArgTensorBlock &arg_block, const UnaryOp &functor)
Definition: TensorBlock.h:847
EIGEN_CONSTEXPR Index size(const T &x)
Definition: Meta.h:479
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
EIGEN_DEVICE_FUNC TensorBlockResourceRequirements & addCostPerCoeff(TensorOpCost cost)
Definition: TensorBlock.h:145
static EIGEN_STRONG_INLINE TensorMaterializedBlock materialize(const Scalar *data, const DataDimensions &data_dims, TensorBlockDesc &desc, TensorBlockScratch &scratch)
Definition: TensorBlock.h:762
StridedLinearBufferCopy< Scalar, IndexType > LinCopy
Definition: TensorBlock.h:1146
void AddDestinationBuffer(Scalar *dst_base, const DSizes< DstStridesIndexType, NumDims > &dst_strides)
Definition: TensorBlock.h:313
DSizes< int, NumDims > DimensionsMap
Definition: TensorBlock.h:1150
internal::TensorBlockDescriptor< NumDims, IndexType > TensorBlockDesc
Definition: TensorBlock.h:684
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const
Definition: TensorBlock.h:361
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size)
Definition: TensorBlock.h:163
CwiseBinaryOp< internal::scalar_sum_op< double, double >, const CpyMatrixXd, const CpyMatrixXd > XprType
Definition: nestbyvalue.cpp:15
const Dimensions & dimensions() const
Definition: TensorBlock.h:299
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset, const IndexType dst_stride, Scalar *EIGEN_RESTRICT dst_data, const IndexType src_offset, const IndexType src_stride, const Scalar *EIGEN_RESTRICT src_data)
Definition: TensorBlock.h:1038
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:190
void AddDestinationBuffer(Scalar *dst_base, const Dimensions &dst_strides)
Definition: TensorBlock.h:306
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:958
ArgTensorBlock::XprType ArgXprType
Definition: TensorBlock.h:913
Arg2TensorBlock::XprType Arg2XprType
Definition: TensorBlock.h:945
float * p
static Target target(const DSizes< TargetDimsIndexType, NumDims > &target_dims, const DSizes< TargetStridesIndexType, NumDims > &target_strides, Scalar *target_data, IndexType target_offset=0)
Definition: TensorBlock.h:1456
static int NumSqueezableInnerDims(const DimensionsMap &dim_map)
Definition: TensorBlock.h:1349
conditional< NoArgBlockAccess, void, typename BlockFactory::template XprType< ArgXprType >::type >::type XprType
Definition: TensorBlock.h:919
static EIGEN_DEVICE_FUNC TensorBlockResourceRequirements skewed(size_t size_in_bytes)
Definition: TensorBlock.h:123
static EIGEN_DEVICE_FUNC TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, size_t size_in_bytes)
Definition: TensorBlock.h:100
std::vector< size_t > Indices
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost)
Definition: TensorBlock.h:177
internal::nested_eval< T, 1 >::type eval(const T &xpr)
Dst(IndexType o, IndexType s, Scalar *d)
Definition: TensorBlock.h:1012
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:878
TensorUnaryExprBlock(const ArgTensorBlock &arg_block, const BlockFactory &factory)
Definition: TensorBlock.h:923
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs)
Definition: TensorBlock.h:169
TensorBlockResourceRequirements m_requirements
Definition: TensorBlock.h:506
Jet< T, N > pow(const Jet< T, N > &f, double g)
Definition: jet.h:570
Generic expression where a coefficient-wise unary operator is applied to an expression.
Definition: CwiseUnaryOp.h:55
Arg1TensorBlock::XprType Arg1XprType
Definition: TensorBlock.h:944
conditional< NoArgBlockAccess, void, TensorCwiseUnaryOp< UnaryOp, const typename ArgTensorBlock::XprType > >::type XprType
Definition: TensorBlock.h:843
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockResourceRequirements any()
Definition: TensorBlock.h:155
static DestinationBufferKind kind(const TensorBlockDescriptor &desc, const Dimensions &strides)
Definition: TensorBlock.h:264
const std::vector< size_t > dimensions
conditional< NoArgBlockAccess, void, TensorCwiseBinaryOp< BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType > >::type XprType
Definition: TensorBlock.h:876
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T divup(const X x, const Y y)
Definition: TensorMeta.h:30
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockResourceRequirements merge(const TensorBlockResourceRequirements &lhs, const TensorBlockResourceRequirements &rhs)
Definition: TensorBlock.h:138
std::ptrdiff_t j
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:1076
Definition: pytypes.h:1370
Target(const Dimensions &target_dims, const Dimensions &target_strides, Scalar *target_data, IndexType target_offset=0)
Definition: TensorBlock.h:1436
DSizes< IndexType, NumDims > m_block_strides
Definition: TensorBlock.h:512
TensorBlockDescriptor(const IndexType offset, const Dimensions &dimensions)
Definition: TensorBlock.h:293
std::vector< Allocation > m_allocations
Definition: TensorBlock.h:583


gtsam
Author(s):
autogenerated on Tue Jul 4 2023 02:36:41