TensorBroadcasting.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
12 
13 namespace Eigen {
14 
15 /** \class TensorBroadcasting
16   * \ingroup CXX11_Tensor_Module
17   *
18   * \brief Tensor broadcasting class.
19   *
20   *
21   */
22 namespace internal {
23 template<typename Broadcast, typename XprType>
24 struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
25 {
26  typedef typename XprType::Scalar Scalar;
27  typedef traits<XprType> XprTraits;
28  typedef typename XprTraits::StorageKind StorageKind;
29  typedef typename XprTraits::Index Index;
30  typedef typename XprType::Nested Nested;
31  typedef typename remove_reference<Nested>::type _Nested;
32  static const int NumDimensions = XprTraits::NumDimensions;
33  static const int Layout = XprTraits::Layout;
34  typedef typename XprTraits::PointerType PointerType;
35 };
36 
37 template<typename Broadcast, typename XprType>
38 struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
39 {
40  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
41 };
42 
43 template<typename Broadcast, typename XprType>
44 struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
45 {
46  typedef TensorBroadcastingOp<Broadcast, XprType> type;
47 };
48 
49 template <typename Dims>
50 struct is_input_scalar {
51  static const bool value = false;
52 };
53 template <>
54 struct is_input_scalar<Sizes<> > {
55  static const bool value = true;
56 };
57 #ifndef EIGEN_EMULATE_CXX11_META_H
58 template <typename std::ptrdiff_t... Indices>
59 struct is_input_scalar<Sizes<Indices...> > {
60  static const bool value = (Sizes<Indices...>::total_size == 1);
61 };
62 #endif
63 
64 } // end namespace internal
65 
66 
67 
68 template<typename Broadcast, typename XprType>
69 class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
70 {
71  public:
72  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
73  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
74  typedef typename XprType::CoeffReturnType CoeffReturnType;
75  typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
76  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
77  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
78 
79  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
80  : m_xpr(expr), m_broadcast(broadcast) {}
81 
82  EIGEN_DEVICE_FUNC
83  const Broadcast& broadcast() const { return m_broadcast; }
84 
85  EIGEN_DEVICE_FUNC
86  const typename internal::remove_all<typename XprType::Nested>::type&
87  expression() const { return m_xpr; }
88 
89  protected:
90  typename XprType::Nested m_xpr;
91  const Broadcast m_broadcast;
92 };
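// A minimal usage sketch (the sizes and the `bcast` array below are
// illustrative, not part of this file): each output dimension is the
// corresponding input dimension multiplied by its broadcast factor, and
// out(i, j) reads input(i % 2, j % 3).
//
//   Eigen::Tensor<float, 2> input(2, 3);
//   input.setRandom();
//   Eigen::array<Eigen::Index, 2> bcast = {{3, 4}};
//   Eigen::Tensor<float, 2> out = input.broadcast(bcast);  // out is 6 x 12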
93 
94 
95 // Eval as rvalue
96 template<typename Broadcast, typename ArgType, typename Device>
97 struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
98 {
99  typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
100  typedef typename XprType::Index Index;
101  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
102  typedef DSizes<Index, NumDims> Dimensions;
103  typedef typename XprType::Scalar Scalar;
104  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
105  typedef typename XprType::CoeffReturnType CoeffReturnType;
106  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
107  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
108  protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator won't be standard layout;
109  bool isCopy, nByOne, oneByN;
110  public:
111  typedef StorageMemory<CoeffReturnType, Device> Storage;
112  typedef typename Storage::Type EvaluatorPointerType;
113 
114  enum {
115  IsAligned = true,
116  PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
117  BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
118  PreferBlockAccess = true,
119  Layout = TensorEvaluator<ArgType, Device>::Layout,
120  RawAccess = false
121  };
122 
123  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
124 
125  // We do block based broadcasting using a trick with 2x tensor rank and 0
126  // strides. See the block method implementation for details.
127  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
128 
129  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
130  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
131  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
132 
133  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
134  ArgTensorBlock;
135 
136  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
137  Layout, Index>
138  TensorBlock;
139  //===--------------------------------------------------------------------===//
140 
141  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
142  : isCopy(false), nByOne(false), oneByN(false),
143  m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
144  {
145 
146  // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
147  // and store the result in a scalar. Instead one should first reshape the scalar into an N-D
148  // tensor (N >= 1) with a single element, and then broadcast.
149  EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
150  const InputDimensions& input_dims = m_impl.dimensions();
151  isCopy = true;
152  for (int i = 0; i < NumDims; ++i) {
153  eigen_assert(input_dims[i] > 0);
154  m_dimensions[i] = input_dims[i] * m_broadcast[i];
155  if (m_broadcast[i] != 1) {
156  isCopy = false;
157  }
158  }
159 
160  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
161  m_inputStrides[0] = 1;
162  m_outputStrides[0] = 1;
163  for (int i = 1; i < NumDims; ++i) {
164  m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
165  m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
166  }
167  } else {
168  m_inputStrides[NumDims-1] = 1;
169  m_outputStrides[NumDims-1] = 1;
170  for (int i = NumDims-2; i >= 0; --i) {
171  m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
172  m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
173  }
174  }
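  // Worked example (illustrative values): with ColMajor layout, input_dims = [2, 3]
  // and m_broadcast = [3, 4], the loops above yield m_dimensions = [6, 12],
  // isCopy = false, m_inputStrides = [1, 2] and m_outputStrides = [1, 6].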
175 
176  if (input_dims[0] == 1) {
177  oneByN = true;
178  for (int i = 1; i < NumDims; ++i) {
179  if (m_broadcast[i] != 1) {
180  oneByN = false;
181  break;
182  }
183  }
184  } else if (input_dims[NumDims-1] == 1) {
185  nByOne = true;
186  for (int i = 0; i < NumDims-1; ++i) {
187  if (m_broadcast[i] != 1) {
188  nByOne = false;
189  break;
190  }
191  }
192  }
193 
194  // Handle special formats like NCHW, where the input shape is '[1, N..., 1]' and
195  // the broadcast shape is '[N, 1..., N]'.
196  if (!oneByN && !nByOne) {
197  if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
198  nByOne = true;
199  oneByN = true;
200  for (int i = 1; i < NumDims-1; ++i) {
201  if (m_broadcast[i] != 1) {
202  nByOne = false;
203  oneByN = false;
204  break;
205  }
206  }
207  }
208  }
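  // Illustrative cases for the flags above (made-up shapes): input_dims = [1, 5]
  // with m_broadcast = [7, 1] sets oneByN; input_dims = [5, 1] with
  // m_broadcast = [1, 7] sets nByOne; input_dims = [1, 5, 1] with
  // m_broadcast = [7, 1, 3] (the NCHW-like case) sets both flags.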
209  }
210 
211  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
212 
213  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
214  m_impl.evalSubExprsIfNeeded(NULL);
215  return true;
216  }
217 
218 #ifdef EIGEN_USE_THREADS
219  template <typename EvalSubExprsCallback>
220  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
221  EvaluatorPointerType, EvalSubExprsCallback done) {
222  m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
223  }
224 #endif // EIGEN_USE_THREADS
225 
226  EIGEN_STRONG_INLINE void cleanup() {
227  m_impl.cleanup();
228  }
229 
230  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
231  {
232  if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
233  return m_impl.coeff(0);
234  }
235 
236  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
237  if (isCopy) {
238  return m_impl.coeff(index);
239  } else {
240  return coeffColMajor(index);
241  }
242  } else {
243  if (isCopy) {
244  return m_impl.coeff(index);
245  } else {
246  return coeffRowMajor(index);
247  }
248  }
249  }
250 
251  // TODO: attempt to speed this up. The integer divisions and modulo are slow.
252  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
253  Index inputIndex = 0;
254  EIGEN_UNROLL_LOOP
255  for (int i = NumDims - 1; i > 0; --i) {
256  const Index idx = index / m_outputStrides[i];
257  if (internal::index_statically_eq<Broadcast>(i, 1)) {
258  eigen_assert(idx < m_impl.dimensions()[i]);
259  inputIndex += idx * m_inputStrides[i];
260  } else {
261  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
262  eigen_assert(idx % m_impl.dimensions()[i] == 0);
263  } else {
264  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
265  }
266  }
267  index -= idx * m_outputStrides[i];
268  }
269  if (internal::index_statically_eq<Broadcast>(0, 1)) {
270  eigen_assert(index < m_impl.dimensions()[0]);
271  inputIndex += index;
272  } else {
273  if (internal::index_statically_eq<InputDimensions>(0, 1)) {
274  eigen_assert(index % m_impl.dimensions()[0] == 0);
275  } else {
276  inputIndex += (index % m_impl.dimensions()[0]);
277  }
278  }
279  return inputIndex;
280  }
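  // Worked example for indexColMajor (illustrative values): with input_dims = [2, 3]
  // and m_broadcast = [3, 4], m_outputStrides = [1, 6] and m_inputStrides = [1, 2].
  // For output index 17: idx = 17 / 6 = 2, and 2 % 3 = 2 contributes 2 * 2 = 4;
  // the remainder 17 - 2 * 6 = 5 gives 5 % 2 = 1; so inputIndex = 4 + 1 = 5,
  // i.e. input coefficient (1, 2).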
281 
282  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
283  {
284  return m_impl.coeff(indexColMajor(index));
285  }
286 
287  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
288  Index inputIndex = 0;
289  EIGEN_UNROLL_LOOP
290  for (int i = 0; i < NumDims - 1; ++i) {
291  const Index idx = index / m_outputStrides[i];
292  if (internal::index_statically_eq<Broadcast>(i, 1)) {
293  eigen_assert(idx < m_impl.dimensions()[i]);
294  inputIndex += idx * m_inputStrides[i];
295  } else {
296  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
297  eigen_assert(idx % m_impl.dimensions()[i] == 0);
298  } else {
299  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
300  }
301  }
302  index -= idx * m_outputStrides[i];
303  }
304  if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
305  eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
306  inputIndex += index;
307  } else {
308  if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
309  eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
310  } else {
311  inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
312  }
313  }
314  return inputIndex;
315  }
316 
317  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
318  {
319  return m_impl.coeff(indexRowMajor(index));
320  }
321 
322  template<int LoadMode>
323  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
324  {
325  if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
326  return internal::pset1<PacketReturnType>(m_impl.coeff(0));
327  }
328 
329  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
330  if (isCopy) {
331  #ifdef EIGEN_GPU_COMPILE_PHASE
332  // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speed-up by enforcing
333  // unaligned loads here. The reason is unclear though.
334  return m_impl.template packet<Unaligned>(index);
335  #else
336  return m_impl.template packet<LoadMode>(index);
337  #endif
338  } else if (oneByN && !nByOne) {
339  return packetNByOne<LoadMode>(index);
340  } else if (!oneByN && nByOne) {
341  return packetOneByN<LoadMode>(index);
342  } else if (oneByN && nByOne) {
343  return packetOneByNByOne<LoadMode>(index);
344  } else {
345  return packetColMajor<LoadMode>(index);
346  }
347  } else {
348  if (isCopy) {
349  #ifdef EIGEN_GPU_COMPILE_PHASE
350  // See above.
351  return m_impl.template packet<Unaligned>(index);
352  #else
353  return m_impl.template packet<LoadMode>(index);
354  #endif
355  } else if (oneByN && !nByOne) {
356  return packetOneByN<LoadMode>(index);
357  } else if (!oneByN && nByOne) {
358  return packetNByOne<LoadMode>(index);
359  } else if (oneByN && nByOne) {
360  return packetOneByNByOne<LoadMode>(index);
361  } else {
362  return packetRowMajor<LoadMode>(index);
363  }
364  }
365  }
366 
367  template<int LoadMode>
368  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
369  (Index index) const
370  {
371  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
372  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
373 
374  EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
375  Index startDim, endDim;
376  Index inputIndex, outputOffset, batchedIndex;
377 
378  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
379  startDim = NumDims - 1;
380  endDim = 1;
381  } else {
382  startDim = 0;
383  endDim = NumDims - 2;
384  }
385 
386  batchedIndex = index % m_outputStrides[startDim];
387  inputIndex = batchedIndex / m_outputStrides[endDim];
388  outputOffset = batchedIndex % m_outputStrides[endDim];
389 
390  if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
391  values[0] = m_impl.coeff(inputIndex);
392  return internal::pload1<PacketReturnType>(values);
393  } else {
394  EIGEN_UNROLL_LOOP
395  for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
396  if (outputOffset + cur < m_outputStrides[endDim]) {
397  values[i] = m_impl.coeff(inputIndex);
398  } else {
399  ++inputIndex;
400  inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
401  values[i] = m_impl.coeff(inputIndex);
402  outputOffset = 0;
403  cur = 0;
404  }
405  }
406  return internal::pload<PacketReturnType>(values);
407  }
408  }
409 
410  template<int LoadMode>
411  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
412  {
413  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
414  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
415 
416  Index dim, inputIndex;
417 
418  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
419  dim = NumDims - 1;
420  } else {
421  dim = 0;
422  }
423 
424  inputIndex = index % m_inputStrides[dim];
425  if (inputIndex + PacketSize <= m_inputStrides[dim]) {
426  return m_impl.template packet<Unaligned>(inputIndex);
427  } else {
428  EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
429  EIGEN_UNROLL_LOOP
430  for (int i = 0; i < PacketSize; ++i) {
431  if (inputIndex > m_inputStrides[dim]-1) {
432  inputIndex = 0;
433  }
434  values[i] = m_impl.coeff(inputIndex++);
435  }
436  return internal::pload<PacketReturnType>(values);
437  }
438  }
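  // Illustrative trace (ColMajor, input_dims = [3, 1], m_broadcast = [1, 4],
  // PacketSize = 4): the flattened input [v0, v1, v2] is conceptually repeated
  // 4 times in the output. For index = 2, inputIndex = 2 % 3 = 2 and the gather
  // loop above produces [v2, v0, v1, v2].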
439 
440  template<int LoadMode>
441  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
442  {
443  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
444  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
445 
446  EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
447  Index dim, inputIndex, outputOffset;
448 
449  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
450  dim = 1;
451  } else {
452  dim = NumDims - 2;
453  }
454 
455  inputIndex = index / m_outputStrides[dim];
456  outputOffset = index % m_outputStrides[dim];
457  if (outputOffset + PacketSize <= m_outputStrides[dim]) {
458  values[0] = m_impl.coeff(inputIndex);
459  return internal::pload1<PacketReturnType>(values);
460  } else {
461  EIGEN_UNROLL_LOOP
462  for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
463  if (outputOffset + cur < m_outputStrides[dim]) {
464  values[i] = m_impl.coeff(inputIndex);
465  } else {
466  values[i] = m_impl.coeff(++inputIndex);
467  outputOffset = 0;
468  cur = 0;
469  }
470  }
471  return internal::pload<PacketReturnType>(values);
472  }
473  }
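  // Illustrative trace (ColMajor, input_dims = [1, 3], m_broadcast = [4, 1],
  // PacketSize = 4): each input coefficient uk is repeated 4 times consecutively
  // in the output and m_outputStrides[1] = 4; for index = 6 the code above
  // yields inputIndex = 1, outputOffset = 2 and the packet [u1, u1, u2, u2].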
474 
475  // Ignore the LoadMode and always use unaligned loads since we can't guarantee
476  // the alignment at compile time.
477  template<int LoadMode>
478  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
479  {
480  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
481  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
482 
483  const Index originalIndex = index;
484 
485  Index inputIndex = 0;
486  EIGEN_UNROLL_LOOP
487  for (int i = NumDims - 1; i > 0; --i) {
488  const Index idx = index / m_outputStrides[i];
489  if (internal::index_statically_eq<Broadcast>(i, 1)) {
490  eigen_assert(idx < m_impl.dimensions()[i]);
491  inputIndex += idx * m_inputStrides[i];
492  } else {
493  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
494  eigen_assert(idx % m_impl.dimensions()[i] == 0);
495  } else {
496  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
497  }
498  }
499  index -= idx * m_outputStrides[i];
500  }
501  Index innermostLoc;
502  if (internal::index_statically_eq<Broadcast>(0, 1)) {
503  eigen_assert(index < m_impl.dimensions()[0]);
504  innermostLoc = index;
505  } else {
506  if (internal::index_statically_eq<InputDimensions>(0, 1)) {
507  eigen_assert(index % m_impl.dimensions()[0] == 0);
508  innermostLoc = 0;
509  } else {
510  innermostLoc = index % m_impl.dimensions()[0];
511  }
512  }
513  inputIndex += innermostLoc;
514 
515  // Todo: this could be extended to the second dimension if we're not
516  // broadcasting along the first dimension, and so on.
517  if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
518  return m_impl.template packet<Unaligned>(inputIndex);
519  } else {
520  EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
521  values[0] = m_impl.coeff(inputIndex);
522  EIGEN_UNROLL_LOOP
523  for (int i = 1; i < PacketSize; ++i) {
524  if (innermostLoc + i < m_impl.dimensions()[0]) {
525  values[i] = m_impl.coeff(inputIndex+i);
526  } else {
527  values[i] = coeffColMajor(originalIndex+i);
528  }
529  }
530  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
531  return rslt;
532  }
533  }
534 
535  template<int LoadMode>
536  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
537  {
538  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
539  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
540 
541  const Index originalIndex = index;
542 
543  Index inputIndex = 0;
544  EIGEN_UNROLL_LOOP
545  for (int i = 0; i < NumDims - 1; ++i) {
546  const Index idx = index / m_outputStrides[i];
547  if (internal::index_statically_eq<Broadcast>(i, 1)) {
548  eigen_assert(idx < m_impl.dimensions()[i]);
549  inputIndex += idx * m_inputStrides[i];
550  } else {
551  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
552  eigen_assert(idx % m_impl.dimensions()[i] == 0);
553  } else {
554  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
555  }
556  }
557  index -= idx * m_outputStrides[i];
558  }
559  Index innermostLoc;
560  if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
561  eigen_assert(index < m_impl.dimensions()[NumDims-1]);
562  innermostLoc = index;
563  } else {
564  if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
565  eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
566  innermostLoc = 0;
567  } else {
568  innermostLoc = index % m_impl.dimensions()[NumDims-1];
569  }
570  }
571  inputIndex += innermostLoc;
572 
573  // Todo: this could be extended to the second dimension if we're not
574  // broadcasting along the first dimension, and so on.
575  if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
576  return m_impl.template packet<Unaligned>(inputIndex);
577  } else {
578  EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
579  values[0] = m_impl.coeff(inputIndex);
580  EIGEN_UNROLL_LOOP
581  for (int i = 1; i < PacketSize; ++i) {
582  if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
583  values[i] = m_impl.coeff(inputIndex+i);
584  } else {
585  values[i] = coeffRowMajor(originalIndex+i);
586  }
587  }
588  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
589  return rslt;
590  }
591  }
592 
593  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
594  costPerCoeff(bool vectorized) const {
595  double compute_cost = TensorOpCost::AddCost<Index>();
596  if (!isCopy && NumDims > 0) {
597  EIGEN_UNROLL_LOOP
598  for (int i = NumDims - 1; i > 0; --i) {
599  compute_cost += TensorOpCost::DivCost<Index>();
600  if (internal::index_statically_eq<Broadcast>(i, 1)) {
601  compute_cost +=
602  TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
603  } else {
604  if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
605  compute_cost += TensorOpCost::MulCost<Index>() +
606  TensorOpCost::ModCost<Index>() +
607  TensorOpCost::AddCost<Index>();
608  }
609  }
610  compute_cost +=
611  TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
612  }
613  }
614  return m_impl.costPerCoeff(vectorized) +
615  TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
616  }
617 
618  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements
619  getResourceRequirements() const {
620  // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
621  // tensors. But this might need further tuning.
622  const size_t target_size = m_device.firstLevelCacheSize();
623  return internal::TensorBlockResourceRequirements::merge(
624  m_impl.getResourceRequirements(),
625  internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
626  }
627 
628  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
629  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
630  bool /*root_of_expr_ast*/ = false) const {
631  BlockBroadcastingParams params = blockBroadcastingParams(desc);
632 
633  if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
634  return emptyBlock();
635  }
636 
637  // Prepare storage for the materialized broadcasting result.
638  const typename TensorBlock::Storage block_storage =
639  TensorBlock::prepareStorage(desc, scratch);
640  ScalarNoConst* materialized_output = block_storage.data();
641 
642  // We may need to materialize input blocks.
643  size_t materialized_input_size = 0;
644  ScalarNoConst* materialized_input = NULL;
645 
646  // Initialize the block broadcasting iterator state for the outer dimensions
647  // (outer with regard to the bcast dimension). Dimensions in this array are
648  // always in inner_most -> outer_most order (col major layout).
649  array<BlockBroadcastingIteratorState, NumDims> it;
650  int idx = 0;
651 
652  for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
653  const Index dim = IsColMajor ? i : NumDims - 1 - i;
654  it[idx].size = params.output_dims[dim];
655  it[idx].count = 0;
656  it[idx].output_stride = m_outputStrides[dim];
657  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
658  idx++;
659  }
660 
661  // Write output into the beginning of `materialized_output`.
662  Index output_offset = 0;
663 
664  // We will fill the output block by broadcasting along the bcast dim and
665  // iterating over the outer dimensions.
666  const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
667 
668  for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
669  ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
670  Index bcast_offset = desc.offset() + output_offset;
671 
672  // Broadcast along the bcast dimension.
673  num_output_coeffs += BroadcastBlockAlongBcastDim(
674  params, bcast_offset, scratch, bcast_output, &materialized_input,
675  &materialized_input_size);
676 
677  // Switch to the next outer dimension.
678  for (int j = 0; j < idx; ++j) {
679  if (++it[j].count < it[j].size) {
680  output_offset += it[j].output_stride;
681  break;
682  }
683  it[j].count = 0;
684  output_offset -= it[j].output_span;
685  }
686  }
687 
688  return block_storage.AsTensorMaterializedBlock();
689  }
690 
691  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
692 
693  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
694 
695  Broadcast functor() const { return m_broadcast; }
696 #ifdef EIGEN_USE_SYCL
697  // binding placeholder accessors to a command group handler for SYCL
698  EIGEN_STRONG_INLINE void bind(
699  cl::sycl::handler& cgh) const {
700  m_impl.bind(cgh);
701  }
702 #endif
703  private:
704  static const bool IsColMajor =
705  static_cast<int>(Layout) == static_cast<int>(ColMajor);
706 
707  // We build general-case block broadcasting on top of a broadcasting
708  // primitive that does broadcasting only for the inner dimension(s), up to
709  // the first dimension smaller than the input size (it's called `bcast_dim`).
710  //
711  // Example:
712  // dim: 0 1 2 (ColMajor)
713  // input size: [9, 3, 6]
714  // block size: [9, 2, 6]
715  //
716  // We compute the broadcasted block by iterating over the outer dimensions
717  // beyond `bcast_dim` (only dimension `2` in this example) and computing
718  // broadcasts along `bcast_dim` (dimension `1` in this example).
719 
720  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
721  // single block along the broadcasting dimension. Sizes and strides along the
722  // `bcast_dim` might be invalid; they will be adjusted later in
723  // `BroadcastBlockAlongBcastDim`.
724  struct BlockBroadcastingParams {
725  Dimensions input_dims; // input expression dimensions
726  Dimensions output_dims; // output block sizes
727  Dimensions output_strides; // output block strides
728 
729  int inner_dim_count; // count inner dimensions matching in size
730  int bcast_dim; // broadcasting dimension index
731  Index bcast_dim_size; // broadcasting dimension size
732  Index inner_dim_size; // inner dimensions size
733 
734  // Block sizes and strides for the input block where all dimensions before
735  // `bcast_dim` are equal to `1`.
736  Dimensions input_block_sizes;
737  Dimensions input_block_strides;
738 
739  // Block sizes and strides for blocks with extra dimensions and strides `0`.
740  BroadcastDimensions bcast_block_sizes;
741  BroadcastDimensions bcast_block_strides;
742  BroadcastDimensions bcast_input_strides;
743  };
744 
745  struct BlockBroadcastingIteratorState {
746  Index size;
747  Index count;
748  Index output_stride;
749  Index output_span;
750  };
751 
752  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
753  blockBroadcastingParams(TensorBlockDesc& desc) const {
754  BlockBroadcastingParams params;
755 
756  params.input_dims = Dimensions(m_impl.dimensions());
757 
758  // Output block sizes and strides.
759  params.output_dims = desc.dimensions();
760  params.output_strides = internal::strides<Layout>(params.output_dims);
761 
762  // Find the broadcasting dimension (first dimension with output size smaller
763  // than the input size).
764  params.bcast_dim = 0;
765  params.bcast_dim_size = 1;
766  params.inner_dim_size = 1;
767 
768  // Count the number of inner dimensions that have the same size in the block
769  // and in the broadcast expression.
770  params.inner_dim_count = 0;
771 
772  for (int i = 0; i < NumDims; ++i) {
773  const int dim = IsColMajor ? i : NumDims - i - 1;
774 
775  if (params.output_dims[dim] == m_dimensions[dim]) {
776  params.inner_dim_size *= params.output_dims[dim];
777  ++params.inner_dim_count;
778  continue;
779  }
780 
781  // First non-matching dimension is the broadcasting dimension.
782  eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
783  params.bcast_dim = dim;
784  params.bcast_dim_size = params.output_dims[dim];
785  break;
786  }
787 
788  // Calculate the input block sizes used to read values from the input.
789  for (int i = 0; i < params.inner_dim_count; ++i) {
790  const int dim = IsColMajor ? i : NumDims - i - 1;
791  params.input_block_sizes[dim] = params.input_dims[dim];
792  }
793  for (int i = params.inner_dim_count; i < NumDims; ++i) {
794  const int dim = IsColMajor ? i : NumDims - i - 1;
795  params.input_block_sizes[dim] = 1;
796  }
797  params.input_block_strides =
798  internal::strides<Layout>(params.input_block_sizes);
799 
800  // Broadcast with the 0-stride trick: Create 1 extra dim for each
801  // broadcast, set the input stride to 0.
802  //
803  // When ColMajor:
804  //
805  // - bcast_block_sizes:
806  // [d_0, b_0, d_1, b_1, ...]
807  //
808  // - bcast_block_strides:
809  // [output_block_strides[0], output_block_strides[0] * d_0,
810  // output_block_strides[1], output_block_strides[1] * d_1,
811  // ...]
812  //
813  // - bcast_input_strides:
814  // [input_block_strides[0], 0,
815  // input_block_strides[1], 0,
816  // ...].
817  //
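  // Illustrative example (made-up sizes): if the block covers the whole output
  // of broadcasting input_dims = [2, 3] by m_broadcast = [3, 4] (ColMajor), then
  // all dimensions are "inner", input_block_strides = [1, 2],
  // output_strides = [1, 6], and the loop below produces
  //   bcast_block_sizes   = [2, 3, 3, 4]
  //   bcast_block_strides = [1, 2, 6, 18]
  //   bcast_input_strides = [1, 0, 2, 0]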
818  for (int i = 0; i < params.inner_dim_count; ++i) {
819  const int dim = IsColMajor ? i : NumDims - i - 1;
820 
821  const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
822  const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
823 
824  params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
825  params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
826  params.bcast_block_strides[copy_dim] = params.output_strides[dim];
827  params.bcast_block_strides[broadcast_dim] =
828  params.output_strides[dim] * params.input_dims[dim];
829  params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
830  params.bcast_input_strides[broadcast_dim] = 0;
831  }
832 
833  for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
834  const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
835  params.bcast_block_sizes[dim] = 1;
836  params.bcast_block_strides[dim] = 0;
837  params.bcast_input_strides[dim] = 0;
838  }
839 
840  return params;
841  }
842 
843  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
844  DSizes<Index, NumDims> dimensions;
845  for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
846  return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
847  }
848 
849  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
850  BlockBroadcastingParams params, Index bcast_offset,
851  TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
852  ScalarNoConst** materialized_input,
853  size_t* materialized_input_size) const {
854  if (params.bcast_dim_size == 1) {
855  // We just need a single block read, using the values already set up above.
856  return BroadcastBlock(
857  params.input_block_sizes, params.input_block_strides,
858  params.bcast_block_sizes, params.bcast_block_strides,
859  params.bcast_input_strides, bcast_offset, 0, scratch,
860  materialized_output, materialized_input, materialized_input_size);
861 
862  } else if (params.input_dims[params.bcast_dim] == 1) {
863  // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
864  const int broadcast_bcast_dim =
865  IsColMajor ? 2 * params.inner_dim_count + 1
866  : 2 * NumDims - 2 * params.inner_dim_count - 2;
867 
868  params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
869  params.bcast_input_strides[broadcast_bcast_dim] = 0;
870  params.bcast_block_strides[broadcast_bcast_dim] =
871  params.output_strides[params.bcast_dim];
872 
873  return BroadcastBlock(
874  params.input_block_sizes, params.input_block_strides,
875  params.bcast_block_sizes, params.bcast_block_strides,
876  params.bcast_input_strides, bcast_offset, 0, scratch,
877  materialized_output, materialized_input, materialized_input_size);
878 
879  } else {
880  // Keep track of the total number of the coefficients written to the
881  // output block.
882  Index num_output_coeffs = 0;
883 
884  // The general case. Let's denote the output block as
885  //
886  // x[..., a:a+bcast_dim_size, :, ..., :]
887  //
888  // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
889  // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
890  // sub-blocks:
891  //
892  // (1) a:b, where b is the smallest multiple of
893  // input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
894  //
895  // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
896  // in [a, a+bcast_dim_size].
897  //
898  // (3) c:a+bcast_dim_size .
899  //
900  // Or, when b and c do not exist, we just need to process the whole block
901  // together.
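  // Numerical illustration (made-up values): with input_dims[bcast_dim] = 3,
  // a = bcast_dim_left_index = 4 and bcast_dim_size = 7, we get b = 6 and
  // c = 9, so the slice 4:11 is processed as the head 4:6, one full copy
  // broadcast over 6:9, and the tail 9:11.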
902 
903  // Find a.
904  const Index bcast_dim_left_index =
905  bcast_offset / m_outputStrides[params.bcast_dim];
906 
907  // Find b and c.
908  const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
909 
910  // First multiple after a. This is b when <= bcast_dim_left_index +
911  // bcast_dim_size.
912  const Index first_multiple =
913  divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
914  input_bcast_dim_size;
915 
916  if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
917  // b exists, so does c. Find it.
918  const Index last_multiple =
919  (bcast_dim_left_index + params.bcast_dim_size) /
920  input_bcast_dim_size * input_bcast_dim_size;
921  const int copy_bcast_dim =
922  IsColMajor ? 2 * params.inner_dim_count
923  : 2 * NumDims - 2 * params.inner_dim_count - 1;
924  const int broadcast_bcast_dim =
925  IsColMajor ? 2 * params.inner_dim_count + 1
926  : 2 * NumDims - 2 * params.inner_dim_count - 2;
927 
928  if (first_multiple > bcast_dim_left_index) {
929  const Index head_size = first_multiple - bcast_dim_left_index;
930  params.input_block_sizes[params.bcast_dim] = head_size;
931  params.bcast_block_sizes[copy_bcast_dim] = head_size;
932  params.bcast_input_strides[copy_bcast_dim] =
933  params.input_block_strides[params.bcast_dim];
934  params.bcast_block_strides[copy_bcast_dim] =
935  params.output_strides[params.bcast_dim];
936  params.bcast_block_sizes[broadcast_bcast_dim] = 1;
937  params.bcast_input_strides[broadcast_bcast_dim] = 0;
938  params.bcast_block_strides[broadcast_bcast_dim] =
939  params.output_strides[params.bcast_dim] *
940  params.input_dims[params.bcast_dim];
941 
942  num_output_coeffs += BroadcastBlock(
943  params.input_block_sizes, params.input_block_strides,
944  params.bcast_block_sizes, params.bcast_block_strides,
945  params.bcast_input_strides, bcast_offset, 0, scratch,
946  materialized_output, materialized_input, materialized_input_size);
947  }
948  if (first_multiple < last_multiple) {
949  params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
950  params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
951  params.bcast_input_strides[copy_bcast_dim] =
952  params.input_block_strides[params.bcast_dim];
953  params.bcast_block_strides[copy_bcast_dim] =
954  params.output_strides[params.bcast_dim];
955  params.bcast_block_sizes[broadcast_bcast_dim] =
956  (last_multiple - first_multiple) / input_bcast_dim_size;
957  params.bcast_input_strides[broadcast_bcast_dim] = 0;
958  params.bcast_block_strides[broadcast_bcast_dim] =
959  params.output_strides[params.bcast_dim] *
960  params.input_dims[params.bcast_dim];
961  const Index offset = (first_multiple - bcast_dim_left_index) *
962  m_outputStrides[params.bcast_dim];
963 
964  num_output_coeffs += BroadcastBlock(
965  params.input_block_sizes, params.input_block_strides,
966  params.bcast_block_sizes, params.bcast_block_strides,
967  params.bcast_input_strides, bcast_offset, offset, scratch,
968  materialized_output, materialized_input, materialized_input_size);
969  }
970  if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
971  const Index tail_size =
972  bcast_dim_left_index + params.bcast_dim_size - last_multiple;
973  params.input_block_sizes[params.bcast_dim] = tail_size;
974  params.bcast_block_sizes[copy_bcast_dim] = tail_size;
975  params.bcast_input_strides[copy_bcast_dim] =
976  params.input_block_strides[params.bcast_dim];
977  params.bcast_block_strides[copy_bcast_dim] =
978  params.output_strides[params.bcast_dim];
979  params.bcast_block_sizes[broadcast_bcast_dim] = 1;
980  params.bcast_input_strides[broadcast_bcast_dim] = 0;
981  params.bcast_block_strides[broadcast_bcast_dim] =
982  params.output_strides[params.bcast_dim] *
983  params.input_dims[params.bcast_dim];
984  const Index offset = (last_multiple - bcast_dim_left_index) *
985  m_outputStrides[params.bcast_dim];
986 
987  num_output_coeffs += BroadcastBlock(
988  params.input_block_sizes, params.input_block_strides,
989  params.bcast_block_sizes, params.bcast_block_strides,
990  params.bcast_input_strides, bcast_offset, offset, scratch,
991  materialized_output, materialized_input, materialized_input_size);
992  }
993  } else {
994  // b and c do not exist.
995  const int copy_bcast_dim =
996  IsColMajor ? 2 * params.inner_dim_count
997  : 2 * NumDims - 2 * params.inner_dim_count - 1;
998  params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
999  params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
1000  params.bcast_input_strides[copy_bcast_dim] =
1001  params.input_block_strides[params.bcast_dim];
1002  params.bcast_block_strides[copy_bcast_dim] =
1003  params.output_strides[params.bcast_dim];
1004 
1005  num_output_coeffs += BroadcastBlock(
1006  params.input_block_sizes, params.input_block_strides,
1007  params.bcast_block_sizes, params.bcast_block_strides,
1008  params.bcast_input_strides, bcast_offset, 0, scratch,
1009  materialized_output, materialized_input, materialized_input_size);
1010  }
1011 
1012  return num_output_coeffs;
1013  }
1014  }
1015 
1016  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
1017  const Dimensions& input_block_sizes,
1018  const Dimensions& input_block_strides,
1019  const BroadcastDimensions& bcast_block_sizes,
1020  const BroadcastDimensions& bcast_block_strides,
1021  const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
1022  Index offset, TensorBlockScratch& scratch,
1023  ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
1024  size_t* materialized_input_size) const {
1025  // ---------------------------------------------------------------------- //
1026  // Tensor block descriptor for reading a block from the input.
1027  const Index input_offset = bcast_offset + offset;
1028  TensorBlockDesc input_desc(
1029  IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
1030  input_block_sizes);
1031 
1032  ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
1033 
1034  // ---------------------------------------------------------------------- //
1035  // Materialize input block into a temporary memory buffer only if it's not
1036  // already available in the arg block.
1037  const ScalarNoConst* input_buffer = NULL;
1038 
1039  if (input_block.data() != NULL) {
1040  // The input block already has raw data, so there is no need to materialize it.
1041  input_buffer = input_block.data();
1042 
1043  } else {
1044  // Otherwise we have to do block assignment into a temporary buffer.
1045 
1046  // Maybe reuse a previously allocated buffer, or allocate a new one with the
1047  // scratch allocator.
1048  const size_t input_total_size = input_block_sizes.TotalSize();
1049  if (*materialized_input == NULL ||
1050  *materialized_input_size < input_total_size) {
1051  *materialized_input_size = input_total_size;
1052  void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
1053  *materialized_input = static_cast<ScalarNoConst*>(mem);
1054  }
1055 
1056  typedef internal::TensorBlockAssignment<
1057  ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
1058  TensorBlockAssignment;
1059 
1060  TensorBlockAssignment::Run(
1061  TensorBlockAssignment::target(input_block_sizes, input_block_strides,
1062  *materialized_input),
1063  input_block.expr());
1064 
1065  input_buffer = *materialized_input;
1066  }
1067 
1068  // ---------------------------------------------------------------------- //
1069  // Copy data from the materialized input block to the materialized output,
1070  // using the given broadcast strides (strides with zeroes).
1071  typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
1072  TensorBlockIO;
1073 
1074  typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
1075  typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
1076  materialized_output + offset);
1077 
1078  return TensorBlockIO::Copy(dst, src);
1079  }
1080 
1081 protected:
1082  const Device EIGEN_DEVICE_REF m_device;
1083  const typename internal::remove_reference<Broadcast>::type m_broadcast;
1084  Dimensions m_dimensions;
1085  array<Index, NumDims> m_outputStrides;
1086  array<Index, NumDims> m_inputStrides;
1087  TensorEvaluator<ArgType, Device> m_impl;
1088 };
1089 
1090 
1091 } // end namespace Eigen
1092 
1093 #endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H