10 #define EIGEN_USE_THREADS 
   14 #include <Eigen/CXX11/Tensor> 
   26 template <
typename Dst, 
typename Expr>
 
   39           typename Dst, 
typename Expr>
 
   43                                                    Vectorizable, Tiling>;
 
   48 template <
int NumDims>
 
   51   for (
int i = 0; 
i < NumDims; ++
i) {
 
   52     dims[
i] = internal::random<int>(min_dim, max_dim);
 
   57 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
   61   static constexpr 
int Options = 0 | Layout;
 
   65   auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
 
   71   const auto expr = src.square();
 
   73   using Assign = 
TensorAssignOp<decltype(dst), 
const decltype(expr)>;
 
   75       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
 
   85 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
   89   static constexpr 
int Options = 0 | Layout;
 
   93   auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
 
  102   const auto expr = lhs + rhs;
 
  104   using Assign = 
TensorAssignOp<decltype(dst), 
const decltype(expr)>;
 
  106       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
 
  111     T sum = lhs.
coeff(
i) + rhs.coeff(
i);
 
  116 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  120   static constexpr 
int Options = 0 | Layout;
 
  122   auto dims = RandomDims<NumDims>(1, 10);
 
  126   const auto broadcasts = RandomDims<NumDims>(1, 7);
 
  127   const auto expr = src.broadcast(broadcasts);
 
  137   using Assign = 
TensorAssignOp<decltype(dst), 
const decltype(expr)>;
 
  139       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
 
  148 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  152   auto dims = RandomDims<NumDims>(1, 10);
 
  156 #define TEST_CHIPPING(CHIP_DIM)                                           \ 
  157   if (NumDims > (CHIP_DIM)) {                                             \ 
  158     const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \ 
  159     const auto expr = src.template chip<(CHIP_DIM)>(offset);              \ 
  161     Tensor<T, NumDims - 1, Layout, Index> golden;                         \ 
  164     Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions());       \ 
  166     using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;   \ 
  167     using Executor = internal::TensorExecutor<const Assign, Device,       \ 
  168                                               Vectorizable, Tiling>;      \ 
  170     Executor::run(Assign(dst, expr), d);                                  \ 
  172     for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {            \ 
  173       VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                     \ 
  187 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  191   auto dims = RandomDims<NumDims>(1, 10);
 
  193 #define TEST_CHIPPING(CHIP_DIM)                                             \ 
  194   if (NumDims > (CHIP_DIM)) {                                               \ 
  196     array<Index, NumDims - 1> src_dims;                                     \ 
  197     for (int i = 0; i < NumDims - 1; ++i) {                                 \ 
  198       int dim = i < (CHIP_DIM) ? i : i + 1;                                 \ 
  199       src_dims[i] = dims[dim];                                              \ 
  202     Tensor<T, NumDims - 1, Layout, Index> src(src_dims);                    \ 
  205     const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1);   \ 
  207     Tensor<T, NumDims, Layout, Index> random(dims);                         \ 
  210     Tensor<T, NumDims, Layout, Index> golden(dims);                         \ 
  212     golden.template chip<(CHIP_DIM)>(offset) = src;                         \ 
  214     Tensor<T, NumDims, Layout, Index> dst(dims);                            \ 
  216     auto expr = dst.template chip<(CHIP_DIM)>(offset);                      \ 
  218     using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;     \ 
  219     using Executor = internal::TensorExecutor<const Assign, Device,         \ 
  220                                               Vectorizable, Tiling>;        \ 
  222     Executor::run(Assign(expr, src), d);                                    \ 
  224     for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {              \ 
  225       VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                       \ 
  239 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  243   static constexpr 
int Options = 0 | Layout;
 
  245   auto dims = RandomDims<NumDims>(1, 10);
 
  250   for (
int i = 0; 
i < NumDims; ++
i) shuffle[
i] = 
i;
 
  255     for (
int i = 0; 
i < NumDims; ++
i) {
 
  256       shuffled_dims[
i] = dims[shuffle[
i]];
 
  259     const auto expr = src.
shuffle(shuffle);
 
  268     DeviceAssign<Vectorizable, Tiling>(
d, dst, expr);
 
  274   } 
while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
 
  277 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  281   static constexpr 
int Options = 0 | Layout;
 
  283   auto dims = RandomDims<NumDims>(5, 10);
 
  288   for (
int i = 0; 
i < NumDims; ++
i) shuffle[
i] = 
i;
 
  293     for (
int i = 0; 
i < NumDims; ++
i) shuffled_dims[shuffle[
i]] = dims[
i];
 
  298     auto golden_shuffle = golden.
shuffle(shuffle);
 
  303     auto dst_shuffle = dst.
shuffle(shuffle);
 
  304     DeviceAssign<Vectorizable, Tiling>(
d, dst_shuffle, src);
 
  310   } 
while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
 
  313 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  317   static_assert(NumDims >= 2, 
"NumDims must be greater or equal than 2");
 
  319   static constexpr 
int ReshapedDims = NumDims - 1;
 
  320   static constexpr 
int Options = 0 | Layout;
 
  322   auto dims = RandomDims<NumDims>(5, 10);
 
  327   std::vector<Index> shuffle;
 
  328   for (
int i = 0; 
i < ReshapedDims; ++
i) shuffle.push_back(
i);
 
  329   std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
 
  332   reshaped_dims[shuffle[0]] = dims[0] * dims[1];
 
  333   for (
int i = 1; 
i < ReshapedDims; ++
i) reshaped_dims[shuffle[
i]] = dims[
i + 1];
 
  340   auto expr = src.
reshape(reshaped_dims);
 
  342   using Assign = 
TensorAssignOp<decltype(dst), 
const decltype(expr)>;
 
  344       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
 
  353 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  357   static_assert(NumDims >= 2, 
"NumDims must be greater or equal than 2");
 
  358   static constexpr 
int Options = 0 | Layout;
 
  360   auto dims = RandomDims<NumDims>(5, 10);
 
  369   for (
int i = 0; 
i < NumDims; ++
i) {
 
  375       src.
slice(slice_start, slice_size);
 
  380   auto expr = src.
slice(slice_start, slice_size);
 
  382   using Assign = 
TensorAssignOp<decltype(dst), 
const decltype(expr)>;
 
  384       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
 
  393 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  397   static_assert(NumDims >= 2, 
"NumDims must be greater or equal than 2");
 
  398   static constexpr 
int Options = 0 | Layout;
 
  400   auto dims = RandomDims<NumDims>(5, 10);
 
  409   for (
int i = 0; 
i < NumDims; ++
i) {
 
  419   golden.
slice(slice_start, slice_size) = 
slice;
 
  423   auto expr = dst.
slice(slice_start, slice_size);
 
  427       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
 
  436 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  440   static constexpr 
int Options = 0 | Layout;
 
  442   auto dims = RandomDims<NumDims>(1, 10);
 
  446   const auto broadcasts = RandomDims<NumDims>(1, 7);
 
  447   const auto expr = src.square().eval().broadcast(broadcasts);
 
  457   using Assign = 
TensorAssignOp<decltype(dst), 
const decltype(expr)>;
 
  459       internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
 
  468 template<
typename T, 
int NumDims>
 
  473     for (
int i = 0; 
i < NumDims; ++
i) {
 
  474       result += 
static_cast<T>((
i + 1) * dims[
i]);
 
  480 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  484   static constexpr 
int Options = 0 | Layout;
 
  486   auto dims = RandomDims<NumDims>(20, 30);
 
  500   using Assign = 
TensorAssignOp<decltype(dst), 
const decltype(expr)>;
 
  502     internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
 
  511 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  515   static constexpr 
int Options = 0 | Layout;
 
  517   auto dims = RandomDims<NumDims>(1, 
numext::pow(1000000.0, 1.0 / NumDims));
 
  523   for (
int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();
 
  535   using Assign = 
TensorAssignOp<decltype(dst), 
const decltype(expr)>;
 
  537     internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
 
  546 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  550   static constexpr 
int Options = 0 | Layout;
 
  554   auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
 
  560   const auto expr = src.square();
 
  563   auto on_done = [&done]() { done.
Notify(); };
 
  565   using Assign = 
TensorAssignOp<decltype(dst), 
const decltype(expr)>;
 
  566   using DoneCallback = decltype(on_done);
 
  567   using Executor = internal::TensorAsyncExecutor<
const Assign, Device, DoneCallback,
 
  568                                                  Vectorizable, Tiling>;
 
  570   Executor::runAsync(Assign(dst, expr), 
d, on_done);
 
  579 template <
typename T, 
int NumDims, 
typename Device, 
bool Vectorizable,
 
  583   static constexpr 
int Options = 0 | Layout;
 
  587   auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
 
  596   const auto expr = lhs + rhs;
 
  599   auto on_done = [&done]() { done.
Notify(); };
 
  601   using Assign = 
TensorAssignOp<decltype(dst), 
const decltype(expr)>;
 
  602   using DoneCallback = decltype(on_done);
 
  603   using Executor = internal::TensorAsyncExecutor<
const Assign, Device, DoneCallback,
 
  604                                                  Vectorizable, Tiling>;
 
  606   Executor::runAsync(Assign(dst, expr), 
d, on_done);
 
  610     T sum = lhs.
coeff(
i) + rhs.coeff(
i);
 
  615 #ifdef EIGEN_DONT_VECTORIZE 
  616 #define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL 
  618 #define VECTORIZABLE(VAL) VAL 
  621 #define CALL_SUBTEST_PART(PART) \ 
  624 #define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                                 \ 
  625   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off,     ColMajor>(default_device))); \ 
  626   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  ColMajor>(default_device)));     \ 
  627   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(default_device))); \ 
  628   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(default_device)));     \ 
  629   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off,     RowMajor>(default_device))); \ 
  630   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  RowMajor>(default_device)));     \ 
  631   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(default_device))); \ 
  632   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(default_device)));     \ 
  633   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     ColMajor>(tp_device)));      \ 
  634   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));          \ 
  635   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(tp_device)));      \ 
  636   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));          \ 
  637   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     RowMajor>(tp_device)));      \ 
  638   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));          \ 
  639   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(tp_device)));      \ 
  640   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device))) 
  643 #define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                      \ 
  644   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     ColMajor>(tp_device))); \ 
  645   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));     \ 
  646   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(tp_device))); \ 
  647   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));     \ 
  648   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     RowMajor>(tp_device))); \ 
  649   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));     \ 
  650   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(tp_device))); \ 
  651   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device))) 
  658   const auto num_threads = internal::random<int>(20, 24);
 
  660   Eigen::ThreadPoolDevice tp_device(&tp, num_threads);