#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
// Use the non-blocking thread pool unless the simple (blocking) pool is
// explicitly requested via EIGEN_USE_SIMPLE_THREAD_POOL.
#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <typename Env>
using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
#else
template <typename Env>
using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
#endif
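
// A minimal usage sketch (assuming Eigen's default StlThreadEnvironment;
// illustrative only):
//   ThreadPoolTempl<StlThreadEnvironment> pool(num_threads);
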
// Barrier blocks callers of Wait() until Notify() has been called the
// number of times given at construction. state_ packs the remaining
// notification count in its high bits and a "waiter present" flag in the
// low bit.
class Barrier {
 public:
  Barrier(unsigned int count) : state_(count << 1), notified_(false) {}

  void Notify() {
    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
    if (v != 1) return;  // count not yet zero, or no waiter registered
    std::unique_lock<std::mutex> l(mu_);
    notified_ = true;
    cv_.notify_all();
  }

  void Wait() {
    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
    if ((v >> 1) == 0) return;  // all notifications already delivered
    std::unique_lock<std::mutex> l(mu_);
    while (!notified_) cv_.wait(l);
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::atomic<unsigned int> state_;  // low bit is waiter flag
  bool notified_;
};
 
// Notification is a Barrier with a count of one: a single Notify() call
// releases all waiters.
struct Notification : Barrier {
  Notification() : Barrier(1) {}
};
 
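// Usage sketch (illustrative; `pool` is any ThreadPoolInterface
// implementation, work_a/work_b are placeholder tasks):
//   Barrier barrier(2);
//   pool->Schedule([&]() { work_a(); barrier.Notify(); });
//   pool->Schedule([&]() { work_b(); barrier.Notify(); });
//   barrier.Wait();  // returns once both tasks have notified
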
// Runs an arbitrary function and then calls Notify() on the passed-in
// Notification.
template <typename Function, typename... Args>
struct FunctionWrapperWithNotification {
  static void run(Notification* n, Function f, Args... args) {
    f(args...);
    if (n) {
      n->Notify();
    }
  }
};
// Runs an arbitrary function and then calls Notify() on the passed-in
// Barrier.
template <typename Function, typename... Args>
struct FunctionWrapperWithBarrier {
  static void run(Barrier* b, Function f, Args... args) {
    f(args...);
    if (b) {
      b->Notify();
    }
  }
};
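
// These wrappers are the targets bound by ThreadPoolDevice::enqueue() and
// enqueue_with_barrier() below.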
 
// Blocks until the sync object (Notification or Barrier) is signaled;
// a null pointer means there is nothing to wait for.
template <typename SyncType>
static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
  if (n) n->Wait();
}

// Build a thread-pool device on top of an existing pool of threads.
struct ThreadPoolDevice {
  // The ownership of the thread pool remains with the caller.
  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores)
      : pool_(pool), num_threads_(num_cores) {}

  EIGEN_STRONG_INLINE int numThreads() const { return num_threads_; }
 
  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
    ::memcpy(dst, src, n);
  }
 
  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
    ::memset(buffer, c, n);
  }
 
  // Runs f(args...) asynchronously and returns a heap-allocated Notification
  // that fires on completion; the caller owns (and must delete) it.
  template <class Function, class... Args>
  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
    Notification* n = new Notification();
    pool_->Schedule(std::bind(
        &FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
    return n;
  }
 
  // Runs f(args...) asynchronously and notifies the caller-owned Barrier on
  // completion.
  template <class Function, class... Args>
  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
                                                Args&&... args) const {
    pool_->Schedule(std::bind(
        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
  }
 
  // Fire-and-forget variant: no completion signal is delivered.
  template <class Function, class... Args>
  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
    pool_->Schedule(std::bind(f, args...));
  }
 
  // Returns a logical thread index in [0, numThreads()) when called from a
  // pool thread, and -1 otherwise.
  EIGEN_STRONG_INLINE int currentThreadId() const {
    return pool_->CurrentThreadId();
  }
 
  // parallelFor executes f over [0, n) in parallel and waits for completion;
  // f accepts a half-open interval [first, last). The block size is chosen
  // from the per-iteration cost and the resulting parallel efficiency; a
  // non-null block_align is used to round the block size up.
  void parallelFor(Index n, const TensorOpCost& cost,
                   std::function<Index(Index)> block_align,
                   std::function<void(Index, Index)> f) const {
 
    typedef TensorCostModel<ThreadPoolDevice> CostModel;
 
    // Run sequentially if the problem is small or the cost model says a
    // single thread suffices.
    if (n <= 1 || numThreads() == 1 ||
        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
      f(0, n);
      return;
    }
 
    // Choose a block size so that one block costs about one standard task.
    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
 
    // Cap the number of blocks at max_oversharding_factor * numThreads().
    const Index max_oversharding_factor = 4;
    Index block_size = numext::mini(
        n, numext::maxi<Index>(
               divup<Index>(n, max_oversharding_factor * numThreads()),
               block_size_f));
    const Index max_block_size = numext::mini(n, 2 * block_size);
 
    if (block_align) {
      Index new_block_size = block_align(block_size);
      eigen_assert(new_block_size >= block_size);
      block_size = numext::mini(n, new_block_size);
    }
    Index block_count = divup(n, block_size);
 
    // Parallel efficiency: the fraction of total worker time spent computing
    // (blocks that don't fill the last scheduling round leave threads idle).
    double max_efficiency =
        static_cast<double>(block_count) /
        (divup<int>(block_count, numThreads()) * numThreads());
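    // For example, 10 blocks on 4 threads yield 10 / (divup(10, 4) * 4) =
    // 10 / 12 ~= 0.83: the last scheduling round is only partially filled.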
 
    // Now try to increase block size up to max_block_size as long as it
    // doesn't decrease parallel efficiency.
    for (Index prev_block_count = block_count;
         max_efficiency < 1.0 && prev_block_count > 1;) {
 
      // The next block size that divides n into strictly fewer blocks.
      Index coarser_block_size = divup(n, prev_block_count - 1);
 
      if (block_align) {
        Index new_block_size = block_align(coarser_block_size);
        eigen_assert(new_block_size >= coarser_block_size);
        coarser_block_size = numext::mini(n, new_block_size);
      }
 
      if (coarser_block_size > max_block_size) {
        break;  // reached the maximal block size
      }
 
      // Recalculate parallel efficiency for the coarser blocking.
      const Index coarser_block_count = divup(n, coarser_block_size);
 
      prev_block_count = coarser_block_count;
 
      const double coarser_efficiency =
          static_cast<double>(coarser_block_count) /
          (divup<int>(coarser_block_count, numThreads()) * numThreads());
 
      // Accept the coarser blocking if it costs at most 1% of efficiency;
      // fewer, larger blocks mean less scheduling overhead.
      if (coarser_efficiency + 0.01 >= max_efficiency) {
        block_size = coarser_block_size;
        block_count = coarser_block_count;
        if (max_efficiency < coarser_efficiency) {
          max_efficiency = coarser_efficiency;
        }
      }
    }
 
    // Recursively split [0, n) into halves until ranges are at most
    // block_size long; the barrier expects one Notify() per leaf block.
    Barrier barrier(static_cast<unsigned int>(block_count));
    std::function<void(Index, Index)> handleRange;
 
    handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
 
      if (last - first <= block_size) {
        f(first, last);  // single block or less: execute directly
        barrier.Notify();
        return;
      }
 
      // Split into two halves, rounding mid up to a multiple of block_size so
      // that exactly block_count leaves end up calling f.
      Index mid = first + divup((last - first) / 2, block_size) * block_size;
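      // E.g. with last - first = 100 and block_size = 16:
      // mid = first + divup(50, 16) * 16 = first + 64.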
 
      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
 
      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
    };
    handleRange(0, n);
    barrier.Wait();
  }
 
  // Convenience overload without block alignment.
  void parallelFor(Index n, const TensorOpCost& cost,
                   std::function<void(Index, Index)> f) const {
    parallelFor(n, cost, nullptr, std::move(f));
  }
 
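  // Usage sketch (illustrative; `pool` is any ThreadPoolInterface
  // implementation and src/dst are caller-owned arrays of length n):
  //   ThreadPoolDevice device(&pool, /*num_cores=*/4);
  //   TensorOpCost cost(/*bytes_loaded=*/sizeof(float),
  //                     /*bytes_stored=*/sizeof(float), /*compute_cycles=*/1);
  //   device.parallelFor(n, cost, [&](Index first, Index last) {
  //     for (Index i = first; i < last; ++i) dst[i] = 2.f * src[i];
  //   });
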
 private:
  ThreadPoolInterface* pool_;
  int num_threads_;
};
 
#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H