#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H

namespace Eigen {

// Pick the thread pool implementation: non-blocking unless the simple pool
// is explicitly requested.
#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <typename Env>
using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
typedef NonBlockingThreadPool ThreadPool;
#else
template <typename Env>
using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
typedef SimpleThreadPool ThreadPool;
#endif
// Barrier lets one or more threads wait until Notify() has been called a
// given number of times. state_ stores 2 * remaining-count; the low bit
// records whether a waiter is blocked.
class Barrier {
 public:
  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
    eigen_assert(((count << 1) >> 1) == count);  // guard against overflow
  }
  void Notify() {
    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
    if (v != 1) return;  // count not yet zero, or no waiter registered
    std::unique_lock<std::mutex> l(mu_);
    eigen_assert(!notified_);
    notified_ = true;
    cv_.notify_all();
  }
  void Wait() {
    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
    if ((v >> 1) == 0) return;  // all notifications already arrived
    std::unique_lock<std::mutex> l(mu_);
    while (!notified_) cv_.wait(l);
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::atomic<unsigned int> state_;  // low bit: waiter flag
  bool notified_;
};
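// Usage sketch (illustrative only, not part of the original header; assumes
// a running Eigen::ThreadPool named `pool`): wait for three scheduled tasks.
//
//   Eigen::Barrier barrier(3);
//   for (int i = 0; i < 3; ++i) {
//     pool.Schedule([&barrier]() { /* ...work... */ barrier.Notify(); });
//   }
//   barrier.Wait();  // returns once Notify() has been called three times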
// Notification is a one-shot Barrier: one thread calls Notify() exactly
// once and any number of threads may Wait() on it.
struct Notification : Barrier {
  Notification() : Barrier(1) {}
};
// Runs an arbitrary function and then calls Notify() on the passed-in
// Notification.
template <typename Function, typename... Args>
struct FunctionWrapperWithNotification {
  static void run(Notification* n, Function f, Args... args) {
    f(args...);
    if (n) {
      n->Notify();
    }
  }
};
// Same as above, but signals a caller-owned Barrier instead.
template <typename Function, typename... Args>
struct FunctionWrapperWithBarrier {
  static void run(Barrier* b, Function f, Args... args) {
    f(args...);
    if (b) {
      b->Notify();
    }
  }
};
// Blocks until the given synchronization object (a Barrier or Notification)
// has been signaled; a null pointer is treated as already ready.
template <typename SyncType>
static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
  if (n) n->Wait();
}
// Build a thread pool device on top of an existing pool of threads.
struct ThreadPoolDevice {
  // The ownership of the thread pool remains with the caller.
  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores)
      : pool_(pool), num_threads_(num_cores) {}

  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { ::memcpy(dst, src, n); }
  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); }

  EIGEN_STRONG_INLINE int numThreads() const { return num_threads_; }
  // Schedules f(args...) on the pool; the returned Notification is signaled
  // when the task completes and is owned by the caller.
  template <class Function, class... Args>
  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
    Notification* n = new Notification();
    pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
    return n;
  }
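  // Usage sketch (illustrative only; `device` is an assumed ThreadPoolDevice):
  // the returned Notification belongs to the caller, who should wait on it
  // and delete it.
  //
  //   Eigen::Notification* done = device.enqueue([]() { /* async work */ });
  //   wait_until_ready(done);  // blocks until the task has executed
  //   delete done;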
  template <class Function, class... Args>
  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
                                                Args&&... args) const {
    pool_->Schedule(std::bind(
        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
  }
  template <class Function, class... Args>
  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
    pool_->Schedule(std::bind(f, args...));
  }
  // Returns a logical thread index in [0, pool_->NumThreads()) when called
  // from a pool thread, and -1 otherwise.
  EIGEN_STRONG_INLINE int currentThreadId() const { return pool_->CurrentThreadId(); }
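  // Usage sketch (illustrative only; assumes the lambda runs on pool threads):
  // the thread id can index per-thread scratch storage without locking.
  //
  //   std::vector<double> partial(device.numThreads(), 0.0);
  //   device.enqueueNoNotification([&device, &partial]() {
  //     partial[device.currentThreadId()] += 1.0;
  //   });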
  // parallelFor executes f over [0, n) in parallel and waits for completion.
  // f accepts a half-open interval [first, last); if block_align is non-null
  // it is called to round the chosen block size up.
  void parallelFor(Index n, const TensorOpCost& cost,
                   std::function<Index(Index)> block_align,
                   std::function<void(Index, Index)> f) const {
    typedef TensorCostModel<ThreadPoolDevice> CostModel;
    if (n <= 1 || numThreads() == 1 ||
        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
      f(0, n);  // not worth parallelizing: run inline
      return;
    }
    // Choose a block size from (1) the iteration cost and (2) parallel
    // efficiency: large enough to amortize scheduling overhead, small enough
    // to limit tail effects, with a count that divides evenly across threads.
    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
    const Index max_oversharding_factor = 4;
    Index block_size = numext::mini(
        n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
                               block_size_f));
    const Index max_block_size = numext::mini(n, 2 * block_size);
    if (block_align) {
      Index new_block_size = block_align(block_size);
      eigen_assert(new_block_size >= block_size);
      block_size = numext::mini(n, new_block_size);
    }
    Index block_count = divup(n, block_size);
    // Parallel efficiency: fraction of the reserved CPU time spent computing.
    double max_efficiency =
        static_cast<double>(block_count) /
        (divup<int>(block_count, numThreads()) * numThreads());
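    // Worked example (assumed numbers, for illustration only): with
    // block_count = 9 and numThreads() = 4, divup(9, 4) * 4 = 12 thread-slots
    // are reserved but only 9 blocks do work, so efficiency = 9 / 12 = 0.75.
    // Coarsening to 8 blocks gives 8 / (2 * 4) = 1.0, which the loop below
    // will prefer.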
    // Now try to increase the block size up to max_block_size as long as it
    // doesn't decrease parallel efficiency.
    for (Index prev_block_count = block_count;
         max_efficiency < 1.0 && prev_block_count > 1;) {
      // The next block size that yields fewer blocks than prev_block_count.
      Index coarser_block_size = divup(n, prev_block_count - 1);
      if (block_align) {
        Index new_block_size = block_align(coarser_block_size);
        eigen_assert(new_block_size >= coarser_block_size);
        coarser_block_size = numext::mini(n, new_block_size);
      }
      if (coarser_block_size > max_block_size) {
        break;  // reached the maximum allowed block size
      }
      // Recalculate parallel efficiency for the coarser partition.
      const Index coarser_block_count = divup(n, coarser_block_size);
      eigen_assert(coarser_block_count < prev_block_count);
      prev_block_count = coarser_block_count;
      const double coarser_efficiency =
          static_cast<double>(coarser_block_count) /
          (divup<int>(coarser_block_count, numThreads()) * numThreads());
      if (coarser_efficiency + 0.01 >= max_efficiency) {
        // Taking it.
        block_size = coarser_block_size;
        block_count = coarser_block_count;
        if (max_efficiency < coarser_efficiency) {
          max_efficiency = coarser_efficiency;
        }
      }
    }
    // Recursively split [0, n) in halves until ranges of at most block_size
    // remain; rounding mid to a multiple of block_size guarantees exactly
    // block_count leaf tasks.
    Barrier barrier(static_cast<unsigned int>(block_count));
    std::function<void(Index, Index)> handleRange;
    handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
      if (last - first <= block_size) {
        // A single block: do the actual work and signal completion.
        f(first, last);
        barrier.Notify();
        return;
      }
      // Split into two halves and schedule both sub-ranges on the pool.
      Index mid = first + divup((last - first) / 2, block_size) * block_size;
      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
    };
    handleRange(0, n);
    barrier.Wait();
  }
  // Convenience overload that does not align blocks.
  void parallelFor(Index n, const TensorOpCost& cost,
                   std::function<void(Index, Index)> f) const {
    parallelFor(n, cost, nullptr, std::move(f));
  }
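  // Usage sketch (illustrative only, not part of the original header; the
  // TensorOpCost values are assumptions): double every element of a buffer.
  //
  //   Eigen::ThreadPool pool(4);
  //   Eigen::ThreadPoolDevice device(&pool, 4);
  //   float data[4096] = {};
  //   device.parallelFor(
  //       4096, Eigen::TensorOpCost(sizeof(float), sizeof(float), 1),
  //       [&data](Eigen::Index first, Eigen::Index last) {
  //         for (Eigen::Index i = first; i < last; ++i) data[i] *= 2.0f;
  //       });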
 private:
  ThreadPoolInterface* pool_;
  int num_threads_;
};

}  // end namespace Eigen
#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H