atomicops.h
// ©2013-2016 Cameron Desrochers.
// Distributed under the simplified BSD license (see the license file that
// should have come with this header).
// Uses Jeff Preshing's semaphore implementation (under the terms of its
// separate zlib license, embedded below).

#pragma once

// Provides a portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant)
// implementation of low-level memory barriers, plus a few semi-portable utility macros
// (for inlining and alignment). Also has a basic atomic type (limited to hardware-supported
// atomics with no memory ordering guarantees). Uses the AE_* prefix for macros (historical
// reasons), and the "moodycamel" namespace for symbols.

#include <cassert>
#include <cerrno>
#include <cstdint>
#include <ctime>
#include <type_traits>

// Platform detection
#if defined(__INTEL_COMPILER)
#define AE_ICC
#elif defined(_MSC_VER)
#define AE_VCPP
#elif defined(__GNUC__)
#define AE_GCC
#endif

#if defined(_M_IA64) || defined(__ia64__)
#define AE_ARCH_IA64
#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__)
#define AE_ARCH_X64
#elif defined(_M_IX86) || defined(__i386__)
#define AE_ARCH_X86
#elif defined(_M_PPC) || defined(__powerpc__)
#define AE_ARCH_PPC
#else
#define AE_ARCH_UNKNOWN
#endif

// AE_UNUSED
#define AE_UNUSED(x) ((void)x)

// AE_FORCEINLINE
#if defined(AE_VCPP) || defined(AE_ICC)
#define AE_FORCEINLINE __forceinline
#elif defined(AE_GCC)
//#define AE_FORCEINLINE __attribute__((always_inline))
#define AE_FORCEINLINE inline
#else
#define AE_FORCEINLINE inline
#endif

// AE_ALIGN
#if defined(AE_VCPP) || defined(AE_ICC)
#define AE_ALIGN(x) __declspec(align(x))
#elif defined(AE_GCC)
#define AE_ALIGN(x) __attribute__((aligned(x)))
#else
// Assume GCC compliant syntax...
#define AE_ALIGN(x) __attribute__((aligned(x)))
#endif

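// Illustrative usage sketch (not part of the original header) for the utility
// macros above. `CacheLinePadded` and `add_one` are hypothetical names, and 64
// is an assumed cache-line size.
#if 0 // example only -- not compiled as part of this header
struct AE_ALIGN(64) CacheLinePadded
{
  int value;  // each instance starts on its own (assumed) cache line
};

AE_FORCEINLINE static int add_one(int x, int debugOnlyArg)
{
  AE_UNUSED(debugOnlyArg);  // silences 'unreferenced parameter' warnings portably
  return x + 1;
}
#endif
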
// Portable atomic fences implemented below:

namespace moodycamel
{
enum memory_order
{
  memory_order_relaxed,
  memory_order_acquire,
  memory_order_release,
  memory_order_acq_rel,
  memory_order_seq_cst,

  // memory_order_sync: Forces a full sync:
  // #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad
  memory_order_sync = memory_order_seq_cst
};

} // end namespace moodycamel

#if (defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))) || defined(AE_ICC)
// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences

#include <intrin.h>

#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
#define AeFullSync _mm_mfence
#define AeLiteSync _mm_mfence
#elif defined(AE_ARCH_IA64)
#define AeFullSync __mf
#define AeLiteSync __mf
#elif defined(AE_ARCH_PPC)
#include <ppcintrinsics.h>
#define AeFullSync __sync
#define AeLiteSync __lwsync
#endif

#ifdef AE_VCPP
#pragma warning(push)
#pragma warning(disable : 4365) // Disable erroneous 'conversion from long to unsigned int, signed/unsigned mismatch'
                                // error when using `assert`
#ifdef __cplusplus_cli
#pragma managed(push, off)
#endif
#endif

namespace moodycamel
{
AE_FORCEINLINE void compiler_fence(memory_order order)
{
  switch (order)
  {
    case memory_order_relaxed:
      break;
    case memory_order_acquire:
      _ReadBarrier();
      break;
    case memory_order_release:
      _WriteBarrier();
      break;
    case memory_order_acq_rel:
      _ReadWriteBarrier();
      break;
    case memory_order_seq_cst:
      _ReadWriteBarrier();
      break;
    default:
      assert(false);
  }
}

// x86/x64 have a strong memory model -- all loads and stores have
// acquire and release semantics automatically (so only need compiler
// barriers for those).
#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64)
AE_FORCEINLINE void fence(memory_order order)
{
  switch (order)
  {
    case memory_order_relaxed:
      break;
    case memory_order_acquire:
      _ReadBarrier();
      break;
    case memory_order_release:
      _WriteBarrier();
      break;
    case memory_order_acq_rel:
      _ReadWriteBarrier();
      break;
    case memory_order_seq_cst:
      _ReadWriteBarrier();
      AeFullSync();
      _ReadWriteBarrier();
      break;
    default:
      assert(false);
  }
}
#else
AE_FORCEINLINE void fence(memory_order order)
{
  // Non-specialized arch, use heavier memory barriers everywhere just in case :-(
  switch (order)
  {
    case memory_order_relaxed:
      break;
    case memory_order_acquire:
      _ReadBarrier();
      AeLiteSync();
      _ReadBarrier();
      break;
    case memory_order_release:
      _WriteBarrier();
      AeLiteSync();
      _WriteBarrier();
      break;
    case memory_order_acq_rel:
      _ReadWriteBarrier();
      AeLiteSync();
      _ReadWriteBarrier();
      break;
    case memory_order_seq_cst:
      _ReadWriteBarrier();
      AeFullSync();
      _ReadWriteBarrier();
      break;
    default:
      assert(false);
  }
}
#endif
} // end namespace moodycamel
#else
// Use the standard library's atomic fences
#include <atomic>

namespace moodycamel
{
AE_FORCEINLINE void compiler_fence(memory_order order)
{
  switch (order)
  {
    case memory_order_relaxed:
      break;
    case memory_order_acquire:
      std::atomic_signal_fence(std::memory_order_acquire);
      break;
    case memory_order_release:
      std::atomic_signal_fence(std::memory_order_release);
      break;
    case memory_order_acq_rel:
      std::atomic_signal_fence(std::memory_order_acq_rel);
      break;
    case memory_order_seq_cst:
      std::atomic_signal_fence(std::memory_order_seq_cst);
      break;
    default:
      assert(false);
  }
}

AE_FORCEINLINE void fence(memory_order order)
{
  switch (order)
  {
    case memory_order_relaxed:
      break;
    case memory_order_acquire:
      std::atomic_thread_fence(std::memory_order_acquire);
      break;
    case memory_order_release:
      std::atomic_thread_fence(std::memory_order_release);
      break;
    case memory_order_acq_rel:
      std::atomic_thread_fence(std::memory_order_acq_rel);
      break;
    case memory_order_seq_cst:
      std::atomic_thread_fence(std::memory_order_seq_cst);
      break;
    default:
      assert(false);
  }
}

} // end namespace moodycamel

#endif
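
// Illustrative usage sketch (not part of the original header): publishing data
// with a release/acquire fence pair. Names `g_payload` and `g_ready` are
// hypothetical; real code should make the flag itself a (weak_)atomic type.
#if 0 // example only -- not compiled as part of this header
static int g_payload = 0;
static volatile int g_ready = 0;  // stand-in for an atomic flag

void producer_example()
{
  g_payload = 42;
  moodycamel::fence(moodycamel::memory_order_release);  // payload visible before flag
  g_ready = 1;
}

bool consumer_example(int& out)
{
  if (g_ready == 0)
    return false;
  moodycamel::fence(moodycamel::memory_order_acquire);  // flag read before payload
  out = g_payload;
  return true;
}
#endif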

#if !defined(AE_VCPP) || (_MSC_VER >= 1700 && !defined(__cplusplus_cli))
#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
#endif

#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
#include <atomic>
#endif
#include <utility>

// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY:
// Provides basic support for atomic variables -- no memory ordering guarantees are provided.
// The guarantee of atomicity is only made for types that already have atomic load and store guarantees
// at the hardware level -- on most platforms this generally means aligned pointers and integers (only).
namespace moodycamel
{
template <typename T>
class weak_atomic
{
public:
  weak_atomic()
  {
  }
#ifdef AE_VCPP
#pragma warning(disable : 4100) // Get rid of (erroneous) 'unreferenced formal parameter' warning
#endif
  template <typename U>
  weak_atomic(U&& x) : value(std::forward<U>(x))
  {
  }
#ifdef __cplusplus_cli
  // Work around bug with universal reference/nullptr combination that only appears when /clr is on
  weak_atomic(nullptr_t) : value(nullptr)
  {
  }
#endif
  // Copy via load() so these also compile when `value` is a std::atomic
  // (whose copy constructor is deleted)
  weak_atomic(weak_atomic const& other) : value(other.load())
  {
  }
  weak_atomic(weak_atomic&& other) : value(std::move(other.load()))
  {
  }
#ifdef AE_VCPP
#pragma warning(default : 4100)
#endif

  AE_FORCEINLINE operator T() const
  {
    return load();
  }

#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
  template <typename U>
  AE_FORCEINLINE weak_atomic const& operator=(U&& x)
  {
    value = std::forward<U>(x);
    return *this;
  }
  AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other)
  {
    value = other.value;
    return *this;
  }

  AE_FORCEINLINE T load() const
  {
    return value;
  }

  AE_FORCEINLINE T fetch_add_acquire(T increment)
  {
#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
    if (sizeof(T) == 4)
      return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
#if defined(_M_AMD64)
    else if (sizeof(T) == 8)
      return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
#endif
#else
#error Unsupported platform
#endif
    assert(false && "T must be either a 32 or 64 bit type");
    return value;
  }

  AE_FORCEINLINE T fetch_add_release(T increment)
  {
#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
    if (sizeof(T) == 4)
      return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
#if defined(_M_AMD64)
    else if (sizeof(T) == 8)
      return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
#endif
#else
#error Unsupported platform
#endif
    assert(false && "T must be either a 32 or 64 bit type");
    return value;
  }
#else
  template <typename U>
  AE_FORCEINLINE weak_atomic const& operator=(U&& x)
  {
    value.store(std::forward<U>(x), std::memory_order_relaxed);
    return *this;
  }

  AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other)
  {
    value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed);
    return *this;
  }

  AE_FORCEINLINE T load() const
  {
    return value.load(std::memory_order_relaxed);
  }

  AE_FORCEINLINE T fetch_add_acquire(T increment)
  {
    return value.fetch_add(increment, std::memory_order_acquire);
  }

  AE_FORCEINLINE T fetch_add_release(T increment)
  {
    return value.fetch_add(increment, std::memory_order_release);
  }
#endif

private:
#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
  // No std::atomic support, but still need to circumvent compiler optimizations.
  // `volatile` will make memory access slow, but is guaranteed to be reliable.
  volatile T value;
#else
  std::atomic<T> value;
#endif
};

} // end namespace moodycamel
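
// Illustrative usage sketch (not part of the original header): weak_atomic
// provides relaxed loads/stores plus acquire/release fetch-adds, and nothing
// more. `counter` is a hypothetical name.
#if 0 // example only -- not compiled as part of this header
void weak_atomic_example()
{
  moodycamel::weak_atomic<int> counter(0);
  counter = 5;                                // relaxed store
  int before = counter.fetch_add_acquire(1);  // atomic add, acquire ordering
  int now = counter.load();                   // relaxed load
  AE_UNUSED(before);
  AE_UNUSED(now);
}
#endif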

// Portable single-producer, single-consumer semaphore below:

#if defined(_WIN32)
// Avoid including windows.h in a header; we only need a handful of
// items, so we'll redeclare them here (this is relatively safe since
// the API generally has to remain stable between Windows versions).
// I know this is an ugly hack but it still beats polluting the global
// namespace with thousands of generic names or adding a .cpp for nothing.
extern "C" {
struct _SECURITY_ATTRIBUTES;
__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount,
                                                       long lMaximumCount, const wchar_t* lpName);
__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
}
#elif defined(__MACH__)
#include <mach/mach.h>
#elif defined(__unix__)
#include <semaphore.h>
#endif

namespace moodycamel
{
// Code in the spsc_sema namespace below is an adaptation of Jeff Preshing's
// portable + lightweight semaphore implementations, originally from
// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
// LICENSE:
// Copyright (c) 2015 Jeff Preshing
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
//    claim that you wrote the original software. If you use this software
//    in a product, an acknowledgement in the product documentation would be
//    appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
//    misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
namespace spsc_sema
{
#if defined(_WIN32)
class Semaphore
{
private:
  void* m_hSema;

  Semaphore(const Semaphore& other);
  Semaphore& operator=(const Semaphore& other);

public:
  Semaphore(int initialCount = 0)
  {
    assert(initialCount >= 0);
    const long maxLong = 0x7fffffff;
    m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
  }

  ~Semaphore()
  {
    CloseHandle(m_hSema);
  }

  void wait()
  {
    const unsigned long infinite = 0xffffffff;
    WaitForSingleObject(m_hSema, infinite);
  }

  bool try_wait()
  {
    const unsigned long RC_WAIT_TIMEOUT = 0x00000102;
    return WaitForSingleObject(m_hSema, 0) != RC_WAIT_TIMEOUT;
  }

  bool timed_wait(std::uint64_t usecs)
  {
    const unsigned long RC_WAIT_TIMEOUT = 0x00000102;
    return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) != RC_WAIT_TIMEOUT;
  }

  void signal(int count = 1)
  {
    ReleaseSemaphore(m_hSema, count, nullptr);
  }
};
#elif defined(__MACH__)
//---------------------------------------------------------
// Semaphore (Apple iOS and OSX)
// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
//---------------------------------------------------------
class Semaphore
{
private:
  semaphore_t m_sema;

  Semaphore(const Semaphore& other);
  Semaphore& operator=(const Semaphore& other);

public:
  Semaphore(int initialCount = 0)
  {
    assert(initialCount >= 0);
    semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
  }

  ~Semaphore()
  {
    semaphore_destroy(mach_task_self(), m_sema);
  }

  void wait()
  {
    semaphore_wait(m_sema);
  }

  bool try_wait()
  {
    return timed_wait(0);
  }

  bool timed_wait(std::int64_t timeout_usecs)
  {
    mach_timespec_t ts;
    ts.tv_sec = timeout_usecs / 1000000;
    ts.tv_nsec = (timeout_usecs % 1000000) * 1000;

    // added in OSX 10.10:
    // https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
    kern_return_t rc = semaphore_timedwait(m_sema, ts);

    return rc != KERN_OPERATION_TIMED_OUT;
  }

  void signal()
  {
    semaphore_signal(m_sema);
  }

  void signal(int count)
  {
    while (count-- > 0)
    {
      semaphore_signal(m_sema);
    }
  }
};
#elif defined(__unix__)
//---------------------------------------------------------
// Semaphore (POSIX, Linux)
//---------------------------------------------------------
class Semaphore
{
private:
  sem_t m_sema;

  Semaphore(const Semaphore& other);
  Semaphore& operator=(const Semaphore& other);

public:
  Semaphore(int initialCount = 0)
  {
    assert(initialCount >= 0);
    sem_init(&m_sema, 0, initialCount);
  }

  ~Semaphore()
  {
    sem_destroy(&m_sema);
  }

  void wait()
  {
    // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
    int rc;
    do
    {
      rc = sem_wait(&m_sema);
    } while (rc == -1 && errno == EINTR);
  }

  bool try_wait()
  {
    int rc;
    do
    {
      rc = sem_trywait(&m_sema);
    } while (rc == -1 && errno == EINTR);
    return !(rc == -1 && errno == EAGAIN);
  }

  bool timed_wait(std::uint64_t usecs)
  {
    struct timespec ts;
    const int usecs_in_1_sec = 1000000;
    const int nsecs_in_1_sec = 1000000000;
    clock_gettime(CLOCK_REALTIME, &ts);
    ts.tv_sec += usecs / usecs_in_1_sec;
    ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000;
    // sem_timedwait bombs if you have more than 1e9 in tv_nsec
    // so we have to clean things up before passing it in
    // (note: >= so that tv_nsec always ends up strictly below 1e9)
    if (ts.tv_nsec >= nsecs_in_1_sec)
    {
      ts.tv_nsec -= nsecs_in_1_sec;
      ++ts.tv_sec;
    }

    int rc;
    do
    {
      rc = sem_timedwait(&m_sema, &ts);
    } while (rc == -1 && errno == EINTR);
    return !(rc == -1 && errno == ETIMEDOUT);
  }

  void signal()
  {
    sem_post(&m_sema);
  }

  void signal(int count)
  {
    while (count-- > 0)
    {
      sem_post(&m_sema);
    }
  }
};
#else
#error Unsupported platform! (No semaphore wrapper available)
#endif

//---------------------------------------------------------
// LightweightSemaphore
//---------------------------------------------------------
class LightweightSemaphore
{
public:
  typedef std::make_signed<std::size_t>::type ssize_t;

private:
  weak_atomic<ssize_t> m_count;
  Semaphore m_sema;

  bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1)
  {
    ssize_t oldCount;
    // Is there a better way to set the initial spin count?
    // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC,
    // as threads start hitting the kernel semaphore.
    int spin = 10000;
    while (--spin >= 0)
    {
      if (m_count.load() > 0)
      {
        m_count.fetch_add_acquire(-1);
        return true;
      }
      compiler_fence(memory_order_acquire);  // Prevent the compiler from collapsing the loop.
    }
    oldCount = m_count.fetch_add_acquire(-1);
    if (oldCount > 0)
      return true;
    if (timeout_usecs < 0)
    {
      m_sema.wait();
      return true;
    }
    if (m_sema.timed_wait(timeout_usecs))
      return true;
    // At this point, we've timed out waiting for the semaphore, but the
    // count is still decremented indicating we may still be waiting on
    // it. So we have to re-adjust the count, but only if the semaphore
    // wasn't signaled enough times for us too since then. If it was, we
    // need to release the semaphore too.
    while (true)
    {
      oldCount = m_count.fetch_add_release(1);
      if (oldCount < 0)
        return false;  // successfully restored things to the way they were
      // Oh, the producer thread just signaled the semaphore after all. Try again:
      oldCount = m_count.fetch_add_acquire(-1);
      if (oldCount > 0 && m_sema.try_wait())
        return true;
    }
  }

public:
  LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount)
  {
    assert(initialCount >= 0);
  }

  bool tryWait()
  {
    if (m_count.load() > 0)
    {
      m_count.fetch_add_acquire(-1);
      return true;
    }
    return false;
  }

  void wait()
  {
    if (!tryWait())
      waitWithPartialSpinning();
  }

  bool wait(std::int64_t timeout_usecs)
  {
    return tryWait() || waitWithPartialSpinning(timeout_usecs);
  }

  void signal(ssize_t count = 1)
  {
    assert(count >= 0);
    ssize_t oldCount = m_count.fetch_add_release(count);
    assert(oldCount >= -1);
    if (oldCount < 0)
    {
      m_sema.signal(1);
    }
  }

  ssize_t availableApprox() const
  {
    ssize_t count = m_count.load();
    return count > 0 ? count : 0;
  }
};
} // end namespace spsc_sema
} // end namespace moodycamel

#if defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))
#pragma warning(pop)
#ifdef __cplusplus_cli
#pragma managed(pop)
#endif
#endif
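
// Illustrative usage sketch (not part of the original header): the intended
// single-producer/single-consumer handoff. `queue_push`/`queue_pop` are
// hypothetical stand-ins for the user's SPSC data structure.
#if 0 // example only -- not compiled as part of this header
void queue_push(int item);  // hypothetical
int queue_pop();            // hypothetical

moodycamel::spsc_sema::LightweightSemaphore items;  // counts items ready for the consumer

void producer_thread()  // exactly one producer
{
  queue_push(42);
  items.signal();  // release: wakes the consumer if it went to sleep
}

void consumer_thread()  // exactly one consumer
{
  if (items.wait(1000))  // spin briefly, then block for up to 1000 microseconds
  {
    int x = queue_pop();  // an item is guaranteed to be ready
    AE_UNUSED(x);
  }
}
#endif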