5d225ec202
Use new internal atomics as they are now faster than TBB.
From a58d0ff4935ef14f32f01d4de362bba242f07e0c Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 4 May 2013 10:22:12 -0700
Subject: [PATCH] spinlock tweaks that finally make it as good or better than
 TBB.
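
The core of the change: spin_mutex now spins on a plain read of the lock
word and releases it with an ordinary store preceded by a compiler
barrier (on x86, an aligned store already has "release" semantics in
hardware), rather than using a bus-locking atomic for the unlock. A
minimal standalone sketch of that idea -- assuming GCC on x86/x86_64;
the name tiny_spin_mutex is illustrative only, simplified from the real
class in thread.h:

    struct tiny_spin_mutex {
        tiny_spin_mutex () : m_locked(0) { }
        void lock () {
            // Atomic test-and-set with "acquire" semantics; between
            // attempts, spin on a plain read so we don't lock the bus.
            while (__sync_lock_test_and_set (&m_locked, 1))
                while (m_locked)
                    __asm__ __volatile__ ("pause");
        }
        void unlock () {
            // Compiler barrier only; the plain store that follows is
            // already a release operation on x86.
            __asm__ __volatile__ ("" : : : "memory");
            m_locked = 0;
        }
        volatile int m_locked;
    };

(The real implementation below adds exponential backoff and an eventual
yield of the timeslice under heavy contention.)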
---
 src/include/thread.h                 | 89 ++++++++++++++++--------------------
 src/libOpenImageIO/atomic_test.cpp   |  9 ++--
 src/libOpenImageIO/spinlock_test.cpp | 22 +++++++--
 src/libtexture/imagecache_pvt.h      |  2 +-
 4 files changed, 62 insertions(+), 60 deletions(-)

diff --git a/src/include/thread.h b/src/include/thread.h
index 28645fc..2cd03c1 100644
--- a/src/include/thread.h
+++ b/src/include/thread.h
@@ -78,16 +78,22 @@
 // Some day, we hope this is all replaced by use of std::atomic<>.
 #if USE_TBB
 # include <tbb/atomic.h>
- using tbb::atomic;
 # include <tbb/spin_mutex.h>
+# define USE_TBB_ATOMIC 1
+# define USE_TBB_SPINLOCK 1
+#else
+# define USE_TBB_ATOMIC 0
+# define USE_TBB_SPINLOCK 0
 #endif

+
 #if defined(_MSC_VER) && !USE_TBB
 # include <windows.h>
 # include <winbase.h>
 # pragma intrinsic (_InterlockedExchangeAdd)
 # pragma intrinsic (_InterlockedCompareExchange)
 # pragma intrinsic (_InterlockedCompareExchange64)
+# pragma intrinsic (_ReadWriteBarrier)
 # if defined(_WIN64)
 # pragma intrinsic(_InterlockedExchangeAdd64)
 # endif
@@ -105,10 +111,6 @@
 # endif
 #endif

-#ifdef __APPLE__
-# include <libkern/OSAtomic.h>
-#endif
-
 #if defined(__GNUC__) && (defined(_GLIBCXX_ATOMIC_BUILTINS) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 401))
 #if !defined(__FreeBSD__) || defined(__x86_64__)
 #define USE_GCC_ATOMICS
@@ -230,9 +232,6 @@ class thread_specific_ptr {
 #elif USE_TBB
     atomic<int> *a = (atomic<int> *)at;
     return a->fetch_and_add (x);
-#elif defined(no__APPLE__)
-    // Apple, not inline for Intel (only PPC?)
-    return OSAtomicAdd32Barrier (x, at) - x;
 #elif defined(_MSC_VER)
     // Windows
     return _InterlockedExchangeAdd ((volatile LONG *)at, x);
@@ -251,9 +250,6 @@ class thread_specific_ptr {
 #elif USE_TBB
     atomic<long long> *a = (atomic<long long> *)at;
     return a->fetch_and_add (x);
-#elif defined(no__APPLE__)
-    // Apple, not inline for Intel (only PPC?)
-    return OSAtomicAdd64Barrier (x, at) - x;
 #elif defined(_MSC_VER)
     // Windows
 # if defined(_WIN64)
@@ -282,8 +278,6 @@ class thread_specific_ptr {
 #elif USE_TBB
     atomic<int> *a = (atomic<int> *)at;
     return a->compare_and_swap (newval, compareval) == newval;
-#elif defined(no__APPLE__)
-    return OSAtomicCompareAndSwap32Barrier (compareval, newval, at);
 #elif defined(_MSC_VER)
     return (_InterlockedCompareExchange ((volatile LONG *)at, newval, compareval) == compareval);
 #else
@@ -301,8 +295,6 @@ class thread_specific_ptr {
 #elif USE_TBB
     atomic<long long> *a = (atomic<long long> *)at;
     return a->compare_and_swap (newval, compareval) == newval;
-#elif defined(no__APPLE__)
-    return OSAtomicCompareAndSwap64Barrier (compareval, newval, at);
 #elif defined(_MSC_VER)
     return (_InterlockedCompareExchange64 ((volatile LONGLONG *)at, newval, compareval) == compareval);
 #else
@@ -317,9 +309,7 @@ class thread_specific_ptr {
 inline void
 yield ()
 {
-#if USE_TBB
-    __TBB_Yield ();
-#elif defined(__GNUC__)
+#if defined(__GNUC__)
     sched_yield ();
 #elif defined(_MSC_VER)
     SwitchToThread ();
@@ -334,12 +324,12 @@ class thread_specific_ptr {
 inline void
 pause (int delay)
 {
-#if USE_TBB
-    __TBB_Pause(delay);
-#elif defined(__GNUC__)
+#if defined(__GNUC__)
     for (int i = 0; i < delay; ++i) {
         __asm__ __volatile__("pause;");
     }
+#elif USE_TBB
+    __TBB_Pause(delay);
 #elif defined(_MSC_VER)
     for (int i = 0; i < delay; ++i) {
 #if defined (_WIN64)
@@ -369,14 +359,17 @@ class atomic_backoff {
             yield();
         }
     }
+
 private:
     int m_count;
 };



-#if (! USE_TBB)
-// If we're not using TBB, we need to define our own atomic<>.
+#if USE_TBB_ATOMIC
+using tbb::atomic;
+#else
+// If we're not using TBB's atomic, we need to define our own atomic<>.


 /// Atomic integer. Increment, decrement, add, and subtract in a
@@ -456,7 +449,7 @@ class atomic {
 };


-#endif /* ! USE_TBB */
+#endif /* ! USE_TBB_ATOMIC */


 #ifdef NOTHREADS
@@ -478,7 +471,7 @@ class atomic {
 typedef null_mutex spin_mutex;
 typedef null_lock<spin_mutex> spin_lock;

-#elif USE_TBB
+#elif USE_TBB_SPINLOCK

 // Use TBB's spin locks
 typedef tbb::spin_mutex spin_mutex;
@@ -529,63 +522,61 @@ class spin_mutex {
     /// Acquire the lock, spin until we have it.
     ///
     void lock () {
-#if defined(no__APPLE__)
-        // OS X has dedicated spin lock routines, may as well use them.
-        OSSpinLockLock ((OSSpinLock *)&m_locked);
-#else
         // To avoid spinning too tightly, we use the atomic_backoff to
         // provide increasingly longer pauses, and if the lock is under
         // lots of contention, eventually yield the timeslice.
         atomic_backoff backoff;
+
         // Try to get ownership of the lock. Through experimentation, we
         // found that OIIO_UNLIKELY makes this just a bit faster on
         // gcc x86/x86_64 systems.
         while (! OIIO_UNLIKELY(try_lock())) {
             do {
                 backoff();
-            } while (*(volatile int *)&m_locked);
+            } while (m_locked);
+
             // The full try_lock() involves a compare_and_swap, which
             // writes memory, and that will lock the bus. But a normal
             // read of m_locked will let us spin until the value
             // changes, without locking the bus. So it's faster to
             // check in this manner until the mutex appears to be free.
         }
-#endif
     }

     /// Release the lock that we hold.
     ///
     void unlock () {
-#if defined(no__APPLE__)
-        OSSpinLockUnlock ((OSSpinLock *)&m_locked);
-#elif defined(__GNUC__)
-        // GCC gives us an intrinsic that is even better, an atomic
-        // assignment of 0 with "release" barrier semantics.
-        __sync_lock_release ((volatile int *)&m_locked);
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+        // Fastest way to do it is with a store with "release" semantics
+        __asm__ __volatile__("": : :"memory");
+        m_locked = 0;
+        // N.B. GCC gives us an intrinsic that is even better, an atomic
+        // assignment of 0 with "release" barrier semantics:
+        //     __sync_lock_release (&m_locked);
+        // But empirically we found it not as performant as the above.
+#elif defined(_MSC_VER)
+        _ReadWriteBarrier();
+        m_locked = 0;
 #else
         // Otherwise, just assign zero to the atomic (but that's a full
         // memory barrier).
-        m_locked = 0;
+        *(atomic_int *)&m_locked = 0;
 #endif
     }

     /// Try to acquire the lock. Return true if we have it, false if
     /// somebody else is holding the lock.
     bool try_lock () {
-#if defined(no__APPLE__)
-        return OSSpinLockTry ((OSSpinLock *)&m_locked);
-#else
-# if USE_TBB
+#if USE_TBB_ATOMIC
         // TBB's compare_and_swap returns the original value
-        return m_locked.compare_and_swap (0, 1) == 0;
-# elif defined(__GNUC__)
+        return (*(atomic_int *)&m_locked).compare_and_swap (0, 1) == 0;
+#elif defined(__GNUC__)
         // GCC gives us an intrinsic that is even better -- an atomic
         // exchange with "acquire" barrier semantics.
-        return __sync_lock_test_and_set ((volatile int *)&m_locked, 1) == 0;
-# else
+        return __sync_lock_test_and_set (&m_locked, 1) == 0;
+#else
         // Our compare_and_swap returns true if it swapped
-        return m_locked.bool_compare_and_swap (0, 1);
-# endif
+        return atomic_compare_and_exchange (&m_locked, 0, 1);
 #endif
     }

@@ -603,7 +594,7 @@ class spin_mutex {
 };

 private:
-    atomic_int m_locked;   ///< Atomic counter is zero if nobody holds the lock
+    volatile int m_locked; ///< Atomic counter is zero if nobody holds the lock
 };


diff --git a/src/libOpenImageIO/atomic_test.cpp b/src/libOpenImageIO/atomic_test.cpp
index 2c1e807..42d469a 100644
--- a/src/libOpenImageIO/atomic_test.cpp
+++ b/src/libOpenImageIO/atomic_test.cpp
@@ -49,7 +49,7 @@
 // and decrementing the crap out of it, and make sure it has the right
 // value at the end.

-static int iterations = 160000000;
+static int iterations = 40000000;
 static int numthreads = 16;
 static int ntrials = 1;
 static bool verbose = false;
@@ -184,16 +184,15 @@ int main (int argc, char *argv[])

     static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128, 1024, 1<<30 };
     for (int i = 0; threadcounts[i] <= numthreads; ++i) {
-        int nt = threadcounts[i];
+        int nt = wedge ? threadcounts[i] : numthreads;
         int its = iterations/nt;

         double range;
         double t = time_trial (boost::bind(test_atomics,nt,its),
                                ntrials, &range);

-        std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
-                                      nt, Strutil::timeintervalformat(t),
-                                      t, range, its);
+        std::cout << Strutil::format ("%2d\t%5.1f range %.2f\t(%d iters/thread)\n",
+                                      nt, t, range, its);
         if (! wedge)
             break;    // don't loop if we're not wedging
     }
diff --git a/src/libOpenImageIO/spinlock_test.cpp b/src/libOpenImageIO/spinlock_test.cpp
index 60c192b..64adbce 100644
--- a/src/libOpenImageIO/spinlock_test.cpp
+++ b/src/libOpenImageIO/spinlock_test.cpp
@@ -50,7 +50,7 @@
 // accumulated value is equal to iterations*threads, then the spin locks
 // worked.

-static int iterations = 160000000;
+static int iterations = 40000000;
 static int numthreads = 16;
 static int ntrials = 1;
 static bool verbose = false;
@@ -58,6 +58,7 @@

 static spin_mutex print_mutex;   // make the prints not clobber each other
 volatile long long accum = 0;
+float faccum = 0;
 spin_mutex mymutex;


@@ -71,10 +72,22 @@
         std::cout << "thread " << boost::this_thread::get_id()
                   << ", accum = " << accum << "\n";
     }
+#if 1
     for (int i = 0; i < iterations; ++i) {
         spin_lock lock (mymutex);
         accum += 1;
     }
+#else
+    // Alternate one that mixes in some math to make longer lock hold time,
+    // and also more to do between locks. Interesting contrast in timings.
+    float last = 0.0f;
+    for (int i = 0; i < iterations; ++i) {
+        last = fmodf (sinf(last), 1.0f);
+        spin_lock lock (mymutex);
+        accum += 1;
+        faccum = fmod (sinf(faccum+last), 1.0f);
+    }
+#endif
 }


@@ -134,16 +147,15 @@ int main (int argc, char *argv[])

     static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128, 1024, 1<<30 };
     for (int i = 0; threadcounts[i] <= numthreads; ++i) {
-        int nt = threadcounts[i];
+        int nt = wedge ? threadcounts[i] : numthreads;
         int its = iterations/nt;

         double range;
         double t = time_trial (boost::bind(test_spinlock,nt,its),
                                ntrials, &range);

-        std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
-                                      nt, Strutil::timeintervalformat(t),
-                                      t, range, its);
+        std::cout << Strutil::format ("%2d\t%5.1f range %.2f\t(%d iters/thread)\n",
+                                      nt, t, range, its);
         if (! wedge)
             break;    // don't loop if we're not wedging
     }
diff --git a/src/libtexture/imagecache_pvt.h b/src/libtexture/imagecache_pvt.h
index 5d29782..3a49616 100644
--- a/src/libtexture/imagecache_pvt.h
+++ b/src/libtexture/imagecache_pvt.h
@@ -1003,7 +1003,7 @@ class ImageCacheImpl : public ImageCache {
         newval = oldval + incr;
         // Now try to atomically swap it, and repeat until we've
         // done it with nobody else interfering.
-# if USE_TBB
+# if USE_TBB_ATOMIC
         } while (llstat->compare_and_swap (*llnewval,*lloldval) != *lloldval);
 # else
         } while (llstat->bool_compare_and_swap (*llnewval,*lloldval));
--
1.8.1.6
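
N.B.: the atomic_backoff object used by lock() above follows the classic
exponential-backoff shape. A sketch of that pattern, assuming the
pause()/yield() helpers shown in the thread.h hunks; the threshold name
and value here are illustrative, not necessarily OIIO's:

    class backoff {
    public:
        backoff () : m_count(1) { }
        void operator() () {
            const int loops_before_yield = 16;  // assumed threshold
            if (m_count <= loops_before_yield) {
                pause (m_count);    // spin for m_count 'pause' iterations
                m_count *= 2;       // exponential growth of the delay
            } else {
                yield ();           // heavily contended: give up timeslice
            }
        }
    private:
        int m_count;
    };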