From a58d0ff4935ef14f32f01d4de362bba242f07e0c Mon Sep 17 00:00:00 2001
From: Larry Gritz
Date: Sat, 4 May 2013 10:22:12 -0700
Subject: [PATCH] spinlock tweaks that finally make it as good or better than
 TBB.

---
 src/include/thread.h                 | 89 ++++++++++++++++--------------------
 src/libOpenImageIO/atomic_test.cpp   |  9 ++--
 src/libOpenImageIO/spinlock_test.cpp | 22 +++++++--
 src/libtexture/imagecache_pvt.h      |  2 +-
 4 files changed, 62 insertions(+), 60 deletions(-)

diff --git a/src/include/thread.h b/src/include/thread.h
index 28645fc..2cd03c1 100644
--- a/src/include/thread.h
+++ b/src/include/thread.h
@@ -78,16 +78,22 @@
 // Some day, we hope this is all replaced by use of std::atomic<>.
 
 #if USE_TBB
 # include <tbb/atomic.h>
-using tbb::atomic;
 # include <tbb/spin_mutex.h>
+# define USE_TBB_ATOMIC 1
+# define USE_TBB_SPINLOCK 1
+#else
+# define USE_TBB_ATOMIC 0
+# define USE_TBB_SPINLOCK 0
 #endif
+
 #if defined(_MSC_VER) && !USE_TBB
 # include <windows.h>
 # include <intrin.h>
 # pragma intrinsic (_InterlockedExchangeAdd)
 # pragma intrinsic (_InterlockedCompareExchange)
 # pragma intrinsic (_InterlockedCompareExchange64)
+# pragma intrinsic (_ReadWriteBarrier)
 # if defined(_WIN64)
 #   pragma intrinsic(_InterlockedExchangeAdd64)
 # endif
@@ -105,10 +111,6 @@
 # endif
 #endif
 
-#ifdef __APPLE__
-# include <libkern/OSAtomic.h>
-#endif
-
 #if defined(__GNUC__) && (defined(_GLIBCXX_ATOMIC_BUILTINS) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 401))
 #if !defined(__FreeBSD__) || defined(__x86_64__)
 #define USE_GCC_ATOMICS
@@ -230,9 +232,6 @@ class thread_specific_ptr {
 #elif USE_TBB
     atomic<int> *a = (atomic<int> *)at;
     return a->fetch_and_add (x);
-#elif defined(no__APPLE__)
-    // Apple, not inline for Intel (only PPC?)
-    return OSAtomicAdd32Barrier (x, at) - x;
 #elif defined(_MSC_VER)
     // Windows
     return _InterlockedExchangeAdd ((volatile LONG *)at, x);
@@ -251,9 +250,6 @@ class thread_specific_ptr {
 #elif USE_TBB
     atomic<long long> *a = (atomic<long long> *)at;
     return a->fetch_and_add (x);
-#elif defined(no__APPLE__)
-    // Apple, not inline for Intel (only PPC?)
-    return OSAtomicAdd64Barrier (x, at) - x;
 #elif defined(_MSC_VER)
     // Windows
 # if defined(_WIN64)
@@ -282,8 +278,6 @@ class thread_specific_ptr {
 #elif USE_TBB
     atomic<int> *a = (atomic<int> *)at;
     return a->compare_and_swap (newval, compareval) == newval;
-#elif defined(no__APPLE__)
-    return OSAtomicCompareAndSwap32Barrier (compareval, newval, at);
 #elif defined(_MSC_VER)
     return (_InterlockedCompareExchange ((volatile LONG *)at, newval, compareval) == compareval);
 #else
@@ -301,8 +295,6 @@ class thread_specific_ptr {
 #elif USE_TBB
     atomic<long long> *a = (atomic<long long> *)at;
     return a->compare_and_swap (newval, compareval) == newval;
-#elif defined(no__APPLE__)
-    return OSAtomicCompareAndSwap64Barrier (compareval, newval, at);
 #elif defined(_MSC_VER)
     return (_InterlockedCompareExchange64 ((volatile LONGLONG *)at, newval, compareval) == compareval);
 #else
@@ -317,9 +309,7 @@ class thread_specific_ptr {
 inline void
 yield ()
 {
-#if USE_TBB
-    __TBB_Yield ();
-#elif defined(__GNUC__)
+#if defined(__GNUC__)
     sched_yield ();
 #elif defined(_MSC_VER)
     SwitchToThread ();
@@ -334,12 +324,12 @@ class thread_specific_ptr {
 inline void
 pause (int delay)
 {
-#if USE_TBB
-    __TBB_Pause(delay);
-#elif defined(__GNUC__)
+#if defined(__GNUC__)
     for (int i = 0; i < delay; ++i) {
         __asm__ __volatile__("pause;");
     }
+#elif USE_TBB
+    __TBB_Pause(delay);
 #elif defined(_MSC_VER)
     for (int i = 0; i < delay; ++i) {
 #if defined (_WIN64)
@@ -369,14 +359,17 @@ class atomic_backoff {
             yield();
         }
     }
+
 private:
     int m_count;
 };
 
 
-#if (! USE_TBB)
-// If we're not using TBB, we need to define our own atomic<>.
+#if USE_TBB_ATOMIC
+using tbb::atomic;
+#else
+// If we're not using TBB's atomic, we need to define our own atomic<>.
 
 
 /// Atomic integer.  Increment, decrement, add, and subtract in a
@@ -456,7 +449,7 @@ class atomic {
 };
 
 
-#endif /* ! USE_TBB */
+#endif /* ! USE_TBB_ATOMIC */
 
 
 #ifdef NOTHREADS
@@ -478,7 +471,7 @@ class atomic {
 typedef null_mutex spin_mutex;
 typedef null_lock spin_lock;
 
-#elif USE_TBB
+#elif USE_TBB_SPINLOCK
 
 // Use TBB's spin locks
 typedef tbb::spin_mutex spin_mutex;
@@ -529,63 +522,61 @@ class spin_mutex {
     /// Acquire the lock, spin until we have it.
     ///
     void lock () {
-#if defined(no__APPLE__)
-        // OS X has dedicated spin lock routines, may as well use them.
-        OSSpinLockLock ((OSSpinLock *)&m_locked);
-#else
         // To avoid spinning too tightly, we use the atomic_backoff to
         // provide increasingly longer pauses, and if the lock is under
        // lots of contention, eventually yield the timeslice.
         atomic_backoff backoff;
+
         // Try to get ownership of the lock.  Though experimentation, we
         // found that OIIO_UNLIKELY makes this just a bit faster on
         // gcc x86/x86_64 systems.
         while (! OIIO_UNLIKELY(try_lock())) {
             do {
                 backoff();
-            } while (*(volatile int *)&m_locked);
+            } while (m_locked);
+
             // The full try_lock() involves a compare_and_swap, which
             // writes memory, and that will lock the bus.  But a normal
             // read of m_locked will let us spin until the value
             // changes, without locking the bus.  So it's faster to
             // check in this manner until the mutex appears to be free.
         }
-#endif
     }
 
     /// Release the lock that we hold.
     ///
     void unlock () {
-#if defined(no__APPLE__)
-        OSSpinLockUnlock ((OSSpinLock *)&m_locked);
-#elif defined(__GNUC__)
-        // GCC gives us an intrinsic that is even better, an atomic
-        // assignment of 0 with "release" barrier semantics.
-        __sync_lock_release ((volatile int *)&m_locked);
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+        // Fastest way to do it is with a store with "release" semantics
+        __asm__ __volatile__("": : :"memory");
+        m_locked = 0;
+        // N.B. GCC gives us an intrinsic that is even better, an atomic
+        // assignment of 0 with "release" barrier semantics:
+        //     __sync_lock_release (&m_locked);
+        // But empirically we found it not as performant as the above.
+#elif defined(_MSC_VER)
+        _ReadWriteBarrier();
+        m_locked = 0;
 #else
         // Otherwise, just assign zero to the atomic (but that's a full
         // memory barrier).
-        m_locked = 0;
+        *(atomic_int *)&m_locked = 0;
 #endif
     }
 
     /// Try to acquire the lock.  Return true if we have it, false if
     /// somebody else is holding the lock.
     bool try_lock () {
-#if defined(no__APPLE__)
-        return OSSpinLockTry ((OSSpinLock *)&m_locked);
-#else
-#  if USE_TBB
+#if USE_TBB_ATOMIC
         // TBB's compare_and_swap returns the original value
-        return m_locked.compare_and_swap (0, 1) == 0;
-#  elif defined(__GNUC__)
+        return (*(atomic_int *)&m_locked).compare_and_swap (0, 1) == 0;
+#elif defined(__GNUC__)
         // GCC gives us an intrinsic that is even better -- an atomic
         // exchange with "acquire" barrier semantics.
-        return __sync_lock_test_and_set ((volatile int *)&m_locked, 1) == 0;
-#  else
+        return __sync_lock_test_and_set (&m_locked, 1) == 0;
+#else
         // Our compare_and_swap returns true if it swapped
-        return m_locked.bool_compare_and_swap (0, 1);
-#  endif
+        return atomic_compare_and_exchange (&m_locked, 0, 1);
 #endif
     }
 
@@ -603,7 +594,7 @@ class spin_mutex {
     };
 
 private:
-    atomic_int m_locked;   ///< Atomic counter is zero if nobody holds the lock
+    volatile int m_locked; ///< Atomic counter is zero if nobody holds the lock
 };
 
 
diff --git a/src/libOpenImageIO/atomic_test.cpp b/src/libOpenImageIO/atomic_test.cpp
index 2c1e807..42d469a 100644
--- a/src/libOpenImageIO/atomic_test.cpp
+++ b/src/libOpenImageIO/atomic_test.cpp
@@ -49,7 +49,7 @@
 // and decrementing the crap out of it, and make sure it has the right
 // value at the end.
 
-static int iterations = 160000000;
+static int iterations = 40000000;
 static int numthreads = 16;
 static int ntrials = 1;
 static bool verbose = false;
@@ -184,16 +184,15 @@ int main (int argc, char *argv[])
     static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128,
                                   1024, 1<<30 };
     for (int i = 0; threadcounts[i] <= numthreads; ++i) {
-        int nt = threadcounts[i];
+        int nt = wedge ? threadcounts[i] : numthreads;
         int its = iterations/nt;
 
         double range;
         double t = time_trial (boost::bind(test_atomics,nt,its),
                                ntrials, &range);
 
-        std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
-                                      nt, Strutil::timeintervalformat(t),
-                                      t, range, its);
+        std::cout << Strutil::format ("%2d\t%5.1f range %.2f\t(%d iters/thread)\n",
+                                      nt, t, range, its);
         if (! wedge)
             break;    // don't loop if we're not wedging
     }
diff --git a/src/libOpenImageIO/spinlock_test.cpp b/src/libOpenImageIO/spinlock_test.cpp
index 60c192b..64adbce 100644
--- a/src/libOpenImageIO/spinlock_test.cpp
+++ b/src/libOpenImageIO/spinlock_test.cpp
@@ -50,7 +50,7 @@
 // accumulated value is equal to iterations*threads, then the spin locks
 // worked.
 
-static int iterations = 160000000;
+static int iterations = 40000000;
 static int numthreads = 16;
 static int ntrials = 1;
 static bool verbose = false;
@@ -58,6 +58,7 @@ static spin_mutex print_mutex;   // make the prints not clobber each other
 
 volatile long long accum = 0;
+float faccum = 0;
 spin_mutex mymutex;
 
@@ -71,10 +72,22 @@
         std::cout << "thread " << boost::this_thread::get_id()
                   << ", accum = " << accum << "\n";
     }
 
+#if 1
     for (int i = 0; i < iterations; ++i) {
         spin_lock lock (mymutex);
         accum += 1;
     }
+#else
+    // Alternate one that mixes in some math to make longer lock hold time,
+    // and also more to do between locks.  Interesting contrast in timings.
+    float last = 0.0f;
+    for (int i = 0; i < iterations; ++i) {
+        last = fmodf (sinf(last), 1.0f);
+        spin_lock lock (mymutex);
+        accum += 1;
+        faccum = fmod (sinf(faccum+last), 1.0f);
+    }
+#endif
 }
 
@@ -134,16 +147,15 @@ int main (int argc, char *argv[])
     static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128,
                                   1024, 1<<30 };
     for (int i = 0; threadcounts[i] <= numthreads; ++i) {
-        int nt = threadcounts[i];
+        int nt = wedge ? threadcounts[i] : numthreads;
         int its = iterations/nt;
 
         double range;
         double t = time_trial (boost::bind(test_spinlock,nt,its),
                                ntrials, &range);
 
-        std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
-                                      nt, Strutil::timeintervalformat(t),
-                                      t, range, its);
+        std::cout << Strutil::format ("%2d\t%5.1f range %.2f\t(%d iters/thread)\n",
+                                      nt, t, range, its);
         if (! wedge)
             break;    // don't loop if we're not wedging
     }
diff --git a/src/libtexture/imagecache_pvt.h b/src/libtexture/imagecache_pvt.h
index 5d29782..3a49616 100644
--- a/src/libtexture/imagecache_pvt.h
+++ b/src/libtexture/imagecache_pvt.h
@@ -1003,7 +1003,7 @@ class ImageCacheImpl : public ImageCache {
         newval = oldval + incr;
         // Now try to atomically swap it, and repeat until we've
         // done it with nobody else interfering.
-#  if USE_TBB
+#  if USE_TBB_ATOMIC
     } while (llstat->compare_and_swap (*llnewval,*lloldval) != *lloldval);
 #  else
     } while (llstat->bool_compare_and_swap (*llnewval,*lloldval));
-- 
1.8.1.6
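

For reference, here is a minimal standalone sketch of the test-and-test-and-set
spinlock pattern that the thread.h changes implement: try_lock via an atomic
exchange with acquire semantics, spinning on a plain read (no bus-locking write)
with increasing pauses and an eventual yield, and unlock via a compiler barrier
plus an ordinary store.  It assumes GCC builtins on x86/x86_64 and pthreads; the
names (toy_spin_mutex, worker) are illustrative and not part of OIIO.

    #include <pthread.h>
    #include <sched.h>
    #include <cstdio>

    class toy_spin_mutex {
    public:
        toy_spin_mutex () : m_locked(0) { }

        // Atomic exchange with "acquire" semantics; returns the previous
        // value, so 0 means we took the lock.
        bool try_lock () {
            return __sync_lock_test_and_set (&m_locked, 1) == 0;
        }

        void lock () {
            int pause_count = 1;
            while (! try_lock()) {
                // Spin on a plain read until the lock looks free, pausing
                // longer each time, then yielding the timeslice.
                do {
                    if (pause_count <= 16) {
                        for (int i = 0; i < pause_count; ++i)
                            __asm__ __volatile__("pause;");
                        pause_count *= 2;
                    } else {
                        sched_yield ();
                    }
                } while (m_locked);
            }
        }

        // Compiler barrier then ordinary store; on x86 an aligned int store
        // already has release ordering.
        void unlock () {
            __asm__ __volatile__("" : : : "memory");
            m_locked = 0;
        }

    private:
        volatile int m_locked;
    };

    static toy_spin_mutex mymutex;
    static volatile long long accum = 0;

    static void *worker (void *) {
        for (int i = 0; i < 1000000; ++i) {
            mymutex.lock ();
            accum += 1;
            mymutex.unlock ();
        }
        return 0;
    }

    int main () {
        const int nthreads = 4;
        pthread_t t[nthreads];
        for (int i = 0; i < nthreads; ++i)
            pthread_create (&t[i], 0, worker, 0);
        for (int i = 0; i < nthreads; ++i)
            pthread_join (t[i], 0);
        std::printf ("accum = %lld (expected %d)\n", accum, nthreads * 1000000);
        return 0;
    }

If the locking works, the final accum equals nthreads * iterations, which is the
same check the spinlock_test benchmark relies on.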
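
The test changes also introduce a "wedge" mode: sweep a list of thread counts
(or run only the requested count when not wedging) while keeping the total work
nt * iters constant, so timings at different contention levels are comparable.
Below is a standalone sketch of that timing loop under stated assumptions: it
uses pthreads, clock_gettime, and a plain pthread_mutex worker instead of the
real tests' boost::thread, Strutil::format, and spin_lock; the thread counts and
iteration total are illustrative.

    #include <pthread.h>
    #include <time.h>
    #include <cstdio>

    static pthread_mutex_t mymutex = PTHREAD_MUTEX_INITIALIZER;
    static long long accum = 0;

    struct Args { int iters; };

    static void *worker (void *arg) {
        Args *a = (Args *)arg;
        for (int i = 0; i < a->iters; ++i) {
            pthread_mutex_lock (&mymutex);
            accum += 1;
            pthread_mutex_unlock (&mymutex);
        }
        return 0;
    }

    // Run the contended-increment workload with nthreads threads and return
    // the wall-clock time in seconds.
    static double time_trial (int nthreads, int iters) {
        accum = 0;
        Args a = { iters };
        pthread_t t[64];
        struct timespec t0, t1;
        clock_gettime (CLOCK_MONOTONIC, &t0);
        for (int i = 0; i < nthreads; ++i)
            pthread_create (&t[i], 0, worker, &a);
        for (int i = 0; i < nthreads; ++i)
            pthread_join (t[i], 0);
        clock_gettime (CLOCK_MONOTONIC, &t1);
        return (t1.tv_sec - t0.tv_sec) + 1e-9 * (t1.tv_nsec - t0.tv_nsec);
    }

    int main () {
        const bool wedge = true;          // sweep thread counts, as in --wedge
        const int numthreads = 8;
        const int iterations = 4000000;   // scaled-down total, like the patch
        static int threadcounts[] = { 1, 2, 4, 8, 1<<30 };
        for (int i = 0; threadcounts[i] <= numthreads; ++i) {
            int nt = wedge ? threadcounts[i] : numthreads;
            int its = iterations / nt;
            double t = time_trial (nt, its);
            std::printf ("%2d\t%5.2fs\t(%d iters/thread)\n", nt, t, its);
            if (! wedge)
                break;    // don't loop if we're not wedging
        }
        return 0;
    }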
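
The imagecache_pvt.h hunk touches the lock-free statistics idiom: read the
current value of a shared double, compute old + incr, and retry a 64-bit
compare-and-swap on its bit pattern until no other thread interfered.  Here is a
self-contained sketch of that idiom using GCC's __sync builtins rather than the
TBB/OIIO compare_and_swap wrappers the real code goes through; the function name
atomic_add_double is illustrative.

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    static void
    atomic_add_double (volatile double *stat, double incr)
    {
        double oldval, newval;
        uint64_t oldbits, newbits;
        do {
            oldval = *stat;              // snapshot the current value
            newval = oldval + incr;      // value we want to install
            memcpy (&oldbits, &oldval, sizeof(oldbits));
            memcpy (&newbits, &newval, sizeof(newbits));
            // The swap succeeds only if *stat still holds the bit pattern we
            // snapshotted, i.e. nobody else updated it in the meantime.
        } while (! __sync_bool_compare_and_swap ((volatile uint64_t *)stat,
                                                 oldbits, newbits));
    }

    int main ()
    {
        volatile double stat = 0.0;
        for (int i = 0; i < 10; ++i)
            atomic_add_double (&stat, 0.5);
        printf ("stat = %g\n", stat);    // prints 5
        return 0;
    }

Comparing bit patterns rather than floating-point values is what makes the retry
loop exact, and it is why USE_TBB_ATOMIC (not the spinlock macro) is the right
guard for that hunk.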