OpenImageIO/oiio-spinlocks.patch


From a58d0ff4935ef14f32f01d4de362bba242f07e0c Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 4 May 2013 10:22:12 -0700
Subject: [PATCH] spinlock tweaks that finally make it as good or better than
TBB.
---
src/include/thread.h | 89 ++++++++++++++++--------------------
src/libOpenImageIO/atomic_test.cpp | 9 ++--
src/libOpenImageIO/spinlock_test.cpp | 22 +++++++--
src/libtexture/imagecache_pvt.h | 2 +-
4 files changed, 62 insertions(+), 60 deletions(-)
diff --git a/src/include/thread.h b/src/include/thread.h
index 28645fc..2cd03c1 100644
--- a/src/include/thread.h
+++ b/src/include/thread.h
@@ -78,16 +78,22 @@
// Some day, we hope this is all replaced by use of std::atomic<>.
#if USE_TBB
# include <tbb/atomic.h>
- using tbb::atomic;
# include <tbb/spin_mutex.h>
+# define USE_TBB_ATOMIC 1
+# define USE_TBB_SPINLOCK 1
+#else
+# define USE_TBB_ATOMIC 0
+# define USE_TBB_SPINLOCK 0
#endif
+
#if defined(_MSC_VER) && !USE_TBB
# include <windows.h>
# include <winbase.h>
# pragma intrinsic (_InterlockedExchangeAdd)
# pragma intrinsic (_InterlockedCompareExchange)
# pragma intrinsic (_InterlockedCompareExchange64)
+# pragma intrinsic (_ReadWriteBarrier)
# if defined(_WIN64)
# pragma intrinsic(_InterlockedExchangeAdd64)
# endif
@@ -105,10 +111,6 @@
# endif
#endif
-#ifdef __APPLE__
-# include <libkern/OSAtomic.h>
-#endif
-
#if defined(__GNUC__) && (defined(_GLIBCXX_ATOMIC_BUILTINS) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 401))
#if !defined(__FreeBSD__) || defined(__x86_64__)
#define USE_GCC_ATOMICS
@@ -230,9 +232,6 @@ class thread_specific_ptr {
#elif USE_TBB
atomic<int> *a = (atomic<int> *)at;
return a->fetch_and_add (x);
-#elif defined(no__APPLE__)
- // Apple, not inline for Intel (only PPC?)
- return OSAtomicAdd32Barrier (x, at) - x;
#elif defined(_MSC_VER)
// Windows
return _InterlockedExchangeAdd ((volatile LONG *)at, x);
@@ -251,9 +250,6 @@ class thread_specific_ptr {
#elif USE_TBB
atomic<long long> *a = (atomic<long long> *)at;
return a->fetch_and_add (x);
-#elif defined(no__APPLE__)
- // Apple, not inline for Intel (only PPC?)
- return OSAtomicAdd64Barrier (x, at) - x;
#elif defined(_MSC_VER)
// Windows
# if defined(_WIN64)
@@ -282,8 +278,6 @@ class thread_specific_ptr {
#elif USE_TBB
atomic<int> *a = (atomic<int> *)at;
return a->compare_and_swap (newval, compareval) == newval;
-#elif defined(no__APPLE__)
- return OSAtomicCompareAndSwap32Barrier (compareval, newval, at);
#elif defined(_MSC_VER)
return (_InterlockedCompareExchange ((volatile LONG *)at, newval, compareval) == compareval);
#else
@@ -301,8 +295,6 @@ class thread_specific_ptr {
#elif USE_TBB
atomic<long long> *a = (atomic<long long> *)at;
return a->compare_and_swap (newval, compareval) == newval;
-#elif defined(no__APPLE__)
- return OSAtomicCompareAndSwap64Barrier (compareval, newval, at);
#elif defined(_MSC_VER)
return (_InterlockedCompareExchange64 ((volatile LONGLONG *)at, newval, compareval) == compareval);
#else
@@ -317,9 +309,7 @@ class thread_specific_ptr {
inline void
yield ()
{
-#if USE_TBB
- __TBB_Yield ();
-#elif defined(__GNUC__)
+#if defined(__GNUC__)
sched_yield ();
#elif defined(_MSC_VER)
SwitchToThread ();
@@ -334,12 +324,12 @@ class thread_specific_ptr {
inline void
pause (int delay)
{
-#if USE_TBB
- __TBB_Pause(delay);
-#elif defined(__GNUC__)
+#if defined(__GNUC__)
for (int i = 0; i < delay; ++i) {
__asm__ __volatile__("pause;");
}
+#elif USE_TBB
+ __TBB_Pause(delay);
#elif defined(_MSC_VER)
for (int i = 0; i < delay; ++i) {
#if defined (_WIN64)
@@ -369,14 +359,17 @@ class atomic_backoff {
yield();
}
}
+
private:
int m_count;
};
-#if (! USE_TBB)
-// If we're not using TBB, we need to define our own atomic<>.
+#if USE_TBB_ATOMIC
+using tbb::atomic;
+#else
+// If we're not using TBB's atomic, we need to define our own atomic<>.
/// Atomic integer. Increment, decrement, add, and subtract in a
@@ -456,7 +449,7 @@ class atomic {
};
-#endif /* ! USE_TBB */
+#endif /* ! USE_TBB_ATOMIC */
#ifdef NOTHREADS
@@ -478,7 +471,7 @@ class atomic {
typedef null_mutex spin_mutex;
typedef null_lock<spin_mutex> spin_lock;
-#elif USE_TBB
+#elif USE_TBB_SPINLOCK
// Use TBB's spin locks
typedef tbb::spin_mutex spin_mutex;
@@ -529,63 +522,61 @@ class spin_mutex {
/// Acquire the lock, spin until we have it.
///
void lock () {
-#if defined(no__APPLE__)
- // OS X has dedicated spin lock routines, may as well use them.
- OSSpinLockLock ((OSSpinLock *)&m_locked);
-#else
// To avoid spinning too tightly, we use the atomic_backoff to
// provide increasingly longer pauses, and if the lock is under
// lots of contention, eventually yield the timeslice.
atomic_backoff backoff;
+
// Try to get ownership of the lock. Through experimentation, we
// found that OIIO_UNLIKELY makes this just a bit faster on
// gcc x86/x86_64 systems.
while (! OIIO_UNLIKELY(try_lock())) {
do {
backoff();
- } while (*(volatile int *)&m_locked);
+ } while (m_locked);
+
// The full try_lock() involves a compare_and_swap, which
// writes memory, and that will lock the bus. But a normal
// read of m_locked will let us spin until the value
// changes, without locking the bus. So it's faster to
// check in this manner until the mutex appears to be free.
}
-#endif
}
/// Release the lock that we hold.
///
void unlock () {
-#if defined(no__APPLE__)
- OSSpinLockUnlock ((OSSpinLock *)&m_locked);
-#elif defined(__GNUC__)
- // GCC gives us an intrinsic that is even better, an atomic
- // assignment of 0 with "release" barrier semantics.
- __sync_lock_release ((volatile int *)&m_locked);
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+ // Fastest way to do it is with a store with "release" semantics
+ __asm__ __volatile__("": : :"memory");
+ m_locked = 0;
+ // N.B. GCC gives us an intrinsic that is even better, an atomic
+ // assignment of 0 with "release" barrier semantics:
+ // __sync_lock_release (&m_locked);
+ // But empirically we found it not as performant as the above.
+#elif defined(_MSC_VER)
+ _ReadWriteBarrier();
+ m_locked = 0;
#else
// Otherwise, just assign zero to the atomic (but that's a full
// memory barrier).
- m_locked = 0;
+ *(atomic_int *)&m_locked = 0;
#endif
}
/// Try to acquire the lock. Return true if we have it, false if
/// somebody else is holding the lock.
bool try_lock () {
-#if defined(no__APPLE__)
- return OSSpinLockTry ((OSSpinLock *)&m_locked);
-#else
-# if USE_TBB
+#if USE_TBB_ATOMIC
// TBB's compare_and_swap returns the original value
- return m_locked.compare_and_swap (0, 1) == 0;
-# elif defined(__GNUC__)
+ return (*(atomic_int *)&m_locked).compare_and_swap (0, 1) == 0;
+#elif defined(__GNUC__)
// GCC gives us an intrinsic that is even better -- an atomic
// exchange with "acquire" barrier semantics.
- return __sync_lock_test_and_set ((volatile int *)&m_locked, 1) == 0;
-# else
+ return __sync_lock_test_and_set (&m_locked, 1) == 0;
+#else
// Our compare_and_swap returns true if it swapped
- return m_locked.bool_compare_and_swap (0, 1);
-# endif
+ return atomic_compare_and_exchange (&m_locked, 0, 1);
#endif
}
@@ -603,7 +594,7 @@ class spin_mutex {
};
private:
- atomic_int m_locked; ///< Atomic counter is zero if nobody holds the lock
+ volatile int m_locked; ///< Atomic counter is zero if nobody holds the lock
};
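
Taken together, the thread.h hunks above amount to a hand-rolled test-and-test-and-set spinlock: try_lock() does an atomic exchange with acquire semantics, lock() spins on a plain read of m_locked (with exponential pause backoff, then a yield) so the bus is only locked once the mutex looks free, and unlock() is a compiler barrier followed by a plain store, which is a release store on x86. A minimal standalone sketch of that pattern, assuming GCC on x86/x86_64 -- the class name and backoff constants here are illustrative, not the exact OIIO code:

    // Sketch only: a test-and-test-and-set spinlock with pause backoff,
    // in the spirit of the patched spin_mutex (GCC, x86/x86_64 assumed;
    // names and constants are illustrative).
    #include <sched.h>

    class ttas_spin_mutex {
    public:
        ttas_spin_mutex () : m_locked(0) { }

        bool try_lock () {
            // Atomic exchange with "acquire" semantics; 0 means we got it.
            return __sync_lock_test_and_set (&m_locked, 1) == 0;
        }

        void lock () {
            int delay = 1;
            while (! try_lock()) {
                // Spin on a plain read: no bus lock until it looks free.
                do {
                    if (delay <= 16) {
                        for (int i = 0; i < delay; ++i)
                            __asm__ __volatile__("pause;");
                        delay *= 2;            // exponential backoff
                    } else {
                        sched_yield ();        // heavy contention: yield timeslice
                    }
                } while (m_locked);
            }
        }

        void unlock () {
            // Compiler barrier + plain store: a "release" store on x86.
            __asm__ __volatile__("" : : : "memory");
            m_locked = 0;
        }

    private:
        volatile int m_locked;
    };

The design point the patch comments make is that the exchange in try_lock() writes memory and locks the bus, so the inner spin only re-attempts it once the cheap cached read of m_locked suggests the lock has been released.
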
diff --git a/src/libOpenImageIO/atomic_test.cpp b/src/libOpenImageIO/atomic_test.cpp
index 2c1e807..42d469a 100644
--- a/src/libOpenImageIO/atomic_test.cpp
+++ b/src/libOpenImageIO/atomic_test.cpp
@@ -49,7 +49,7 @@
// and decrementing the crap out of it, and make sure it has the right
// value at the end.
-static int iterations = 160000000;
+static int iterations = 40000000;
static int numthreads = 16;
static int ntrials = 1;
static bool verbose = false;
@@ -184,16 +184,15 @@ int main (int argc, char *argv[])
static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128, 1024, 1<<30 };
for (int i = 0; threadcounts[i] <= numthreads; ++i) {
- int nt = threadcounts[i];
+ int nt = wedge ? threadcounts[i] : numthreads;
int its = iterations/nt;
double range;
double t = time_trial (boost::bind(test_atomics,nt,its),
ntrials, &range);
- std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
- nt, Strutil::timeintervalformat(t),
- t, range, its);
+ std::cout << Strutil::format ("%2d\t%5.1f range %.2f\t(%d iters/thread)\n",
+ nt, t, range, its);
if (! wedge)
break; // don't loop if we're not wedging
}
diff --git a/src/libOpenImageIO/spinlock_test.cpp b/src/libOpenImageIO/spinlock_test.cpp
index 60c192b..64adbce 100644
--- a/src/libOpenImageIO/spinlock_test.cpp
+++ b/src/libOpenImageIO/spinlock_test.cpp
@@ -50,7 +50,7 @@
// accumulated value is equal to iterations*threads, then the spin locks
// worked.
-static int iterations = 160000000;
+static int iterations = 40000000;
static int numthreads = 16;
static int ntrials = 1;
static bool verbose = false;
@@ -58,6 +58,7 @@
static spin_mutex print_mutex; // make the prints not clobber each other
volatile long long accum = 0;
+float faccum = 0;
spin_mutex mymutex;
@@ -71,10 +72,22 @@
std::cout << "thread " << boost::this_thread::get_id()
<< ", accum = " << accum << "\n";
}
+#if 1
for (int i = 0; i < iterations; ++i) {
spin_lock lock (mymutex);
accum += 1;
}
+#else
+ // Alternate one that mixes in some math to make longer lock hold time,
+ // and also more to do between locks. Interesting contrast in timings.
+ float last = 0.0f;
+ for (int i = 0; i < iterations; ++i) {
+ last = fmodf (sinf(last), 1.0f);
+ spin_lock lock (mymutex);
+ accum += 1;
+ faccum = fmod (sinf(faccum+last), 1.0f);
+ }
+#endif
}
@@ -134,16 +147,15 @@ int main (int argc, char *argv[])
static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128, 1024, 1<<30 };
for (int i = 0; threadcounts[i] <= numthreads; ++i) {
- int nt = threadcounts[i];
+ int nt = wedge ? threadcounts[i] : numthreads;
int its = iterations/nt;
double range;
double t = time_trial (boost::bind(test_spinlock,nt,its),
ntrials, &range);
- std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
- nt, Strutil::timeintervalformat(t),
- t, range, its);
+ std::cout << Strutil::format ("%2d\t%5.1f range %.2f\t(%d iters/thread)\n",
+ nt, t, range, its);
if (! wedge)
break; // don't loop if we're not wedging
}
diff --git a/src/libtexture/imagecache_pvt.h b/src/libtexture/imagecache_pvt.h
index 5d29782..3a49616 100644
--- a/src/libtexture/imagecache_pvt.h
+++ b/src/libtexture/imagecache_pvt.h
@@ -1003,7 +1003,7 @@ class ImageCacheImpl : public ImageCache {
newval = oldval + incr;
// Now try to atomically swap it, and repeat until we've
// done it with nobody else interfering.
-# if USE_TBB
+# if USE_TBB_ATOMIC
} while (llstat->compare_and_swap (*llnewval,*lloldval) != *lloldval);
# else
} while (llstat->bool_compare_and_swap (*llnewval,*lloldval));
--
1.8.1.6
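
For reference, the imagecache_pvt.h change only swaps the macro guarding the tail of a compare-and-swap retry loop: the surrounding code computes newval = oldval + incr for a double statistic, views the old and new values as 64-bit integers, and retries the swap until no other thread modified the slot in between. The general shape of that pattern, sketched against GCC's __sync_bool_compare_and_swap rather than OIIO's atomic wrappers (the function name is made up for illustration):

    // Sketch: add a double increment atomically by retrying a 64-bit CAS
    // until no other thread has interfered (GCC builtins assumed).
    #include <stdint.h>
    #include <string.h>

    static void
    atomic_add_double (volatile double *stat, double incr)
    {
        double oldval, newval;
        uint64_t oldbits, newbits;
        do {
            oldval = *stat;                 // snapshot the current value
            newval = oldval + incr;
            memcpy (&oldbits, &oldval, sizeof oldbits);   // reinterpret as bits
            memcpy (&newbits, &newval, sizeof newbits);
            // Retry if *stat changed between the read and the swap attempt.
        } while (! __sync_bool_compare_and_swap ((volatile uint64_t *)stat,
                                                 oldbits, newbits));
    }

As in the patch, the pointer cast relies on the double statistic and its 64-bit integer view occupying the same storage; the memcpy calls are just a safe way to move the value bits into integer form.
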