Update to latest upstream release.

Use new internal atomics as they are now faster than TBB.
2013-05-16 15:48:07 -05:00 · 2013-05-16 15:48:07 -05:00 · 5d225ec202
parent 3a130363bd
commit 5d225ec202
4 changed files with 377 additions and 108 deletions
--- a/589.patch
+++ b/589.patch
@ -0,0 +1,362 @@
+From a58d0ff4935ef14f32f01d4de362bba242f07e0c Mon Sep 17 00:00:00 2001
+From: Larry Gritz <lg@larrygritz.com>
+Date: Sat, 4 May 2013 10:22:12 -0700
+Subject: [PATCH] spinlock tweaks that finally make it as good or better than
+ TBB.
+
+---
+ src/include/thread.h                 | 89 ++++++++++++++++--------------------
+ src/libOpenImageIO/atomic_test.cpp   |  9 ++--
+ src/libOpenImageIO/spinlock_test.cpp | 22 +++++++--
+ src/libtexture/imagecache_pvt.h      |  2 +-
+ 4 files changed, 62 insertions(+), 60 deletions(-)
+
+diff --git a/src/include/thread.h b/src/include/thread.h
+index 28645fc..2cd03c1 100644
+--- a/src/include/thread.h
+++ b/src/include/thread.h
+@@ -78,16 +78,22 @@
+ // Some day, we hope this is all replaced by use of std::atomic<>.
+ #if USE_TBB
+ #  include <tbb/atomic.h>
+-   using tbb::atomic;
+ #  include <tbb/spin_mutex.h>
+#  define USE_TBB_ATOMIC 1
+#  define USE_TBB_SPINLOCK 1
+#else
+#  define USE_TBB_ATOMIC 0
+#  define USE_TBB_SPINLOCK 0
+ #endif
+ 
+
+ #if defined(_MSC_VER) && !USE_TBB
+ #  include <windows.h>
+ #  include <winbase.h>
+ #  pragma intrinsic (_InterlockedExchangeAdd)
+ #  pragma intrinsic (_InterlockedCompareExchange)
+ #  pragma intrinsic (_InterlockedCompareExchange64)
+#  pragma intrinsic (_ReadWriteBarrier)
+ #  if defined(_WIN64)
+ #    pragma intrinsic(_InterlockedExchangeAdd64)
+ #  endif
+@@ -105,10 +111,6 @@
+ #  endif
+ #endif
+ 
+-#ifdef __APPLE__
+-#  include <libkern/OSAtomic.h>
+-#endif
+-
+ #if defined(__GNUC__) && (defined(_GLIBCXX_ATOMIC_BUILTINS) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 401))
+ #if !defined(__FreeBSD__) || defined(__x86_64__)
+ #define USE_GCC_ATOMICS
+@@ -230,9 +232,6 @@ class thread_specific_ptr {
+ #elif USE_TBB
+     atomic<int> *a = (atomic<int> *)at;
+     return a->fetch_and_add (x);
+-#elif defined(no__APPLE__)
+-    // Apple, not inline for Intel (only PPC?)
+-    return OSAtomicAdd32Barrier (x, at) - x;
+ #elif defined(_MSC_VER)
+     // Windows
+     return _InterlockedExchangeAdd ((volatile LONG *)at, x);
+@@ -251,9 +250,6 @@ class thread_specific_ptr {
+ #elif USE_TBB
+     atomic<long long> *a = (atomic<long long> *)at;
+     return a->fetch_and_add (x);
+-#elif defined(no__APPLE__)
+-    // Apple, not inline for Intel (only PPC?)
+-    return OSAtomicAdd64Barrier (x, at) - x;
+ #elif defined(_MSC_VER)
+     // Windows
+ #  if defined(_WIN64)
+@@ -282,8 +278,6 @@ class thread_specific_ptr {
+ #elif USE_TBB
+     atomic<int> *a = (atomic<int> *)at;
+     return a->compare_and_swap (newval, compareval) == newval;
+-#elif defined(no__APPLE__)
+-    return OSAtomicCompareAndSwap32Barrier (compareval, newval, at);
+ #elif defined(_MSC_VER)
+     return (_InterlockedCompareExchange ((volatile LONG *)at, newval, compareval) == compareval);
+ #else
+@@ -301,8 +295,6 @@ class thread_specific_ptr {
+ #elif USE_TBB
+     atomic<long long> *a = (atomic<long long> *)at;
+     return a->compare_and_swap (newval, compareval) == newval;
+-#elif defined(no__APPLE__)
+-    return OSAtomicCompareAndSwap64Barrier (compareval, newval, at);
+ #elif defined(_MSC_VER)
+     return (_InterlockedCompareExchange64 ((volatile LONGLONG *)at, newval, compareval) == compareval);
+ #else
+@@ -317,9 +309,7 @@ class thread_specific_ptr {
+ inline void
+ yield ()
+ {
+-#if USE_TBB
+-    __TBB_Yield ();
+-#elif defined(__GNUC__)
+#if defined(__GNUC__)
+     sched_yield ();
+ #elif defined(_MSC_VER)
+     SwitchToThread ();
+@@ -334,12 +324,12 @@ class thread_specific_ptr {
+ inline void
+ pause (int delay)
+ {
+-#if USE_TBB
+-    __TBB_Pause(delay);
+-#elif defined(__GNUC__)
+#if defined(__GNUC__)
+     for (int i = 0; i < delay; ++i) {
+         __asm__ __volatile__("pause;");
+     }
+#elif USE_TBB
+    __TBB_Pause(delay);
+ #elif defined(_MSC_VER)
+     for (int i = 0; i < delay; ++i) {
+ #if defined (_WIN64)
+@@ -369,14 +359,17 @@ class atomic_backoff {
+             yield();
+         }
+     }
+
+ private:
+     int m_count;
+ };
+ 
+ 
+ 
+-#if (! USE_TBB)
+-// If we're not using TBB, we need to define our own atomic<>.
+#if USE_TBB_ATOMIC
+using tbb::atomic;
+#else
+// If we're not using TBB's atomic, we need to define our own atomic<>.
+ 
+ 
+ /// Atomic integer.  Increment, decrement, add, and subtract in a
+@@ -456,7 +449,7 @@ class atomic {
+ };
+ 
+ 
+-#endif /* ! USE_TBB */
+#endif /* ! USE_TBB_ATOMIC */
+ 
+ 
+ #ifdef NOTHREADS
+@@ -478,7 +471,7 @@ class atomic {
+ typedef null_mutex spin_mutex;
+ typedef null_lock<spin_mutex> spin_lock;
+ 
+-#elif USE_TBB
+#elif USE_TBB_SPINLOCK
+ 
+ // Use TBB's spin locks
+ typedef tbb::spin_mutex spin_mutex;
+@@ -529,63 +522,61 @@ class spin_mutex {
+     /// Acquire the lock, spin until we have it.
+     ///
+     void lock () {
+-#if defined(no__APPLE__)
+-        // OS X has dedicated spin lock routines, may as well use them.
+-        OSSpinLockLock ((OSSpinLock *)&m_locked);
+-#else
+         // To avoid spinning too tightly, we use the atomic_backoff to
+         // provide increasingly longer pauses, and if the lock is under
+         // lots of contention, eventually yield the timeslice.
+         atomic_backoff backoff;
+
+         // Try to get ownership of the lock. Though experimentation, we
+         // found that OIIO_UNLIKELY makes this just a bit faster on 
+         // gcc x86/x86_64 systems.
+         while (! OIIO_UNLIKELY(try_lock())) {
+             do {
+                 backoff();
+-            } while (*(volatile int *)&m_locked);
+            } while (m_locked);
+
+             // The full try_lock() involves a compare_and_swap, which
+             // writes memory, and that will lock the bus.  But a normal
+             // read of m_locked will let us spin until the value
+             // changes, without locking the bus. So it's faster to
+             // check in this manner until the mutex appears to be free.
+         }
+-#endif
+     }
+ 
+     /// Release the lock that we hold.
+     ///
+     void unlock () {
+-#if defined(no__APPLE__)
+-        OSSpinLockUnlock ((OSSpinLock *)&m_locked);
+-#elif defined(__GNUC__)
+-        // GCC gives us an intrinsic that is even better, an atomic
+-        // assignment of 0 with "release" barrier semantics.
+-        __sync_lock_release ((volatile int *)&m_locked);
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+        // Fastest way to do it is with a store with "release" semantics
+        __asm__ __volatile__("": : :"memory");
+        m_locked = 0;
+        // N.B. GCC also gives us an intrinsic for this, an atomic
+        // assignment of 0 with "release" barrier semantics:
+        //  __sync_lock_release (&m_locked);
+        // But empirically we found it not as performant as the above.
+#elif defined(_MSC_VER)
+        _ReadWriteBarrier();
+        m_locked = 0;
+ #else
+         // Otherwise, just assign zero to the atomic (but that's a full 
+         // memory barrier).
+-        m_locked = 0;
+        *(atomic_int *)&m_locked = 0;
+ #endif
+     }
+ 
+     /// Try to acquire the lock.  Return true if we have it, false if
+     /// somebody else is holding the lock.
+     bool try_lock () {
+-#if defined(no__APPLE__)
+-        return OSSpinLockTry ((OSSpinLock *)&m_locked);
+-#else
+-#  if USE_TBB
+#if USE_TBB_ATOMIC
+         // TBB's compare_and_swap returns the original value
+-        return m_locked.compare_and_swap (0, 1) == 0;
+-#  elif defined(__GNUC__)
+        return (*(atomic_int *)&m_locked).compare_and_swap (0, 1) == 0;
+#elif defined(__GNUC__)
+         // GCC gives us an intrinsic that is even better -- an atomic
+         // exchange with "acquire" barrier semantics.
+-        return __sync_lock_test_and_set ((volatile int *)&m_locked, 1) == 0;
+-#  else
+        return __sync_lock_test_and_set (&m_locked, 1) == 0;
+#else
+         // Our compare_and_swap returns true if it swapped
+-        return m_locked.bool_compare_and_swap (0, 1);
+-#  endif
+        return atomic_compare_and_exchange (&m_locked, 0, 1);
+ #endif
+     }
+ 
+@@ -603,7 +594,7 @@ class spin_mutex {
+     };
+ 
+ private:
+-    atomic_int m_locked;  ///< Atomic counter is zero if nobody holds the lock
+    volatile int m_locked;  ///< Atomic counter is zero if nobody holds the lock
+ };
+ 
+ 
+diff --git a/src/libOpenImageIO/atomic_test.cpp b/src/libOpenImageIO/atomic_test.cpp
+index 2c1e807..42d469a 100644
+--- a/src/libOpenImageIO/atomic_test.cpp
+++ b/src/libOpenImageIO/atomic_test.cpp
+@@ -49,7 +49,7 @@
+ // and decrementing the crap out of it, and make sure it has the right
+ // value at the end.
+ 
+-static int iterations = 160000000;
+static int iterations = 40000000;
+ static int numthreads = 16;
+ static int ntrials = 1;
+ static bool verbose = false;
+@@ -184,16 +184,15 @@ int main (int argc, char *argv[])
+ 
+     static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128, 1024, 1<<30 };
+     for (int i = 0; threadcounts[i] <= numthreads; ++i) {
+-        int nt = threadcounts[i];
+        int nt = wedge ? threadcounts[i] : numthreads;
+         int its = iterations/nt;
+ 
+         double range;
+         double t = time_trial (boost::bind(test_atomics,nt,its),
+                                ntrials, &range);
+ 
+-        std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
+-                                      nt, Strutil::timeintervalformat(t),
+-                                      t, range, its);
+        std::cout << Strutil::format ("%2d\t%5.1f   range %.2f\t(%d iters/thread)\n",
+                                      nt, t, range, its);
+         if (! wedge)
+             break;    // don't loop if we're not wedging
+     }
+diff --git a/src/libOpenImageIO/spinlock_test.cpp b/src/libOpenImageIO/spinlock_test.cpp
+index 60c192b..64adbce 100644
+--- a/src/libOpenImageIO/spinlock_test.cpp
+++ b/src/libOpenImageIO/spinlock_test.cpp
+@@ -50,7 +50,7 @@
+ // accumulated value is equal to iterations*threads, then the spin locks
+ // worked.
+ 
+-static int iterations = 160000000;
+static int iterations = 40000000;
+ static int numthreads = 16;
+ static int ntrials = 1;
+ static bool verbose = false;
+@@ -58,6 +58,7 @@
+ 
+ static spin_mutex print_mutex;  // make the prints not clobber each other
+ volatile long long accum = 0;
+float faccum = 0;
+ spin_mutex mymutex;
+ 
+ 
+@@ -71,10 +72,22 @@
+         std::cout << "thread " << boost::this_thread::get_id() 
+                   << ", accum = " << accum << "\n";
+     }
+#if 1
+     for (int i = 0;  i < iterations;  ++i) {
+         spin_lock lock (mymutex);
+         accum += 1;
+     }
+#else
+    // Alternate one that mixes in some math to make longer lock hold time,
+    // and also more to do between locks.  Interesting contrast in timings.
+    float last = 0.0f;
+    for (int i = 0;  i < iterations;  ++i) {
+        last = fmodf (sinf(last), 1.0f);
+        spin_lock lock (mymutex);
+        accum += 1;
+        faccum = fmod (sinf(faccum+last), 1.0f);
+    }
+#endif
+ }
+ 
+ 
+@@ -134,16 +147,15 @@ int main (int argc, char *argv[])
+ 
+     static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128, 1024, 1<<30 };
+     for (int i = 0; threadcounts[i] <= numthreads; ++i) {
+-        int nt = threadcounts[i];
+        int nt = wedge ? threadcounts[i] : numthreads;
+         int its = iterations/nt;
+ 
+         double range;
+         double t = time_trial (boost::bind(test_spinlock,nt,its),
+                                ntrials, &range);
+ 
+-        std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
+-                                      nt, Strutil::timeintervalformat(t),
+-                                      t, range, its);
+        std::cout << Strutil::format ("%2d\t%5.1f   range %.2f\t(%d iters/thread)\n",
+                                      nt, t, range, its);
+         if (! wedge)
+             break;    // don't loop if we're not wedging
+     }
+diff --git a/src/libtexture/imagecache_pvt.h b/src/libtexture/imagecache_pvt.h
+index 5d29782..3a49616 100644
+--- a/src/libtexture/imagecache_pvt.h
+++ b/src/libtexture/imagecache_pvt.h
+@@ -1003,7 +1003,7 @@ class ImageCacheImpl : public ImageCache {
+             newval = oldval + incr;
+             // Now try to atomically swap it, and repeat until we've
+             // done it with nobody else interfering.
+-#  if USE_TBB
+#  if USE_TBB_ATOMIC
+         } while (llstat->compare_and_swap (*llnewval,*lloldval) != *lloldval);
+ #  else
+         } while (llstat->bool_compare_and_swap (*llnewval,*lloldval));
+-- 
+1.8.1.6
+
--- a/OpenImageIO-1.1.2-use_external_tbb.patch
+++ b/OpenImageIO-1.1.2-use_external_tbb.patch
@ -1,68 +0,0 @@
-diff -Naur oiio-Release-1.1.2.orig/src/CMakeLists.txt oiio-Release-1.1.2/src/CMakeLists.txt
--- oiio-Release-1.1.2.orig/src/CMakeLists.txt	2012-12-05 12:46:56.000000000 -0600
-+++ oiio-Release-1.1.2/src/CMakeLists.txt	2013-01-02 15:52:43.941560982 -0600
-@@ -83,6 +83,8 @@
- set (PYTHON_VERSION 2.6)
- set (USE_EXTERNAL_PUGIXML OFF CACHE BOOL
-      "Use an externally built shared library version of the pugixml library")
-+set (USE_EXTERNAL_TBB OFF CACHE BOOL
-+     "Use system TBB library instead of bundled.")
- 
- set (SOVERSION ${OIIO_VERSION_MAJOR}.${OIIO_VERSION_MINOR}
-      CACHE STRING "Set the SO version in the SO name of the output library")
-diff -Naur oiio-Release-1.1.2.orig/src/include/CMakeLists.txt oiio-Release-1.1.2/src/include/CMakeLists.txt
--- oiio-Release-1.1.2.orig/src/include/CMakeLists.txt	2012-12-05 12:46:56.000000000 -0600
-+++ oiio-Release-1.1.2/src/include/CMakeLists.txt	2013-01-02 15:52:43.940561015 -0600
-@@ -22,7 +22,7 @@
- install (FILES ${public_headers} DESTINATION ${INCLUDE_INSTALL_DIR}
-          COMPONENT developer)
- 
-if (USE_TBB)
-+if (USE_TBB AND NOT USE_EXTERNAL_TBB)
-     install (DIRECTORY tbb DESTINATION ${INCLUDE_INSTALL_DIR}
-              COMPONENT developer)
- endif ()
-diff -Naur oiio-Release-1.1.2.orig/src/libOpenImageIO/CMakeLists.txt oiio-Release-1.1.2/src/libOpenImageIO/CMakeLists.txt
--- oiio-Release-1.1.2.orig/src/libOpenImageIO/CMakeLists.txt	2012-12-05 12:46:56.000000000 -0600
-+++ oiio-Release-1.1.2/src/libOpenImageIO/CMakeLists.txt	2013-01-02 15:52:43.941560982 -0600
-@@ -62,7 +62,13 @@
- endif ()
- 
- # Include our own TBB if using it
-if (USE_TBB)
-+if (USE_TBB AND USE_EXTERNAL_TBB)
-+    message (STATUS "System TBB library will be used.")
-+    find_package (TBB REQUIRED)
-+    include_directories (${TBB_INCLUDE_DIRS})
-+    set (libOpenImageIO_srcs ${libOpenImageIO_srcs})
-+elseif (USE_TBB AND NOT USE_EXTERNAL_TBB)
-+    message (STATUS "Built-in TBB library will be used.")
-     set (libOpenImageIO_srcs ${libOpenImageIO_srcs} ../libutil/tbb_misc.cpp)
- endif ()
- 
-@@ -202,7 +208,11 @@
-                            ${VISIBILITY_COMMAND} ${VISIBILITY_MAP_COMMAND}
-                            ${Boost_LIBRARIES})
- 
-
-+# Link against system TBB library if specified
-+if (USE_TBB AND USE_EXTERNAL_TBB)
-+    message (STATUS "Linking TBB: ${TBB_LIBRARIES}")
-+    target_link_libraries (OpenImageIO ${TBB_LIBRARIES})
-+endif ()
- 
- # Include OpenColorIO if using it
- if (USE_OCIO AND OCIO_FOUND)
-diff -Naur oiio-Release-1.1.2.orig/src/libutil/tbb_misc.cpp oiio-Release-1.1.2/src/libutil/tbb_misc.cpp
--- oiio-Release-1.1.2.orig/src/libutil/tbb_misc.cpp	2012-12-05 12:46:56.000000000 -0600
-+++ oiio-Release-1.1.2/src/libutil/tbb_misc.cpp	2013-01-02 15:53:10.403678615 -0600
-@@ -30,8 +30,7 @@
- // an executing program.
- 
- #include "tbb/tbb_stddef.h"
-// Out-of-line TBB assertion handling routines are instantiated here.
-#include "tbb/tbb_assert_impl.h"
-+#include "tbb/tbb_machine.h"
- 
- #include "tbb/tbb_misc.h"
- #include <cstdio>
--- a/OpenImageIO-1.1.3-SHA1_undef_ref.patch
+++ b/OpenImageIO-1.1.3-SHA1_undef_ref.patch
@ -1,14 +0,0 @@
-diff -Naur oiio-Release-1.1.3.orig/src/libutil/SHA1.cpp oiio-Release-1.1.3/src/libutil/SHA1.cpp
--- oiio-Release-1.1.3.orig/src/libutil/SHA1.cpp	2013-01-09 19:13:37.000000000 -0600
-+++ oiio-Release-1.1.3/src/libutil/SHA1.cpp	2013-01-15 07:53:27.479132623 -0600
-@@ -8,9 +8,9 @@
- 
- // If compiling with MFC, you might want to add #include "StdAfx.h"
- 
-+#include "SHA1.h"
- #include "hash.h"
- #include "dassert.h"
-#include "SHA1.h"
- 
- #ifdef SHA1_UTILITY_FUNCTIONS
- #define SHA1_MAX_FILE_BUFFER 8000
--- a/OpenImageIO.spec
+++ b/OpenImageIO.spec
@ -1,28 +1,20 @@
-%global githash1 g0b78dec
-%global githash2 0d48631
-%global githash3 9bf4356
-
 Name:           OpenImageIO
-Version:        1.1.3
-Release:        7%{?dist}
+Version:        1.1.10
+Release:        1%{?dist}
 Summary:        Library for reading and writing images

 Group:          Development/Libraries
 License:        BSD
 URL:            https://sites.google.com/site/openimageio/home

-#Source0:        https://download.github.com/%{name}-oiio-Release-%{version}-0-%{githash1}.tar.gz
 Source0:        https://download.github.com/oiio-Release-%{version}.tar.gz
 # Images for test suite
-#Source1:        %{name}-oiio-images-%{githash3}.tar.gz
+Source1:        oiio-images.tar.gz
 Source101:      FindTBB.cmake

-Patch0:         OpenImageIO-1.1.2-use_external_tbb.patch
-Patch2:         OpenImageIO-ppc.patch
-# https://github.com/OpenImageIO/oiio/issues/473
-Patch3:         OpenImageIO-1.1.3-SHA1_undef_ref.patch
-# https://github.com/The11ers/oiio/commit/010754d2a9b4b41f658a7752046c9217abaf98fc
-Patch4:         oiio-arm.patch
+Patch0:         OpenImageIO-ppc.patch
+Patch1:         589.patch
+Patch2:         oiio-arm.patch

 BuildRequires:  cmake txt2man
 BuildRequires:  qt4-devel
@ -87,14 +79,12 @@ Development files for package %{name}


 %prep
-#setup -q -n %{name}-oiio-%{githash2}
 %setup -q -n oiio-Release-%{version}
-%patch0 -p1 -b .exttbb
-%ifarch ppc %{power64}
-%patch2 -p1 -b .ppc
+%ifarch ppc ppc64
+%patch0 -p1 -b .ppc
 %endif
-%patch3 -p1 -b .sha1
-%patch4 -p1 -b .arm
+%patch1 -p1 -b .spinlocks
+%patch2 -p1 -b .arm

 # Install FindTBB.cmake
 install %{SOURCE101} src/cmake/modules/
@ -122,13 +112,8 @@ rm -rf build/linux && mkdir -p build/linux && pushd build/linux
       -DPYLIB_INSTALL_DIR:PATH=%{python_sitearch} \
       -DINSTALL_DOCS:BOOL=FALSE \
       -DUSE_EXTERNAL_PUGIXML:BOOL=TRUE \
-%ifarch x86_64
-       -DUSE_TBB:BOOL=TRUE \
-       -DUSE_EXTERNAL_TBB=TRUE \
-%else
       -DUSE_TBB:BOOL=FALSE \
-%endif
-%ifarch ppc %{power64}
+%ifarch ppc ppc64
       -DNOTHREADS:BOOL=TRUE \
 %endif
       ../../src
@ -150,6 +135,7 @@ cp -a doc/*.1 %{buildroot}%{_mandir}/man1


 %check
+# Not all tests pass on Linux, so the test suite is left disabled.
 #pushd build/linux && make test


@ -175,6 +161,9 @@ cp -a doc/*.1 %{buildroot}%{_mandir}/man1


 %changelog
+* Tue Apr 23 2013 Richard Shaw <hobbes1069@gmail.com> - 1.1.10-1
+- Update to latest upstream release.
+
 * Sun Mar 31 2013 Peter Robinson <pbrobinson@fedoraproject.org> 1.1.3-7
 - Add upstream patch to fix FTBFS on ARM (RHBZ 924932)