Update to latest upstream release.

Use new internal atomics as they are now faster than TBB.
This commit is contained in:
Richard M. Shaw 2013-05-16 15:48:07 -05:00
parent 3a130363bd
commit 5d225ec202
4 changed files with 377 additions and 108 deletions

362
589.patch Normal file
View File

@ -0,0 +1,362 @@
From a58d0ff4935ef14f32f01d4de362bba242f07e0c Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 4 May 2013 10:22:12 -0700
Subject: [PATCH] spinlock tweaks that finally make it as good or better than
TBB.
---
src/include/thread.h | 89 ++++++++++++++++--------------------
src/libOpenImageIO/atomic_test.cpp | 9 ++--
src/libOpenImageIO/spinlock_test.cpp | 22 +++++++--
src/libtexture/imagecache_pvt.h | 2 +-
4 files changed, 62 insertions(+), 60 deletions(-)
diff --git a/src/include/thread.h b/src/include/thread.h
index 28645fc..2cd03c1 100644
--- a/src/include/thread.h
+++ b/src/include/thread.h
@@ -78,16 +78,22 @@
// Some day, we hope this is all replaced by use of std::atomic<>.
#if USE_TBB
# include <tbb/atomic.h>
- using tbb::atomic;
# include <tbb/spin_mutex.h>
+# define USE_TBB_ATOMIC 1
+# define USE_TBB_SPINLOCK 1
+#else
+# define USE_TBB_ATOMIC 0
+# define USE_TBB_SPINLOCK 0
#endif
+
#if defined(_MSC_VER) && !USE_TBB
# include <windows.h>
# include <winbase.h>
# pragma intrinsic (_InterlockedExchangeAdd)
# pragma intrinsic (_InterlockedCompareExchange)
# pragma intrinsic (_InterlockedCompareExchange64)
+# pragma intrinsic (_ReadWriteBarrier)
# if defined(_WIN64)
# pragma intrinsic(_InterlockedExchangeAdd64)
# endif
@@ -105,10 +111,6 @@
# endif
#endif
-#ifdef __APPLE__
-# include <libkern/OSAtomic.h>
-#endif
-
#if defined(__GNUC__) && (defined(_GLIBCXX_ATOMIC_BUILTINS) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 401))
#if !defined(__FreeBSD__) || defined(__x86_64__)
#define USE_GCC_ATOMICS
@@ -230,9 +232,6 @@ class thread_specific_ptr {
#elif USE_TBB
atomic<int> *a = (atomic<int> *)at;
return a->fetch_and_add (x);
-#elif defined(no__APPLE__)
- // Apple, not inline for Intel (only PPC?)
- return OSAtomicAdd32Barrier (x, at) - x;
#elif defined(_MSC_VER)
// Windows
return _InterlockedExchangeAdd ((volatile LONG *)at, x);
@@ -251,9 +250,6 @@ class thread_specific_ptr {
#elif USE_TBB
atomic<long long> *a = (atomic<long long> *)at;
return a->fetch_and_add (x);
-#elif defined(no__APPLE__)
- // Apple, not inline for Intel (only PPC?)
- return OSAtomicAdd64Barrier (x, at) - x;
#elif defined(_MSC_VER)
// Windows
# if defined(_WIN64)
@@ -282,8 +278,6 @@ class thread_specific_ptr {
#elif USE_TBB
atomic<int> *a = (atomic<int> *)at;
return a->compare_and_swap (newval, compareval) == newval;
-#elif defined(no__APPLE__)
- return OSAtomicCompareAndSwap32Barrier (compareval, newval, at);
#elif defined(_MSC_VER)
return (_InterlockedCompareExchange ((volatile LONG *)at, newval, compareval) == compareval);
#else
@@ -301,8 +295,6 @@ class thread_specific_ptr {
#elif USE_TBB
atomic<long long> *a = (atomic<long long> *)at;
return a->compare_and_swap (newval, compareval) == newval;
-#elif defined(no__APPLE__)
- return OSAtomicCompareAndSwap64Barrier (compareval, newval, at);
#elif defined(_MSC_VER)
return (_InterlockedCompareExchange64 ((volatile LONGLONG *)at, newval, compareval) == compareval);
#else
@@ -317,9 +309,7 @@ class thread_specific_ptr {
inline void
yield ()
{
-#if USE_TBB
- __TBB_Yield ();
-#elif defined(__GNUC__)
+#if defined(__GNUC__)
sched_yield ();
#elif defined(_MSC_VER)
SwitchToThread ();
@@ -334,12 +324,12 @@ class thread_specific_ptr {
inline void
pause (int delay)
{
-#if USE_TBB
- __TBB_Pause(delay);
-#elif defined(__GNUC__)
+#if defined(__GNUC__)
for (int i = 0; i < delay; ++i) {
__asm__ __volatile__("pause;");
}
+#elif USE_TBB
+ __TBB_Pause(delay);
#elif defined(_MSC_VER)
for (int i = 0; i < delay; ++i) {
#if defined (_WIN64)
@@ -369,14 +359,17 @@ class atomic_backoff {
yield();
}
}
+
private:
int m_count;
};
-#if (! USE_TBB)
-// If we're not using TBB, we need to define our own atomic<>.
+#if USE_TBB_ATOMIC
+using tbb::atomic;
+#else
+// If we're not using TBB's atomic, we need to define our own atomic<>.
/// Atomic integer. Increment, decrement, add, and subtract in a
@@ -456,7 +449,7 @@ class atomic {
};
-#endif /* ! USE_TBB */
+#endif /* ! USE_TBB_ATOMIC */
#ifdef NOTHREADS
@@ -478,7 +471,7 @@ class atomic {
typedef null_mutex spin_mutex;
typedef null_lock<spin_mutex> spin_lock;
-#elif USE_TBB
+#elif USE_TBB_SPINLOCK
// Use TBB's spin locks
typedef tbb::spin_mutex spin_mutex;
@@ -529,63 +522,61 @@ class spin_mutex {
/// Acquire the lock, spin until we have it.
///
void lock () {
-#if defined(no__APPLE__)
- // OS X has dedicated spin lock routines, may as well use them.
- OSSpinLockLock ((OSSpinLock *)&m_locked);
-#else
// To avoid spinning too tightly, we use the atomic_backoff to
// provide increasingly longer pauses, and if the lock is under
// lots of contention, eventually yield the timeslice.
atomic_backoff backoff;
+
// Try to get ownership of the lock. Though experimentation, we
// found that OIIO_UNLIKELY makes this just a bit faster on
// gcc x86/x86_64 systems.
while (! OIIO_UNLIKELY(try_lock())) {
do {
backoff();
- } while (*(volatile int *)&m_locked);
+ } while (m_locked);
+
// The full try_lock() involves a compare_and_swap, which
// writes memory, and that will lock the bus. But a normal
// read of m_locked will let us spin until the value
// changes, without locking the bus. So it's faster to
// check in this manner until the mutex appears to be free.
}
-#endif
}
/// Release the lock that we hold.
///
void unlock () {
-#if defined(no__APPLE__)
- OSSpinLockUnlock ((OSSpinLock *)&m_locked);
-#elif defined(__GNUC__)
- // GCC gives us an intrinsic that is even better, an atomic
- // assignment of 0 with "release" barrier semantics.
- __sync_lock_release ((volatile int *)&m_locked);
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+ // Fastest way to do it is with a store with "release" semantics
+ __asm__ __volatile__("": : :"memory");
+ m_locked = 0;
+ // N.B. GCC also provides an intrinsic for this, an atomic
+ // assignment of 0 with "release" barrier semantics:
+ // __sync_lock_release (&m_locked);
+ // But empirically we found it not as performant as the above.
+#elif defined(_MSC_VER)
+ _ReadWriteBarrier();
+ m_locked = 0;
#else
// Otherwise, just assign zero to the atomic (but that's a full
// memory barrier).
- m_locked = 0;
+ *(atomic_int *)&m_locked = 0;
#endif
}
/// Try to acquire the lock. Return true if we have it, false if
/// somebody else is holding the lock.
bool try_lock () {
-#if defined(no__APPLE__)
- return OSSpinLockTry ((OSSpinLock *)&m_locked);
-#else
-# if USE_TBB
+#if USE_TBB_ATOMIC
// TBB's compare_and_swap returns the original value
- return m_locked.compare_and_swap (0, 1) == 0;
-# elif defined(__GNUC__)
+ return (*(atomic_int *)&m_locked).compare_and_swap (0, 1) == 0;
+#elif defined(__GNUC__)
// GCC gives us an intrinsic that is even better -- an atomic
// exchange with "acquire" barrier semantics.
- return __sync_lock_test_and_set ((volatile int *)&m_locked, 1) == 0;
-# else
+ return __sync_lock_test_and_set (&m_locked, 1) == 0;
+#else
// Our compare_and_swap returns true if it swapped
- return m_locked.bool_compare_and_swap (0, 1);
-# endif
+ return atomic_compare_and_exchange (&m_locked, 0, 1);
#endif
}
@@ -603,7 +594,7 @@ class spin_mutex {
};
private:
- atomic_int m_locked; ///< Atomic counter is zero if nobody holds the lock
+ volatile int m_locked; ///< Atomic counter is zero if nobody holds the lock
};
diff --git a/src/libOpenImageIO/atomic_test.cpp b/src/libOpenImageIO/atomic_test.cpp
index 2c1e807..42d469a 100644
--- a/src/libOpenImageIO/atomic_test.cpp
+++ b/src/libOpenImageIO/atomic_test.cpp
@@ -49,7 +49,7 @@
// and decrementing the crap out of it, and make sure it has the right
// value at the end.
-static int iterations = 160000000;
+static int iterations = 40000000;
static int numthreads = 16;
static int ntrials = 1;
static bool verbose = false;
@@ -184,16 +184,15 @@ int main (int argc, char *argv[])
static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128, 1024, 1<<30 };
for (int i = 0; threadcounts[i] <= numthreads; ++i) {
- int nt = threadcounts[i];
+ int nt = wedge ? threadcounts[i] : numthreads;
int its = iterations/nt;
double range;
double t = time_trial (boost::bind(test_atomics,nt,its),
ntrials, &range);
- std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
- nt, Strutil::timeintervalformat(t),
- t, range, its);
+ std::cout << Strutil::format ("%2d\t%5.1f range %.2f\t(%d iters/thread)\n",
+ nt, t, range, its);
if (! wedge)
break; // don't loop if we're not wedging
}
diff --git a/src/libOpenImageIO/spinlock_test.cpp b/src/libOpenImageIO/spinlock_test.cpp
index 60c192b..64adbce 100644
--- a/src/libOpenImageIO/spinlock_test.cpp
+++ b/src/libOpenImageIO/spinlock_test.cpp
@@ -50,7 +50,7 @@
// accumulated value is equal to iterations*threads, then the spin locks
// worked.
-static int iterations = 160000000;
+static int iterations = 40000000;
static int numthreads = 16;
static int ntrials = 1;
static bool verbose = false;
@@ -58,6 +58,7 @@
static spin_mutex print_mutex; // make the prints not clobber each other
volatile long long accum = 0;
+float faccum = 0;
spin_mutex mymutex;
@@ -71,10 +72,22 @@
std::cout << "thread " << boost::this_thread::get_id()
<< ", accum = " << accum << "\n";
}
+#if 1
for (int i = 0; i < iterations; ++i) {
spin_lock lock (mymutex);
accum += 1;
}
+#else
+ // Alternate version that mixes in some math to lengthen the lock hold
+ // time and to add more work between locks. Interesting contrast in timings.
+ float last = 0.0f;
+ for (int i = 0; i < iterations; ++i) {
+ last = fmodf (sinf(last), 1.0f);
+ spin_lock lock (mymutex);
+ accum += 1;
+ faccum = fmod (sinf(faccum+last), 1.0f);
+ }
+#endif
}
@@ -134,16 +147,15 @@ int main (int argc, char *argv[])
static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128, 1024, 1<<30 };
for (int i = 0; threadcounts[i] <= numthreads; ++i) {
- int nt = threadcounts[i];
+ int nt = wedge ? threadcounts[i] : numthreads;
int its = iterations/nt;
double range;
double t = time_trial (boost::bind(test_spinlock,nt,its),
ntrials, &range);
- std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
- nt, Strutil::timeintervalformat(t),
- t, range, its);
+ std::cout << Strutil::format ("%2d\t%5.1f range %.2f\t(%d iters/thread)\n",
+ nt, t, range, its);
if (! wedge)
break; // don't loop if we're not wedging
}
diff --git a/src/libtexture/imagecache_pvt.h b/src/libtexture/imagecache_pvt.h
index 5d29782..3a49616 100644
--- a/src/libtexture/imagecache_pvt.h
+++ b/src/libtexture/imagecache_pvt.h
@@ -1003,7 +1003,7 @@ class ImageCacheImpl : public ImageCache {
newval = oldval + incr;
// Now try to atomically swap it, and repeat until we've
// done it with nobody else interfering.
-# if USE_TBB
+# if USE_TBB_ATOMIC
} while (llstat->compare_and_swap (*llnewval,*lloldval) != *lloldval);
# else
} while (llstat->bool_compare_and_swap (*llnewval,*lloldval));
--
1.8.1.6

View File

@ -1,68 +0,0 @@
diff -Naur oiio-Release-1.1.2.orig/src/CMakeLists.txt oiio-Release-1.1.2/src/CMakeLists.txt
--- oiio-Release-1.1.2.orig/src/CMakeLists.txt 2012-12-05 12:46:56.000000000 -0600
+++ oiio-Release-1.1.2/src/CMakeLists.txt 2013-01-02 15:52:43.941560982 -0600
@@ -83,6 +83,8 @@
set (PYTHON_VERSION 2.6)
set (USE_EXTERNAL_PUGIXML OFF CACHE BOOL
"Use an externally built shared library version of the pugixml library")
+set (USE_EXTERNAL_TBB OFF CACHE BOOL
+ "Use system TBB library instead of bundled.")
set (SOVERSION ${OIIO_VERSION_MAJOR}.${OIIO_VERSION_MINOR}
CACHE STRING "Set the SO version in the SO name of the output library")
diff -Naur oiio-Release-1.1.2.orig/src/include/CMakeLists.txt oiio-Release-1.1.2/src/include/CMakeLists.txt
--- oiio-Release-1.1.2.orig/src/include/CMakeLists.txt 2012-12-05 12:46:56.000000000 -0600
+++ oiio-Release-1.1.2/src/include/CMakeLists.txt 2013-01-02 15:52:43.940561015 -0600
@@ -22,7 +22,7 @@
install (FILES ${public_headers} DESTINATION ${INCLUDE_INSTALL_DIR}
COMPONENT developer)
-if (USE_TBB)
+if (USE_TBB AND NOT USE_EXTERNAL_TBB)
install (DIRECTORY tbb DESTINATION ${INCLUDE_INSTALL_DIR}
COMPONENT developer)
endif ()
diff -Naur oiio-Release-1.1.2.orig/src/libOpenImageIO/CMakeLists.txt oiio-Release-1.1.2/src/libOpenImageIO/CMakeLists.txt
--- oiio-Release-1.1.2.orig/src/libOpenImageIO/CMakeLists.txt 2012-12-05 12:46:56.000000000 -0600
+++ oiio-Release-1.1.2/src/libOpenImageIO/CMakeLists.txt 2013-01-02 15:52:43.941560982 -0600
@@ -62,7 +62,13 @@
endif ()
# Include our own TBB if using it
-if (USE_TBB)
+if (USE_TBB AND USE_EXTERNAL_TBB)
+ message (STATUS "System TBB library will be used.")
+ find_package (TBB REQUIRED)
+ include_directories (${TBB_INCLUDE_DIRS})
+ set (libOpenImageIO_srcs ${libOpenImageIO_srcs})
+elseif (USE_TBB AND NOT USE_EXTERNAL_TBB)
+ message (STATUS "Built-in TBB library will be used.")
set (libOpenImageIO_srcs ${libOpenImageIO_srcs} ../libutil/tbb_misc.cpp)
endif ()
@@ -202,7 +208,11 @@
${VISIBILITY_COMMAND} ${VISIBILITY_MAP_COMMAND}
${Boost_LIBRARIES})
-
+# Link against system TBB library if specified
+if (USE_TBB AND USE_EXTERNAL_TBB)
+ message (STATUS "Linking TBB: ${TBB_LIBRARIES}")
+ target_link_libraries (OpenImageIO ${TBB_LIBRARIES})
+endif ()
# Include OpenColorIO if using it
if (USE_OCIO AND OCIO_FOUND)
diff -Naur oiio-Release-1.1.2.orig/src/libutil/tbb_misc.cpp oiio-Release-1.1.2/src/libutil/tbb_misc.cpp
--- oiio-Release-1.1.2.orig/src/libutil/tbb_misc.cpp 2012-12-05 12:46:56.000000000 -0600
+++ oiio-Release-1.1.2/src/libutil/tbb_misc.cpp 2013-01-02 15:53:10.403678615 -0600
@@ -30,8 +30,7 @@
// an executing program.
#include "tbb/tbb_stddef.h"
-// Out-of-line TBB assertion handling routines are instantiated here.
-#include "tbb/tbb_assert_impl.h"
+#include "tbb/tbb_machine.h"
#include "tbb/tbb_misc.h"
#include <cstdio>

View File

@ -1,14 +0,0 @@
diff -Naur oiio-Release-1.1.3.orig/src/libutil/SHA1.cpp oiio-Release-1.1.3/src/libutil/SHA1.cpp
--- oiio-Release-1.1.3.orig/src/libutil/SHA1.cpp 2013-01-09 19:13:37.000000000 -0600
+++ oiio-Release-1.1.3/src/libutil/SHA1.cpp 2013-01-15 07:53:27.479132623 -0600
@@ -8,9 +8,9 @@
// If compiling with MFC, you might want to add #include "StdAfx.h"
+#include "SHA1.h"
#include "hash.h"
#include "dassert.h"
-#include "SHA1.h"
#ifdef SHA1_UTILITY_FUNCTIONS
#define SHA1_MAX_FILE_BUFFER 8000

View File

@ -1,28 +1,20 @@
%global githash1 g0b78dec
%global githash2 0d48631
%global githash3 9bf4356
Name: OpenImageIO
Version: 1.1.3
Release: 7%{?dist}
Version: 1.1.10
Release: 1%{?dist}
Summary: Library for reading and writing images
Group: Development/Libraries
License: BSD
URL: https://sites.google.com/site/openimageio/home
#Source0: https://download.github.com/%{name}-oiio-Release-%{version}-0-%{githash1}.tar.gz
Source0: https://download.github.com/oiio-Release-%{version}.tar.gz
# Images for test suite
#Source1: %{name}-oiio-images-%{githash3}.tar.gz
Source1: oiio-images.tar.gz
Source101: FindTBB.cmake
Patch0: OpenImageIO-1.1.2-use_external_tbb.patch
Patch2: OpenImageIO-ppc.patch
# https://github.com/OpenImageIO/oiio/issues/473
Patch3: OpenImageIO-1.1.3-SHA1_undef_ref.patch
# https://github.com/The11ers/oiio/commit/010754d2a9b4b41f658a7752046c9217abaf98fc
Patch4: oiio-arm.patch
Patch0: OpenImageIO-ppc.patch
Patch1: 589.patch
Patch2: oiio-arm.patch
BuildRequires: cmake txt2man
BuildRequires: qt4-devel
@ -87,14 +79,12 @@ Development files for package %{name}
%prep
#setup -q -n %{name}-oiio-%{githash2}
%setup -q -n oiio-Release-%{version}
%patch0 -p1 -b .exttbb
%ifarch ppc %{power64}
%patch2 -p1 -b .ppc
%ifarch ppc ppc64
%patch0 -p1 -b .ppc
%endif
%patch3 -p1 -b .sha1
%patch4 -p1 -b .arm
%patch1 -p1 -b .spinlocks
%patch2 -p1 -b .arm
# Install FindTBB.cmake
install %{SOURCE101} src/cmake/modules/
@ -122,13 +112,8 @@ rm -rf build/linux && mkdir -p build/linux && pushd build/linux
-DPYLIB_INSTALL_DIR:PATH=%{python_sitearch} \
-DINSTALL_DOCS:BOOL=FALSE \
-DUSE_EXTERNAL_PUGIXML:BOOL=TRUE \
%ifarch x86_64
-DUSE_TBB:BOOL=TRUE \
-DUSE_EXTERNAL_TBB=TRUE \
%else
-DUSE_TBB:BOOL=FALSE \
%endif
%ifarch ppc %{power64}
%ifarch ppc ppc64
-DNOTHREADS:BOOL=TRUE \
%endif
../../src
@ -150,6 +135,7 @@ cp -a doc/*.1 %{buildroot}%{_mandir}/man1
%check
# Not all tests pass on linux
#pushd build/linux && make test
@ -175,6 +161,9 @@ cp -a doc/*.1 %{buildroot}%{_mandir}/man1
%changelog
* Tue Apr 23 2013 Richard Shaw <hobbes1069@gmail.com> - 1.1.10-1
- Update to latest upstream release.
- Use upstream's new internal atomics and spinlocks (589.patch) instead of TBB.
* Sun Mar 31 2013 Peter Robinson <pbrobinson@fedoraproject.org> 1.1.3-7
- Add upstream patch to fix FTBFS on ARM (RHBZ 924932)