Short description: New pthread_barrier algorithm to fulfill barrier destruction requirements.
Author(s): Torvald Riegel <triegel@redhat.com>
Origin: PATCH
Bug-RHEL: NA
Bug-Fedora: NA
Bug-Upstream: #13065
Upstream status: http://patchwork.sourceware.org/patch/10062/

# commit d6533b39f004789e0de4b7d58a29f8282ee95f7b
# Author: Torvald Riegel <triegel@redhat.com>
# Date:   Wed Jun 24 14:37:32 2015 +0200
#
#     New pthread_barrier algorithm to fulfill barrier destruction requirements.
#
#     The previous barrier implementation did not fulfill the POSIX requirements
#     for when a barrier can be destroyed.  Specifically, it was possible that
#     threads that haven't noticed yet that their round is complete still access
#     the barrier's memory, and that those accesses can happen after the barrier
#     has been legally destroyed.
#     The new algorithm does not have this issue, and it avoids using a lock
#     internally.
#
Index: glibc-2.22-621-g90c400b/nptl/Makefile
===================================================================
--- glibc-2.22-621-g90c400b.orig/nptl/Makefile
+++ glibc-2.22-621-g90c400b/nptl/Makefile
@@ -243,7 +243,7 @@ tests = tst-typesizes \
 	tst-sem1 tst-sem2 tst-sem3 tst-sem4 tst-sem5 tst-sem6 tst-sem7 \
 	tst-sem8 tst-sem9 tst-sem10 tst-sem11 tst-sem12 tst-sem13 tst-sem14 \
 	tst-sem15 \
-	tst-barrier1 tst-barrier2 tst-barrier3 tst-barrier4 \
+	tst-barrier1 tst-barrier2 tst-barrier3 tst-barrier4 tst-barrier5 \
 	tst-align tst-align3 \
 	tst-basic1 tst-basic2 tst-basic3 tst-basic4 tst-basic5 tst-basic6 \
 	tst-basic7 \
@@ -304,7 +304,7 @@ tests-nolibpthread = tst-unload
 gen-as-const-headers = pthread-errnos.sym \
 		       lowlevelcond.sym lowlevelrwlock.sym \
-		       lowlevelbarrier.sym unwindbuf.sym \
+		       unwindbuf.sym \
 		       lowlevelrobustlock.sym pthread-pi-defines.sym
Index: glibc-2.22-621-g90c400b/nptl/lowlevelbarrier.sym
===================================================================
--- glibc-2.22-621-g90c400b.orig/nptl/lowlevelbarrier.sym
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <stddef.h>
-#include <sched.h>
-#include <bits/pthreadtypes.h>
-#include "internaltypes.h"
-
---
-
-CURR_EVENT	offsetof (struct pthread_barrier, curr_event)
-MUTEX		offsetof (struct pthread_barrier, lock)
-LEFT		offsetof (struct pthread_barrier, left)
-INIT_COUNT	offsetof (struct pthread_barrier, init_count)
-PRIVATE		offsetof (struct pthread_barrier, private)
Index: glibc-2.22-621-g90c400b/nptl/pthread_barrier_destroy.c
===================================================================
--- glibc-2.22-621-g90c400b.orig/nptl/pthread_barrier_destroy.c
+++ glibc-2.22-621-g90c400b/nptl/pthread_barrier_destroy.c
@@ -18,25 +18,44 @@
 #include <errno.h>
 #include "pthreadP.h"
-#include <lowlevellock.h>
+#include <atomic.h>
+#include <futex-internal.h>
 
 
 int
 pthread_barrier_destroy (pthread_barrier_t *barrier)
 {
-  struct pthread_barrier *ibarrier;
-  int result = EBUSY;
+  struct pthread_barrier *bar = (struct pthread_barrier *) barrier;
 
-  ibarrier = (struct pthread_barrier *) barrier;
+  /* Destroying a barrier is only allowed if no thread is blocked on it.
+     Thus, there is no unfinished round, and all modifications to IN will
+     have happened before us (either because the calling thread took part
+     in the most recent round and thus synchronized-with all other threads
+     entering, or the program ensured this through other synchronization).
+     We must wait until all threads that entered so far have confirmed that
+     they have exited as well.  To get the notification, pretend that we
+     have reached the reset threshold.  */
+  unsigned int count = bar->count;
+  unsigned int max_in_before_reset = BARRIER_IN_THRESHOLD
+				     - BARRIER_IN_THRESHOLD % count;
+  /* Relaxed MO sufficient because the program must have ensured that all
+     modifications happen-before this load (see above).  */
+  unsigned int in = atomic_load_relaxed (&bar->in);
+  /* Trigger reset.  The required acquire MO is below.  */
+  if (atomic_fetch_add_relaxed (&bar->out, max_in_before_reset - in) < in)
+    {
+      /* Not all threads confirmed yet that they have exited, so another
+	 thread will perform a reset.  Wait until that has happened.  */
+      while (in != 0)
+	{
+	  futex_wait_simple (&bar->in, in, bar->shared);
+	  in = atomic_load_relaxed (&bar->in);
+	}
+    }
+  /* We must ensure that memory reuse happens after all prior use of the
+     barrier (specifically, synchronize-with the reset of the barrier or the
+     confirmation of threads leaving the barrier).  */
+  atomic_thread_fence_acquire ();
 
-  lll_lock (ibarrier->lock, ibarrier->private ^ FUTEX_PRIVATE_FLAG);
-
-  if (__glibc_likely (ibarrier->left == ibarrier->init_count))
-    /* The barrier is not used anymore.  */
-    result = 0;
-  else
-    /* Still used, return with an error.  */
-    lll_unlock (ibarrier->lock, ibarrier->private ^ FUTEX_PRIVATE_FLAG);
-
-  return result;
+  return 0;
 }
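To make the destruction requirement concrete, the following sketch shows the
POSIX-sanctioned pattern that the old implementation could break.  It is not
part of the patch; the worker function, thread count, and messages are
illustrative only.  The thread that receives PTHREAD_BARRIER_SERIAL_THREAD
destroys and frees the barrier as soon as its own wait returns, even though
the other waiters may still be inside pthread_barrier_wait.  With the old
code those waiters could still touch the freed memory; with the new
algorithm, pthread_barrier_destroy waits until OUT has caught up with IN.

  /* Compile with: cc -std=c11 -pthread destroy-example.c  */
  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>

  #define NTHREADS 8

  static void *
  worker (void *arg)
  {
    pthread_barrier_t *b = arg;
    /* All NTHREADS threads rendezvous here.  Exactly one of them gets
       PTHREAD_BARRIER_SERIAL_THREAD and is, per POSIX, immediately allowed
       to destroy and free the barrier.  */
    if (pthread_barrier_wait (b) == PTHREAD_BARRIER_SERIAL_THREAD)
      {
        if (pthread_barrier_destroy (b) != 0)
          {
            puts ("barrier_destroy failed");
            exit (1);
          }
        free (b);
      }
    return NULL;
  }

  int
  main (void)
  {
    pthread_barrier_t *b = malloc (sizeof *b);
    if (b == NULL || pthread_barrier_init (b, NULL, NTHREADS) != 0)
      {
        puts ("barrier_init failed");
        return 1;
      }
    pthread_t th[NTHREADS];
    for (int i = 0; i < NTHREADS; i++)
      if (pthread_create (&th[i], NULL, worker, b) != 0)
        {
          puts ("pthread_create failed");
          return 1;
        }
    for (int i = 0; i < NTHREADS; i++)
      pthread_join (th[i], NULL);
    puts ("done");
    return 0;
  }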
Index: glibc-2.22-621-g90c400b/nptl/pthread_barrier_init.c
===================================================================
--- glibc-2.22-621-g90c400b.orig/nptl/pthread_barrier_init.c
+++ glibc-2.22-621-g90c400b/nptl/pthread_barrier_init.c
@@ -18,7 +18,7 @@
 #include <errno.h>
 #include "pthreadP.h"
-#include <lowlevellock.h>
+#include <futex-internal.h>
 #include <kernel-features.h>
 
 
@@ -34,8 +34,10 @@ __pthread_barrier_init (pthread_barrier_
 {
   struct pthread_barrier *ibarrier;
 
-  /* XXX EINVAL is not specified by POSIX as a possible error code.  */
-  if (__glibc_unlikely (count == 0))
+  /* XXX EINVAL is not specified by POSIX as a possible error code.  See
+     pthread_barrier_wait for the reason for the comparison with
+     BARRIER_IN_THRESHOLD.  */
+  if (__glibc_unlikely (count == 0 || count >= BARRIER_IN_THRESHOLD))
     return EINVAL;
 
   const struct pthread_barrierattr *iattr
@@ -46,15 +48,12 @@ __pthread_barrier_init (pthread_barrier_
   ibarrier = (struct pthread_barrier *) barrier;
 
   /* Initialize the individual fields.  */
-  ibarrier->lock = LLL_LOCK_INITIALIZER;
-  ibarrier->left = count;
-  ibarrier->init_count = count;
-  ibarrier->curr_event = 0;
-
-  /* XXX Don't use FUTEX_SHARED or FUTEX_PRIVATE as long as there are still
-     assembly implementations that expect the value determined below.  */
-  ibarrier->private = (iattr->pshared != PTHREAD_PROCESS_PRIVATE
-		       ? 0 : FUTEX_PRIVATE_FLAG);
+  ibarrier->in = 0;
+  ibarrier->out = 0;
+  ibarrier->count = count;
+  ibarrier->current_round = 0;
+  ibarrier->shared = (iattr->pshared == PTHREAD_PROCESS_PRIVATE
+		      ? FUTEX_PRIVATE : FUTEX_SHARED);
 
   return 0;
 }
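The arithmetic that pthread_barrier_destroy and pthread_barrier_init rely on
(positions, round boundaries, and the reset threshold) can be checked in
isolation.  The snippet below is not glibc code; it just replays the
expressions from the patch with small example values so the invariants are
visible.

  #include <assert.h>
  #include <limits.h>

  #define BARRIER_IN_THRESHOLD (UINT_MAX / 2)  /* as in internaltypes.h */

  int
  main (void)
  {
    unsigned int count = 3;     /* barrier initialized for 3 threads */

    /* Positions start at 1; a round is complete once CURRENT_ROUND can be
       advanced to I rounded down to a multiple of COUNT.  */
    unsigned int i = 7;         /* 7 threads have entered so far */
    unsigned int cr = 3;        /* the round of positions 1-3 is done */
    assert (cr + count <= i);   /* position 6 completed a second round */
    unsigned int newcr = i - i % count;
    assert (newcr == 6);        /* positions 4-6 may now leave */

    /* The reset threshold is rounded down to the end of a round, so a
       reset never splits a round.  */
    unsigned int max_in_before_reset = BARRIER_IN_THRESHOLD
                                       - BARRIER_IN_THRESHOLD % count;
    assert (max_in_before_reset % count == 0);
    /* Enforced by the new check in pthread_barrier_init.  */
    assert (count < BARRIER_IN_THRESHOLD);
    return 0;
  }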
Index: glibc-2.22-621-g90c400b/nptl/pthread_barrier_wait.c
===================================================================
--- glibc-2.22-621-g90c400b.orig/nptl/pthread_barrier_wait.c
+++ glibc-2.22-621-g90c400b/nptl/pthread_barrier_wait.c
@@ -18,63 +18,206 @@
 #include <errno.h>
 #include <sysdep.h>
-#include <lowlevellock.h>
 #include <pthreadP.h>
 #include <futex-internal.h>
 
 
-/* Wait on barrier.  */
+/* Wait on the barrier.
+
+   In each round, we wait for a fixed number of threads to enter the barrier
+   (COUNT).  Once that has happened, exactly these threads are allowed to
+   leave the barrier.  Note that POSIX does not require that only COUNT
+   threads can attempt to block using the barrier concurrently.
+
+   We count the number of threads that have entered (IN).  Each thread
+   increments IN when entering, thus getting a position in the sequence of
+   threads that are or have been waiting (starting with 1, so the position
+   is the number of threads that have entered so far including the current
+   thread).
+   CURRENT_ROUND designates the position of the most recent thread whose
+   round has been detected as complete.  When a thread detects that enough
+   threads have entered to make a round complete, it finishes this round by
+   effectively adding COUNT to CURRENT_ROUND atomically.  Threads that
+   believe that their round is not complete yet wait until CURRENT_ROUND is
+   not smaller than their position anymore.
+
+   A barrier can be destroyed as soon as no threads are blocked on the
+   barrier.  This is already the case if just one thread from the last round
+   has stopped waiting and returned to the caller; the assumption is that
+   all threads from the round are unblocked atomically (even though they may
+   return at different times from the respective calls to
+   pthread_barrier_wait).  Thus, a valid call to pthread_barrier_destroy can
+   be concurrent with other threads still figuring out that their round has
+   been completed.  Therefore, threads need to confirm that they have left
+   the barrier by incrementing OUT, and pthread_barrier_destroy needs to
+   wait until OUT equals IN.
+
+   To avoid an ABA issue for futex_wait on CURRENT_ROUND and for archs with
+   32b-only atomics, we additionally reset the barrier when IN reaches
+   a threshold to avoid overflow.  We assume that the total number of
+   threads is less than INT_MAX/2, and set the threshold accordingly so that
+   we can use a simple atomic_fetch_add on IN instead of a CAS when
+   entering.  The threshold is always set to the end of a round, so all
+   threads that have entered are either pre-reset threads or post-reset
+   threads (i.e., have a position larger than the threshold).
+   Pre-reset threads just run the algorithm explained above.  Post-reset
+   threads wait until IN is reset to a pre-threshold value.
+   When the last pre-reset thread leaves the barrier (i.e., OUT equals the
+   threshold), it resets the barrier to its initial state.  Other
+   (post-reset) threads wait for the reset to have finished by waiting until
+   IN is less than the threshold and then restart by trying to enter the
+   barrier again.
+
+   We reuse the reset mechanism in pthread_barrier_destroy to get notified
+   when all threads have left the barrier: We trigger an artificial reset
+   and wait for the last pre-reset thread to finish reset, thus notifying
+   the thread that is about to destroy the barrier.
+
+   Blocking using futexes is straightforward: pre-reset threads wait for
+   completion of their round using CURRENT_ROUND as futex word, and
+   post-reset threads and pthread_barrier_destroy use IN as futex word.
+
+   Further notes:
+   * It is not simple to let some of the post-reset threads help with the
+     reset because of the ABA issues that arise; therefore, we simply make
+     the last thread to leave responsible for the reset.
+   * POSIX leaves it unspecified whether a signal handler running in a
+     thread that has been unblocked (because its round is complete) can
+     stall all other threads and prevent them from returning from the
+     barrier.  In this implementation, other threads will return.  However,
+     pthread_barrier_destroy will of course wait for the signal handler
+     thread to confirm that it left the barrier.
+
+   TODO We should add spinning with back-off.  Once we do that, we could
+   also try to avoid the futex_wake syscall when a round is detected as
+   finished.  If we do not spin, it is quite likely that at least some other
+   threads will have called futex_wait already.  */
 int
 __pthread_barrier_wait (pthread_barrier_t *barrier)
 {
-  struct pthread_barrier *ibarrier = (struct pthread_barrier *) barrier;
-  int result = 0;
-  int lll_private = ibarrier->private ^ FUTEX_PRIVATE_FLAG;
-  int futex_private = (lll_private == LLL_PRIVATE
-		       ? FUTEX_PRIVATE : FUTEX_SHARED);
+  struct pthread_barrier *bar = (struct pthread_barrier *) barrier;
 
-  /* Make sure we are alone.  */
-  lll_lock (ibarrier->lock, lll_private);
+  /* How many threads entered so far, including ourself.  */
+  unsigned int i;
 
-  /* One more arrival.  */
-  --ibarrier->left;
+ reset_restart:
+  /* Try to enter the barrier.  We need acquire MO to (1) ensure that if we
+     observe that our round can be completed (see below for our attempt to
+     do so), all pre-barrier-entry effects of all threads in our round
+     happen before us completing the round, and (2) to make our use of the
+     barrier happen after a potential reset.  We need release MO to make
+     sure that our pre-barrier-entry effects happen before threads in this
+     round leaving the barrier.  */
+  i = atomic_fetch_add_acq_rel (&bar->in, 1) + 1;
+  /* These loads are after the fetch_add so that we're less likely to first
+     pull in the cache line as shared.  */
+  unsigned int count = bar->count;
+  /* This is the number of threads that can enter before we need to reset.
+     Always at the end of a round.  */
+  unsigned int max_in_before_reset = BARRIER_IN_THRESHOLD
+				     - BARRIER_IN_THRESHOLD % count;
 
-  /* Are these all?  */
-  if (ibarrier->left == 0)
+  if (i > max_in_before_reset)
     {
-      /* Yes.  Increment the event counter to avoid invalid wake-ups and
-	 tell the current waiters that it is their turn.  */
-      ++ibarrier->curr_event;
-
-      /* Wake up everybody.  */
-      futex_wake (&ibarrier->curr_event, INT_MAX, futex_private);
+      /* We're in a reset round.  Just wait for a reset to finish; do not
+	 help finishing previous rounds because this could happen
+	 concurrently with a reset.  */
+      while (i > max_in_before_reset)
+	{
+	  futex_wait_simple (&bar->in, i, bar->shared);
+	  /* Relaxed MO is fine here because we just need an indication for
+	     when we should retry to enter (which will use acquire MO, see
+	     above).  */
+	  i = atomic_load_relaxed (&bar->in);
+	}
+      goto reset_restart;
+    }
 
-      /* This is the thread which finished the serialization.  */
-      result = PTHREAD_BARRIER_SERIAL_THREAD;
+  /* Look at the current round.  At this point, we are just interested in
+     whether we can complete rounds, based on the information we obtained
+     through our acquire-MO load of IN.  Nonetheless, if we notice that
+     our round has been completed using this load, we use the acquire-MO
+     fence below to make sure that all pre-barrier-entry effects of all
+     threads in our round happen before us leaving the barrier.  Therefore,
+     relaxed MO is sufficient.  */
+  unsigned cr = atomic_load_relaxed (&bar->current_round);
+
+  /* Try to finish previous rounds and/or the current round.  We simply
+     consider just our position here and do not try to do the work of
+     threads that entered more recently.  */
+  while (cr + count <= i)
+    {
+      /* Calculate the new current round based on how many threads entered.
+	 NEWCR must be larger than CR because CR+COUNT ends a round.  */
+      unsigned int newcr = i - i % count;
+      /* Try to complete previous and/or the current round.  We need
+	 release MO to propagate the happens-before that we observed
+	 through reading with acquire MO from IN to other threads.
+	 If the CAS fails, it is like the relaxed-MO load of CURRENT_ROUND
+	 above.  */
+      if (atomic_compare_exchange_weak_release (&bar->current_round, &cr,
+						newcr))
+	{
+	  /* Update CR with the modification we just did.  */
+	  cr = newcr;
+	  /* Wake threads belonging to the rounds we just finished.  We may
+	     wake more threads than necessary if more than COUNT threads
+	     try to block concurrently on the barrier, but this is not a
+	     typical use of barriers.
+	     Note that we can still access SHARED because we haven't yet
+	     confirmed to have left the barrier.  */
+	  futex_wake (&bar->current_round, INT_MAX, bar->shared);
+	  /* We did as much as we could based on our position.  If we
+	     advanced the current round to a round sufficient for us, do
+	     not wait for that to happen and skip the acquire fence (we
+	     already synchronize-with all other threads in our round
+	     through the initial acquire MO fetch_add of IN).  */
+	  if (i <= cr)
+	    goto ready_to_leave;
+	  else
+	    break;
+	}
     }
-  else
+
+  /* Wait until the current round is more recent than the round we are
+     in.  */
+  while (i > cr)
     {
-      /* The number of the event we are waiting for.  The barrier's event
-	 number must be bumped before we continue.  */
-      unsigned int event = ibarrier->curr_event;
-
-      /* Before suspending, make the barrier available to others.  */
-      lll_unlock (ibarrier->lock, lll_private);
-
-      /* Wait for the event counter of the barrier to change.  */
-      do
-	futex_wait_simple (&ibarrier->curr_event, event, futex_private);
-      while (event == ibarrier->curr_event);
+      /* Wait for the current round to finish.  */
+      futex_wait_simple (&bar->current_round, cr, bar->shared);
+      /* See the fence below.  */
+      cr = atomic_load_relaxed (&bar->current_round);
     }
 
-  /* Make sure the init_count is stored locally or in a register.  */
-  unsigned int init_count = ibarrier->init_count;
+  /* Our round finished.  Use the acquire MO fence to synchronize-with the
+     thread that finished the round, either through the initial load of
+     CURRENT_ROUND above or a failed CAS in the loop above.  */
+  atomic_thread_fence_acquire ();
+
+  /* Now signal that we left.  */
+  unsigned int o;
+ ready_to_leave:
+  /* We need release MO here so that our use of the barrier happens before
+     reset or memory reuse after pthread_barrier_destroy.  */
+  o = atomic_fetch_add_release (&bar->out, 1) + 1;
+  if (o == max_in_before_reset)
+    {
+      /* Perform a reset if we are the last pre-reset thread leaving.  All
+	 other threads accessing the barrier are post-reset threads and are
+	 incrementing or spinning on IN.  Thus, resetting IN as the last
+	 step of reset ensures that the reset is not concurrent with actual
+	 use of the barrier.  We need the acquire MO fence so that the
+	 reset happens after use of the barrier by all earlier pre-reset
+	 threads.  */
+      atomic_thread_fence_acquire ();
+      atomic_store_relaxed (&bar->current_round, 0);
+      atomic_store_relaxed (&bar->out, 0);
+      /* When destroying the barrier, we wait for a reset to happen.  Thus,
+	 we must load SHARED now so that this happens before the barrier is
+	 destroyed.  */
+      int shared = bar->shared;
+      atomic_store_release (&bar->in, 0);
+      futex_wake (&bar->in, INT_MAX, shared);
 
-  /* If this was the last woken thread, unlock.  */
-  if (atomic_increment_val (&ibarrier->left) == init_count)
-    /* We are done.  */
-    lll_unlock (ibarrier->lock, lll_private);
+    }
 
-  return result;
+  /* Return a special value for exactly one thread per round.  */
+  return i % count == 0 ? PTHREAD_BARRIER_SERIAL_THREAD : 0;
 }
 weak_alias (__pthread_barrier_wait, pthread_barrier_wait)
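The core of the algorithm above can also be studied outside of glibc.  The
following is a minimal user-space model, for illustration only: futex
blocking is replaced by spinning, the reset/overflow handling is omitted
(IN is assumed to stay below the threshold), and all names (model_barrier,
model_wait, tf) are inventions of this note, not glibc interfaces.  It uses
C11 <stdatomic.h> with the same memory orders as the patch.

  /* Compile with: cc -std=c11 -pthread model.c  */
  #include <assert.h>
  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>

  struct model_barrier
  {
    atomic_uint in;            /* Number of threads that have entered.  */
    atomic_uint current_round; /* Position ending the last complete round.  */
    atomic_uint out;           /* Number of threads that have left.  */
    unsigned int count;        /* Threads per round.  */
  };

  static struct model_barrier mb = { 0, 0, 0, 4 };
  static atomic_int serial_hits; /* How many "serial thread" returns.  */

  static int
  model_wait (struct model_barrier *b)
  {
    /* Enter; I is our position, starting at 1 (acq_rel RMW, as the patch
       uses for the fetch_add on IN).  */
    unsigned int i
      = atomic_fetch_add_explicit (&b->in, 1, memory_order_acq_rel) + 1;
    unsigned int cr
      = atomic_load_explicit (&b->current_round, memory_order_relaxed);

    /* Complete as many rounds as our own position allows.  */
    while (cr + b->count <= i)
      if (atomic_compare_exchange_weak_explicit (&b->current_round, &cr,
                                                 i - i % b->count,
                                                 memory_order_release,
                                                 memory_order_relaxed))
        cr = i - i % b->count; /* Instead of futex_wake, others spin.  */

    /* Wait (by spinning instead of futex_wait) until our round is done.  */
    while (i > cr)
      cr = atomic_load_explicit (&b->current_round, memory_order_relaxed);
    atomic_thread_fence (memory_order_acquire);

    /* Confirm that we left, so a destroyer could wait for OUT == IN.  */
    atomic_fetch_add_explicit (&b->out, 1, memory_order_release);
    return i % b->count == 0; /* "Serial thread" of this round.  */
  }

  static void *
  tf (void *arg)
  {
    for (int r = 0; r < 100; r++)
      if (model_wait (&mb))
        atomic_fetch_add_explicit (&serial_hits, 1, memory_order_relaxed);
    return NULL;
  }

  int
  main (void)
  {
    pthread_t th[4];
    for (int n = 0; n < 4; n++)
      pthread_create (&th[n], NULL, tf, NULL);
    for (int n = 0; n < 4; n++)
      pthread_join (th[n], NULL);
    /* 4 threads x 100 waits = 100 rounds, one serial thread each.  */
    assert (atomic_load (&serial_hits) == 100);
    puts ("model ok");
    return 0;
  }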
Index: glibc-2.22-621-g90c400b/nptl/pthread_barrierattr_setpshared.c
===================================================================
--- glibc-2.22-621-g90c400b.orig/nptl/pthread_barrierattr_setpshared.c
+++ glibc-2.22-621-g90c400b/nptl/pthread_barrierattr_setpshared.c
@@ -24,15 +24,11 @@
 int
 pthread_barrierattr_setpshared (pthread_barrierattr_t *attr, int pshared)
 {
-  struct pthread_barrierattr *iattr;
-
   int err = futex_supports_pshared (pshared);
   if (err != 0)
     return err;
 
-  iattr = (struct pthread_barrierattr *) attr;
-
-  iattr->pshared = pshared;
+  ((struct pthread_barrierattr *) attr)->pshared = pshared;
 
   return 0;
 }
Index: glibc-2.22-621-g90c400b/nptl/tst-barrier4.c
===================================================================
--- glibc-2.22-621-g90c400b.orig/nptl/tst-barrier4.c
+++ glibc-2.22-621-g90c400b/nptl/tst-barrier4.c
@@ -16,7 +16,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This is a test for behavior not guaranteed by POSIX.  */
+/* This tests destruction of a barrier right after waiting on it.  */
 #include <errno.h>
 #include <pthread.h>
 #include <stdio.h>
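The futex_wait_simple/futex_wake loops used throughout the new
implementation follow one standard pattern: sleep only while the futex word
still holds the value we last saw, and always re-check after waking, because
FUTEX_WAIT can return spuriously or on a wake meant for a different
transition.  A minimal, Linux-specific sketch of that pattern follows; it
uses the raw syscall rather than glibc's internal wrappers, and the names
(word, wait_until_changed, tf) are illustrative only.

  /* Compile with: cc -pthread futex-example.c  */
  #include <limits.h>
  #include <linux/futex.h>
  #include <pthread.h>
  #include <stdio.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static unsigned int word;     /* Stands in for, e.g., CURRENT_ROUND.  */

  static void
  wait_until_changed (unsigned int seen)
  {
    /* The kernel atomically checks *addr == seen before sleeping; we loop
       because a return does not guarantee that the value changed.  */
    while (__atomic_load_n (&word, __ATOMIC_RELAXED) == seen)
      syscall (SYS_futex, &word, FUTEX_WAIT_PRIVATE, seen, NULL, NULL, 0);
  }

  static void *
  tf (void *arg)
  {
    wait_until_changed (0);
    puts ("woken");
    return NULL;
  }

  int
  main (void)
  {
    pthread_t th;
    pthread_create (&th, NULL, tf, NULL);
    sleep (1);                  /* crude: let tf reach the futex wait */
    __atomic_store_n (&word, 1, __ATOMIC_RELAXED);
    /* Wake all waiters, as futex_wake (..., INT_MAX, ...) does.  */
    syscall (SYS_futex, &word, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0);
    pthread_join (th, NULL);
    return 0;
  }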
Index: glibc-2.22-621-g90c400b/nptl/tst-barrier5.c
===================================================================
--- /dev/null
+++ glibc-2.22-621-g90c400b/nptl/tst-barrier5.c
@@ -0,0 +1,145 @@
+/* Copyright (C) 2004-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This tests the barrier reset mechanism.  */
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <internaltypes.h>
+
+
+static pthread_barrier_t b1;
+static pthread_barrier_t b2;
+
+
+#define N 20
+#define ROUNDS_PER_RUN 20
+#define START ((BARRIER_IN_THRESHOLD / N - ROUNDS_PER_RUN / 2) * N)
+
+static void *
+tf (void *arg)
+{
+  int runs = 0;
+
+  while (runs++ < 30)
+    {
+      /* In each run, we execute a number of rounds and initialize the
+	 barrier so that we will go over the reset threshold with those
+	 rounds.  */
+      for (int rounds = 0; rounds < ROUNDS_PER_RUN; rounds++)
+	pthread_barrier_wait (&b1);
+
+      if (pthread_barrier_wait (&b1) == PTHREAD_BARRIER_SERIAL_THREAD)
+	{
+	  pthread_barrier_destroy (&b1);
+	  if (pthread_barrier_init (&b1, NULL, N) != 0)
+	    {
+	      puts ("tf: 1st barrier_init failed");
+	      exit (1);
+	    }
+	  puts ("b1 reinitialized");
+	  /* Trigger a reset.  */
+	  struct pthread_barrier *bar = (struct pthread_barrier *) &b1;
+	  bar->in = START;
+	  bar->out = START;
+	  /* We deliberately don't set bar->current_round so that we also
+	     test whether the helping for the updates of current_round
+	     works correctly.  */
+	}
+
+      /* Same as above, just for b2.  */
+      for (int rounds = 0; rounds < ROUNDS_PER_RUN; rounds++)
+	pthread_barrier_wait (&b2);
+
+      if (pthread_barrier_wait (&b2) == PTHREAD_BARRIER_SERIAL_THREAD)
+	{
+	  pthread_barrier_destroy (&b2);
+	  if (pthread_barrier_init (&b2, NULL, N) != 0)
+	    {
+	      puts ("tf: 2nd barrier_init failed");
+	      exit (1);
+	    }
+	  puts ("b2 reinitialized");
+	  /* Trigger a reset.  See above.  */
+	  struct pthread_barrier *bar = (struct pthread_barrier *) &b2;
+	  bar->in = START;
+	  bar->out = START;
+	}
+    }
+
+  return NULL;
+}
+
+
+static int
+do_test (void)
+{
+  pthread_attr_t at;
+  int cnt;
+
+  if (pthread_attr_init (&at) != 0)
+    {
+      puts ("attr_init failed");
+      return 1;
+    }
+
+  if (pthread_attr_setstacksize (&at, 1 * 1024 * 1024) != 0)
+    {
+      puts ("attr_setstacksize failed");
+      return 1;
+    }
+
+  if (pthread_barrier_init (&b1, NULL, N) != 0)
+    {
+      puts ("1st barrier_init failed");
+      return 1;
+    }
+
+  if (pthread_barrier_init (&b2, NULL, N) != 0)
+    {
+      puts ("2nd barrier_init failed");
+      return 1;
+    }
+
+  pthread_t th[N - 1];
+  for (cnt = 0; cnt < N - 1; ++cnt)
+    if (pthread_create (&th[cnt], &at, tf, NULL) != 0)
+      {
+	puts ("pthread_create failed");
+	return 1;
+      }
+
+  if (pthread_attr_destroy (&at) != 0)
+    {
+      puts ("attr_destroy failed");
+      return 1;
+    }
+
+  tf (NULL);
+
+  for (cnt = 0; cnt < N - 1; ++cnt)
+    if (pthread_join (th[cnt], NULL) != 0)
+      {
+	puts ("pthread_join failed");
+	return 1;
+      }
+
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
Index: glibc-2.22-621-g90c400b/sysdeps/nptl/internaltypes.h
===================================================================
--- glibc-2.22-621-g90c400b.orig/sysdeps/nptl/internaltypes.h
+++ glibc-2.22-621-g90c400b/sysdeps/nptl/internaltypes.h
@@ -95,12 +95,13 @@ struct pthread_rwlockattr
 /* Barrier data structure.  */
 struct pthread_barrier
 {
-  unsigned int curr_event;
-  int lock;
-  unsigned int left;
-  unsigned int init_count;
-  int private;
+  unsigned int in;
+  unsigned int current_round;
+  unsigned int count;
+  int shared;
+  unsigned int out;
 };
+#define BARRIER_IN_THRESHOLD (UINT_MAX/2)
 
 
 /* Barrier variable attribute data structure.  */
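tst-barrier5 forces resets by planting IN and OUT close to the threshold.
The constants it relies on can be checked with the small program below; it
is not glibc code, it merely replays the test's macros: with N == 20
waiters, START sits exactly ROUNDS_PER_RUN/2 rounds below the reset
threshold, so the ROUNDS_PER_RUN waits of each run are guaranteed to cross
it.

  #include <assert.h>
  #include <limits.h>
  #include <stdio.h>

  #define BARRIER_IN_THRESHOLD (UINT_MAX / 2)
  #define N 20
  #define ROUNDS_PER_RUN 20
  #define START ((BARRIER_IN_THRESHOLD / N - ROUNDS_PER_RUN / 2) * N)

  int
  main (void)
  {
    /* The threshold rounded down to the end of a round of N.  */
    unsigned int max_in_before_reset = BARRIER_IN_THRESHOLD
                                       - BARRIER_IN_THRESHOLD % N;
    /* START is a round boundary (a multiple of N) ...  */
    assert (START % N == 0);
    /* ... and ROUNDS_PER_RUN/2 rounds below the threshold, so a run's
       waits must trigger a reset.  */
    assert (START + (ROUNDS_PER_RUN / 2) * N == max_in_before_reset);
    printf ("threshold=%u start=%u\n", max_in_before_reset,
            (unsigned int) START);
    return 0;
  }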
Index: glibc-2.22-621-g90c400b/sysdeps/unix/sysv/linux/i386/pthread_barrier_wait.S
===================================================================
--- glibc-2.22-621-g90c400b.orig/sysdeps/unix/sysv/linux/i386/pthread_barrier_wait.S
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <lowlevellock.h>
-#include <lowlevelbarrier.h>
-
-	.text
-
-	.globl	__pthread_barrier_wait
-	.type	__pthread_barrier_wait,@function
-	.align	16
-__pthread_barrier_wait:
-	cfi_startproc
-	pushl	%ebx
-	cfi_adjust_cfa_offset(4)
-	cfi_offset(%ebx, -8)
-
-	movl	8(%esp), %ebx
-
-	/* Get the mutex.  */
-	movl	$1, %edx
-	xorl	%eax, %eax
-	LOCK
-	cmpxchgl %edx, MUTEX(%ebx)
-	jnz	1f
-
-	/* One less waiter.  If this was the last one needed wake
-	   everybody.  */
-2:	subl	$1, LEFT(%ebx)
-	je	3f
-
-	/* There are more threads to come.  */
-	pushl	%esi
-	cfi_adjust_cfa_offset(4)
-	cfi_offset(%esi, -12)
-
-#if CURR_EVENT == 0
-	movl	(%ebx), %edx
-#else
-	movl	CURR_EVENT(%ebx), %edx
-#endif
-
-	/* Release the mutex.  */
-	LOCK
-	subl	$1, MUTEX(%ebx)
-	jne	6f
-
-	/* Wait for the remaining threads.  The call will return immediately
-	   if the CURR_EVENT memory has meanwhile been changed.  */
-7:
-#if FUTEX_WAIT == 0
-	movl	PRIVATE(%ebx), %ecx
-#else
-	movl	$FUTEX_WAIT, %ecx
-	orl	PRIVATE(%ebx), %ecx
-#endif
-	xorl	%esi, %esi
-8:	movl	$SYS_futex, %eax
-	ENTER_KERNEL
-
-	/* Don't return on spurious wakeups.  The syscall does not change
-	   any register except %eax so there is no need to reload any of
-	   them.  */
-#if CURR_EVENT == 0
-	cmpl	%edx, (%ebx)
-#else
-	cmpl	%edx, CURR_EVENT(%ebx)
-#endif
-	je	8b
-
-	/* Increment LEFT.  If this brings the count back to the
-	   initial count unlock the object.  */
-	movl	$1, %edx
-	movl	INIT_COUNT(%ebx), %ecx
-	LOCK
-	xaddl	%edx, LEFT(%ebx)
-	subl	$1, %ecx
-	cmpl	%ecx, %edx
-	jne	10f
-
-	/* Release the mutex.  We cannot release the lock before
-	   waking the waiting threads since otherwise a new thread might
-	   arrive and gets waken up, too.  */
-	LOCK
-	subl	$1, MUTEX(%ebx)
-	jne	9f
-
-	/* Note: %esi is still zero.  */
-10:	movl	%esi, %eax		/* != PTHREAD_BARRIER_SERIAL_THREAD */
-
-	popl	%esi
-	cfi_adjust_cfa_offset(-4)
-	cfi_restore(%esi)
-	popl	%ebx
-	cfi_adjust_cfa_offset(-4)
-	cfi_restore(%ebx)
-	ret
-
-	cfi_adjust_cfa_offset(4)
-	cfi_offset(%ebx, -8)
-
-	/* The necessary number of threads arrived.  */
-3:
-#if CURR_EVENT == 0
-	addl	$1, (%ebx)
-#else
-	addl	$1, CURR_EVENT(%ebx)
-#endif
-
-	/* Wake up all waiters.  The count is a signed number in the kernel
-	   so 0x7fffffff is the highest value.  */
-	movl	$0x7fffffff, %edx
-	movl	$FUTEX_WAKE, %ecx
-	orl	PRIVATE(%ebx), %ecx
-	movl	$SYS_futex, %eax
-	ENTER_KERNEL
-
-	/* Increment LEFT.  If this brings the count back to the
-	   initial count unlock the object.  */
-	movl	$1, %edx
-	movl	INIT_COUNT(%ebx), %ecx
-	LOCK
-	xaddl	%edx, LEFT(%ebx)
-	subl	$1, %ecx
-	cmpl	%ecx, %edx
-	jne	5f
-
-	/* Release the mutex.  We cannot release the lock before
-	   waking the waiting threads since otherwise a new thread might
-	   arrive and gets waken up, too.  */
-	LOCK
-	subl	$1, MUTEX(%ebx)
-	jne	4f
-
-5:	orl	$-1, %eax		/* == PTHREAD_BARRIER_SERIAL_THREAD */
-
-	popl	%ebx
-	cfi_adjust_cfa_offset(-4)
-	cfi_restore(%ebx)
-	ret
-
-	cfi_adjust_cfa_offset(4)
-	cfi_offset(%ebx, -8)
-1:	movl	PRIVATE(%ebx), %ecx
-	leal	MUTEX(%ebx), %edx
-	xorl	$LLL_SHARED, %ecx
-	call	__lll_lock_wait
-	jmp	2b
-
-4:	movl	PRIVATE(%ebx), %ecx
-	leal	MUTEX(%ebx), %eax
-	xorl	$LLL_SHARED, %ecx
-	call	__lll_unlock_wake
-	jmp	5b
-
-	cfi_adjust_cfa_offset(4)
-	cfi_offset(%esi, -12)
-6:	movl	PRIVATE(%ebx), %ecx
-	leal	MUTEX(%ebx), %eax
-	xorl	$LLL_SHARED, %ecx
-	call	__lll_unlock_wake
-	jmp	7b
-
-9:	movl	PRIVATE(%ebx), %ecx
-	leal	MUTEX(%ebx), %eax
-	xorl	$LLL_SHARED, %ecx
-	call	__lll_unlock_wake
-	jmp	10b
-	cfi_endproc
-	.size	__pthread_barrier_wait,.-__pthread_barrier_wait
-weak_alias (__pthread_barrier_wait, pthread_barrier_wait)
Index: glibc-2.22-621-g90c400b/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S
===================================================================
--- glibc-2.22-621-g90c400b.orig/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <lowlevellock.h>
-#include <lowlevelbarrier.h>
-
-
-	.text
-
-	.globl	__pthread_barrier_wait
-	.type	__pthread_barrier_wait,@function
-	.align	16
-__pthread_barrier_wait:
-	/* Get the mutex.  */
-	xorl	%eax, %eax
-	movl	$1, %esi
-	LOCK
-	cmpxchgl %esi, MUTEX(%rdi)
-	jnz	1f
-
-	/* One less waiter.  If this was the last one needed wake
-	   everybody.  */
-2:	decl	LEFT(%rdi)
-	je	3f
-
-	/* There are more threads to come.  */
-#if CURR_EVENT == 0
-	movl	(%rdi), %edx
-#else
-	movl	CURR_EVENT(%rdi), %edx
-#endif
-
-	/* Release the mutex.  */
-	LOCK
-	decl	MUTEX(%rdi)
-	jne	6f
-
-	/* Wait for the remaining threads.  The call will return immediately
-	   if the CURR_EVENT memory has meanwhile been changed.  */
-7:
-#if FUTEX_WAIT == 0
-	movl	PRIVATE(%rdi), %esi
-#else
-	movl	$FUTEX_WAIT, %esi
-	orl	PRIVATE(%rdi), %esi
-#endif
-	xorq	%r10, %r10
-8:	movl	$SYS_futex, %eax
-	syscall
-
-	/* Don't return on spurious wakeups.  The syscall does not change
-	   any register except %eax so there is no need to reload any of
-	   them.  */
-#if CURR_EVENT == 0
-	cmpl	%edx, (%rdi)
-#else
-	cmpl	%edx, CURR_EVENT(%rdi)
-#endif
-	je	8b
-
-	/* Increment LEFT.  If this brings the count back to the
-	   initial count unlock the object.  */
-	movl	$1, %edx
-	movl	INIT_COUNT(%rdi), %eax
-	LOCK
-	xaddl	%edx, LEFT(%rdi)
-	subl	$1, %eax
-	cmpl	%eax, %edx
-	jne,pt	10f
-
-	/* Release the mutex.  We cannot release the lock before
-	   waking the waiting threads since otherwise a new thread might
-	   arrive and gets waken up, too.  */
-	LOCK
-	decl	MUTEX(%rdi)
-	jne	9f
-
-10:	xorl	%eax, %eax		/* != PTHREAD_BARRIER_SERIAL_THREAD */
-
-	retq
-
-	/* The necessary number of threads arrived.  */
-3:
-#if CURR_EVENT == 0
-	incl	(%rdi)
-#else
-	incl	CURR_EVENT(%rdi)
-#endif
-
-	/* Wake up all waiters.  The count is a signed number in the kernel
-	   so 0x7fffffff is the highest value.  */
-	movl	$0x7fffffff, %edx
-	movl	$FUTEX_WAKE, %esi
-	orl	PRIVATE(%rdi), %esi
-	movl	$SYS_futex, %eax
-	syscall
-
-	/* Increment LEFT.  If this brings the count back to the
-	   initial count unlock the object.  */
-	movl	$1, %edx
-	movl	INIT_COUNT(%rdi), %eax
-	LOCK
-	xaddl	%edx, LEFT(%rdi)
-	subl	$1, %eax
-	cmpl	%eax, %edx
-	jne,pt	5f
-
-	/* Release the mutex.  We cannot release the lock before
-	   waking the waiting threads since otherwise a new thread might
-	   arrive and gets waken up, too.  */
-	LOCK
-	decl	MUTEX(%rdi)
-	jne	4f
-
-5:	orl	$-1, %eax		/* == PTHREAD_BARRIER_SERIAL_THREAD */
-
-	retq
-
-1:	movl	PRIVATE(%rdi), %esi
-	addq	$MUTEX, %rdi
-	xorl	$LLL_SHARED, %esi
-	callq	__lll_lock_wait
-	subq	$MUTEX, %rdi
-	jmp	2b
-
-4:	movl	PRIVATE(%rdi), %esi
-	addq	$MUTEX, %rdi
-	xorl	$LLL_SHARED, %esi
-	callq	__lll_unlock_wake
-	jmp	5b
-
-6:	movl	PRIVATE(%rdi), %esi
-	addq	$MUTEX, %rdi
-	xorl	$LLL_SHARED, %esi
-	callq	__lll_unlock_wake
-	subq	$MUTEX, %rdi
-	jmp	7b
-
-9:	movl	PRIVATE(%rdi), %esi
-	addq	$MUTEX, %rdi
-	xorl	$LLL_SHARED, %esi
-	callq	__lll_unlock_wake
-	jmp	10b
-	.size	__pthread_barrier_wait,.-__pthread_barrier_wait
-weak_alias (__pthread_barrier_wait, pthread_barrier_wait)