b4b29f9485
The Cavium guys reported a soft lockup on their arm64 machine, caused by commitc55a6ffa62
("locking/osq: Relax atomic semantics"): mutex_optimistic_spin+0x9c/0x1d0 __mutex_lock_slowpath+0x44/0x158 mutex_lock+0x54/0x58 kernfs_iop_permission+0x38/0x70 __inode_permission+0x88/0xd8 inode_permission+0x30/0x6c link_path_walk+0x68/0x4d4 path_openat+0xb4/0x2bc do_filp_open+0x74/0xd0 do_sys_open+0x14c/0x228 SyS_openat+0x3c/0x48 el0_svc_naked+0x24/0x28 This is because in osq_lock we initialise the node for the current CPU: node->locked = 0; node->next = NULL; node->cpu = curr; and then publish the current CPU in the lock tail: old = atomic_xchg_acquire(&lock->tail, curr); Once the update to lock->tail is visible to another CPU, the node is then live and can be both read and updated by concurrent lockers. Unfortunately, the ACQUIRE semantics of the xchg operation mean that there is no guarantee the contents of the node will be visible before lock tail is updated. This can lead to lock corruption when, for example, a concurrent locker races to set the next field. Fixes:c55a6ffa62
("locking/osq: Relax atomic semantics"): Reported-by: David Daney <ddaney@caviumnetworks.com> Reported-by: Andrew Pinski <andrew.pinski@caviumnetworks.com> Tested-by: Andrew Pinski <andrew.pinski@caviumnetworks.com> Acked-by: Davidlohr Bueso <dave@stgolabs.net> Signed-off-by: Will Deacon <will.deacon@arm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: http://lkml.kernel.org/r/1449856001-21177-1-git-send-email-will.deacon@arm.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
211 lines
5.1 KiB
C
211 lines
5.1 KiB
C
#include <linux/percpu.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/osq_lock.h>
|
|
|
|
/*
|
|
* An MCS like lock especially tailored for optimistic spinning for sleeping
|
|
* lock implementations (mutex, rwsem, etc).
|
|
*
|
|
* Using a single mcs node per CPU is safe because sleeping locks should not be
|
|
* called from interrupt context and we have preemption disabled while
|
|
* spinning.
|
|
*/
|
|
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
|
|
|
|
/*
|
|
* We use the value 0 to represent "no CPU", thus the encoded value
|
|
* will be the CPU number incremented by 1.
|
|
*/
|
|
static inline int encode_cpu(int cpu_nr)
|
|
{
|
|
return cpu_nr + 1;
|
|
}
|
|
|
|
static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
|
|
{
|
|
int cpu_nr = encoded_cpu_val - 1;
|
|
|
|
return per_cpu_ptr(&osq_node, cpu_nr);
|
|
}
|
|
|
|
/*
|
|
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
|
|
* Can return NULL in case we were the last queued and we updated @lock instead.
|
|
*/
|
|
static inline struct optimistic_spin_node *
|
|
osq_wait_next(struct optimistic_spin_queue *lock,
|
|
struct optimistic_spin_node *node,
|
|
struct optimistic_spin_node *prev)
|
|
{
|
|
struct optimistic_spin_node *next = NULL;
|
|
int curr = encode_cpu(smp_processor_id());
|
|
int old;
|
|
|
|
/*
|
|
* If there is a prev node in queue, then the 'old' value will be
|
|
* the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
|
|
* we're currently last in queue, then the queue will then become empty.
|
|
*/
|
|
old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
|
|
|
|
for (;;) {
|
|
if (atomic_read(&lock->tail) == curr &&
|
|
atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
|
|
/*
|
|
* We were the last queued, we moved @lock back. @prev
|
|
* will now observe @lock and will complete its
|
|
* unlock()/unqueue().
|
|
*/
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* We must xchg() the @node->next value, because if we were to
|
|
* leave it in, a concurrent unlock()/unqueue() from
|
|
* @node->next might complete Step-A and think its @prev is
|
|
* still valid.
|
|
*
|
|
* If the concurrent unlock()/unqueue() wins the race, we'll
|
|
* wait for either @lock to point to us, through its Step-B, or
|
|
* wait for a new @node->next from its Step-C.
|
|
*/
|
|
if (node->next) {
|
|
next = xchg(&node->next, NULL);
|
|
if (next)
|
|
break;
|
|
}
|
|
|
|
cpu_relax_lowlatency();
|
|
}
|
|
|
|
return next;
|
|
}
|
|
|
|
bool osq_lock(struct optimistic_spin_queue *lock)
|
|
{
|
|
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
|
|
struct optimistic_spin_node *prev, *next;
|
|
int curr = encode_cpu(smp_processor_id());
|
|
int old;
|
|
|
|
node->locked = 0;
|
|
node->next = NULL;
|
|
node->cpu = curr;
|
|
|
|
/*
|
|
* We need both ACQUIRE (pairs with corresponding RELEASE in
|
|
* unlock() uncontended, or fastpath) and RELEASE (to publish
|
|
* the node fields we just initialised) semantics when updating
|
|
* the lock tail.
|
|
*/
|
|
old = atomic_xchg(&lock->tail, curr);
|
|
if (old == OSQ_UNLOCKED_VAL)
|
|
return true;
|
|
|
|
prev = decode_cpu(old);
|
|
node->prev = prev;
|
|
WRITE_ONCE(prev->next, node);
|
|
|
|
/*
|
|
* Normally @prev is untouchable after the above store; because at that
|
|
* moment unlock can proceed and wipe the node element from stack.
|
|
*
|
|
* However, since our nodes are static per-cpu storage, we're
|
|
* guaranteed their existence -- this allows us to apply
|
|
* cmpxchg in an attempt to undo our queueing.
|
|
*/
|
|
|
|
while (!READ_ONCE(node->locked)) {
|
|
/*
|
|
* If we need to reschedule bail... so we can block.
|
|
*/
|
|
if (need_resched())
|
|
goto unqueue;
|
|
|
|
cpu_relax_lowlatency();
|
|
}
|
|
return true;
|
|
|
|
unqueue:
|
|
/*
|
|
* Step - A -- stabilize @prev
|
|
*
|
|
* Undo our @prev->next assignment; this will make @prev's
|
|
* unlock()/unqueue() wait for a next pointer since @lock points to us
|
|
* (or later).
|
|
*/
|
|
|
|
for (;;) {
|
|
if (prev->next == node &&
|
|
cmpxchg(&prev->next, node, NULL) == node)
|
|
break;
|
|
|
|
/*
|
|
* We can only fail the cmpxchg() racing against an unlock(),
|
|
* in which case we should observe @node->locked becomming
|
|
* true.
|
|
*/
|
|
if (smp_load_acquire(&node->locked))
|
|
return true;
|
|
|
|
cpu_relax_lowlatency();
|
|
|
|
/*
|
|
* Or we race against a concurrent unqueue()'s step-B, in which
|
|
* case its step-C will write us a new @node->prev pointer.
|
|
*/
|
|
prev = READ_ONCE(node->prev);
|
|
}
|
|
|
|
/*
|
|
* Step - B -- stabilize @next
|
|
*
|
|
* Similar to unlock(), wait for @node->next or move @lock from @node
|
|
* back to @prev.
|
|
*/
|
|
|
|
next = osq_wait_next(lock, node, prev);
|
|
if (!next)
|
|
return false;
|
|
|
|
/*
|
|
* Step - C -- unlink
|
|
*
|
|
* @prev is stable because its still waiting for a new @prev->next
|
|
* pointer, @next is stable because our @node->next pointer is NULL and
|
|
* it will wait in Step-A.
|
|
*/
|
|
|
|
WRITE_ONCE(next->prev, prev);
|
|
WRITE_ONCE(prev->next, next);
|
|
|
|
return false;
|
|
}
|
|
|
|
void osq_unlock(struct optimistic_spin_queue *lock)
|
|
{
|
|
struct optimistic_spin_node *node, *next;
|
|
int curr = encode_cpu(smp_processor_id());
|
|
|
|
/*
|
|
* Fast path for the uncontended case.
|
|
*/
|
|
if (likely(atomic_cmpxchg_release(&lock->tail, curr,
|
|
OSQ_UNLOCKED_VAL) == curr))
|
|
return;
|
|
|
|
/*
|
|
* Second most likely case.
|
|
*/
|
|
node = this_cpu_ptr(&osq_node);
|
|
next = xchg(&node->next, NULL);
|
|
if (next) {
|
|
WRITE_ONCE(next->locked, 1);
|
|
return;
|
|
}
|
|
|
|
next = osq_wait_next(lock, node, NULL);
|
|
if (next)
|
|
WRITE_ONCE(next->locked, 1);
|
|
}
|