commit 62ac665ff9fc07497ca524bd20d6a96893d11071
Author: Mikulas Patocka <mpatocka@redhat.com>
Date:   Wed Sep 26 07:46:43 2012 +0200

blockdev: turn a rw semaphore into a percpu rw semaphore

This avoids cache line bouncing when many processes lock the semaphore
for read.

New percpu lock implementation

The lock consists of an array of percpu unsigned integers, a boolean
variable and a mutex.

When we take the lock for read, we enter an RCU read-side section and
check the "locked" variable. If it is false, we increment the percpu
counter on the current cpu and exit the RCU section. If "locked" is
true, we exit the RCU section, take the mutex and drop it (this waits
until the writer has finished) and retry.

Unlocking for read just decrements the percpu variable. Note that we can
unlock on a different cpu than the one where we locked; in that case the
counter underflows. The sum of all percpu counters represents the number
of processes that hold the lock for read.
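
As a worked example (a hypothetical two-cpu machine, not part of the
original message): a reader that locks on cpu 0 and unlocks on cpu 1
leaves the unsigned counters at { 1, 0xffffffff }; their sum modulo
2^32 is 0, correctly reporting that no reader holds the lock.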

When we need to lock for write, we take the mutex, set the "locked"
variable to true and call synchronize_rcu(). Once RCU has been
synchronized, every reader that entered before the flag was set has
already incremented its percpu counter, and any new reader sees
"locked" == true and blocks on the mutex, so no new read locks can
appear. We then wait until the sum of the percpu counters is zero;
when it is, there are no readers in the critical section.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Index: linux-3.6.x86_64/Documentation/percpu-rw-semaphore.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.6.x86_64/Documentation/percpu-rw-semaphore.txt	2012-11-16 17:12:57.351936583 -0500
@@ -0,0 +1,39 @@
+Percpu rw semaphores
+--------------------
+
+Percpu rw semaphores are a new read-write semaphore design that is
+optimized for locking for reading.
+
+The problem with traditional read-write semaphores is that when multiple
+cores take the lock for reading, the cache line containing the semaphore
+is bouncing between L1 caches of the cores, causing performance
+degradation.
+
+Locking for reading is very fast: it uses RCU and avoids any atomic
+instruction in the lock and unlock path. On the other hand, locking for
+writing is very expensive: it calls synchronize_rcu(), which can take
+hundreds of microseconds.
+
+The lock is declared as "struct percpu_rw_semaphore".
+The lock is initialized with percpu_init_rwsem; it returns 0 on success
+and -ENOMEM on allocation failure.
+The lock must be freed with percpu_free_rwsem to avoid a memory leak.
+
+The lock is taken for read with percpu_down_read and released with
+percpu_up_read; for write, use percpu_down_write and percpu_up_write,
+as in the example below.
+
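+Example (a sketch: "sem" stands for a percpu_rw_semaphore that has been
+initialized with percpu_init_rwsem):
+
+	percpu_down_read(&sem);
+	/* read the data protected by the lock */
+	percpu_up_read(&sem);
+
+	percpu_down_write(&sem);
+	/* modify the data protected by the lock */
+	percpu_up_write(&sem);
+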
+The idea of using RCU for an optimized rw-lock was introduced by
+Eric Dumazet <eric.dumazet@gmail.com>.
+The code was written by Mikulas Patocka <mpatocka@redhat.com>.
Index: linux-3.6.x86_64/fs/block_dev.c
===================================================================
--- linux-3.6.x86_64.orig/fs/block_dev.c	2012-11-16 17:12:37.381002516 -0500
+++ linux-3.6.x86_64/fs/block_dev.c	2012-11-16 17:27:41.217005828 -0500
@@ -127,7 +127,7 @@
 		return -EINVAL;
 
 	/* Prevent starting I/O or mapping the device */
-	down_write(&bdev->bd_block_size_semaphore);
+	percpu_down_write(&bdev->bd_block_size_semaphore);
 
 	/* Check that the block device is not memory mapped */
 	mapping = bdev->bd_inode->i_mapping;
@@ -135,7 +135,7 @@
 	if (!prio_tree_empty(&mapping->i_mmap) ||
 	    !list_empty(&mapping->i_mmap_nonlinear)) {
 		mutex_unlock(&mapping->i_mmap_mutex);
-		up_write(&bdev->bd_block_size_semaphore);
+		percpu_up_write(&bdev->bd_block_size_semaphore);
 		return -EBUSY;
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
@@ -148,7 +148,7 @@
 		kill_bdev(bdev);
 	}
 
-	up_write(&bdev->bd_block_size_semaphore);
+	percpu_up_write(&bdev->bd_block_size_semaphore);
 
 	return 0;
 }
@@ -460,6 +460,12 @@
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
+
+	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+		kmem_cache_free(bdev_cachep, ei);
+		return NULL;
+	}
+
 	return &ei->vfs_inode;
 }
 
@@ -468,6 +474,8 @@
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
+	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -491,7 +499,6 @@
 	inode_init_once(&ei->vfs_inode);
 	/* Initialize mutex for freeze. */
 	mutex_init(&bdev->bd_fsfreeze_mutex);
-	init_rwsem(&bdev->bd_block_size_semaphore);
 }
 
 static inline void __bd_forget(struct inode *inode)
@@ -1593,11 +1600,11 @@
 	ssize_t ret;
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	return ret;
 }
@@ -1622,7 +1629,7 @@
 
 	blk_start_plug(&plug);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
@@ -1633,7 +1640,7 @@
 		ret = err;
 	}
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	blk_finish_plug(&plug);
 
@@ -1646,11 +1653,11 @@
 	int ret;
 	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = generic_file_mmap(file, vma);
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	return ret;
 }
Index: linux-3.6.x86_64/include/linux/fs.h
===================================================================
--- linux-3.6.x86_64.orig/include/linux/fs.h	2012-11-16 17:12:37.424002387 -0500
+++ linux-3.6.x86_64/include/linux/fs.h	2012-11-16 17:28:12.578901349 -0500
@@ -415,6 +415,7 @@
 #include <linux/migrate_mode.h>
 #include <linux/uidgid.h>
 #include <linux/lockdep.h>
+#include <linux/percpu-rwsem.h>
 
 #include <asm/byteorder.h>
 
@@ -725,7 +726,7 @@
 	/* Mutex for freeze */
 	struct mutex bd_fsfreeze_mutex;
 	/* A semaphore that prevents I/O while block size is being changed */
-	struct rw_semaphore bd_block_size_semaphore;
+	struct percpu_rw_semaphore bd_block_size_semaphore;
 };
 
 /*
Index: linux-3.6.x86_64/include/linux/percpu-rwsem.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.6.x86_64/include/linux/percpu-rwsem.h	2012-11-16 17:12:57.354936574 -0500
@@ -0,0 +1,89 @@
+#ifndef _LINUX_PERCPU_RWSEM_H
+#define _LINUX_PERCPU_RWSEM_H
+
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+
+struct percpu_rw_semaphore {
+	unsigned __percpu *counters;
+	bool locked;
+	struct mutex mtx;
+};
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *p)
+{
+	rcu_read_lock();
+	if (unlikely(p->locked)) {
+		rcu_read_unlock();
+		mutex_lock(&p->mtx);
+		this_cpu_inc(*p->counters);
+		mutex_unlock(&p->mtx);
+		return;
+	}
+	this_cpu_inc(*p->counters);
+	rcu_read_unlock();
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *p)
+{
+	/*
+	 * On X86, the write operation in this_cpu_dec serves as a memory
+	 * unlock barrier (i.e. memory accesses may be moved before the
+	 * write, but no memory accesses are moved past the write).
+	 * On other architectures this may not be the case, so we need
+	 * smp_mb() there.
+	 */
+#if defined(CONFIG_X86) && (!defined(CONFIG_X86_PPRO_FENCE) && !defined(CONFIG_X86_OOSTORE))
+	barrier();
+#else
+	smp_mb();
+#endif
+	this_cpu_dec(*p->counters);
+}
+
+static inline unsigned __percpu_count(unsigned __percpu *counters)
+{
+	unsigned total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu));
+
+	return total;
+}
+
+static inline void percpu_down_write(struct percpu_rw_semaphore *p)
+{
+	mutex_lock(&p->mtx);
+	p->locked = true;
+	synchronize_rcu();
+	while (__percpu_count(p->counters))
+		msleep(1);
+	smp_rmb(); /* paired with smp_mb() in percpu_up_read() */
+}
+
+static inline void percpu_up_write(struct percpu_rw_semaphore *p)
+{
+	p->locked = false;
+	mutex_unlock(&p->mtx);
+}
+
+static inline int percpu_init_rwsem(struct percpu_rw_semaphore *p)
+{
+	p->counters = alloc_percpu(unsigned);
+	if (unlikely(!p->counters))
+		return -ENOMEM;
+	p->locked = false;
+	mutex_init(&p->mtx);
+	return 0;
+}
+
+static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p)
+{
+	free_percpu(p->counters);
+	p->counters = NULL; /* catch use after free bugs */
+}
+
+#endif