215 lines
6.8 KiB
Diff
215 lines
6.8 KiB
Diff
Fix a crash when block device is read and block size is changed at the same time
|
|
|
|
commit b87570f5d349661814b262dd5fc40787700f80d6
|
|
Author: Mikulas Patocka <mpatocka@redhat.com>
|
|
Date: Wed Sep 26 07:46:40 2012 +0200
|
|
|
|
Fix a crash when block device is read and block size is changed at the same time
|
|
|
|
The kernel may crash when block size is changed and I/O is issued
|
|
simultaneously.
|
|
|
|
Because some subsystems (udev or lvm) may read any block device anytime,
|
|
the bug actually puts any code that changes a block device's block size in
|
|
jeopardy.
|
|
|
|
The crash can be reproduced if you place "msleep(1000)" in
|
|
blkdev_get_blocks just before "bh->b_size = max_blocks <<
|
|
inode->i_blkbits;".
|
|
Then, run "dd if=/dev/ram0 of=/dev/null bs=4k count=1 iflag=direct"
|
|
While it is waiting in msleep, run "blockdev --setbsz 2048 /dev/ram0"
|
|
You get a BUG.
|
|
|
|
The direct and non-direct I/O code is written with the assumption that block
|
|
size does not change. It doesn't seem practical to fix these crashes
|
|
one-by-one: there may be many crash possibilities when block size changes
|
|
at a certain place and it is impossible to find them all and verify the
|
|
code.
|
|
|
|
This patch introduces a new rw-lock bd_block_size_semaphore. The lock is
|
|
taken for read during I/O. It is taken for write when changing block
|
|
size. Consequently, block size can't be changed while I/O is being
|
|
submitted.
|
|
|
|
For asynchronous I/O, the patch only prevents block size change while
|
|
the I/O is being submitted. The block size can change when the I/O is in
|
|
progress or when the I/O is being finished. This is acceptable because
|
|
there are no accesses to block size when asynchronous I/O is being
|
|
finished.
|
|
|
|
The patch also prevents block size changes while the device is mapped with
|
|
mmap.
|
|
|
|
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
|
|
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
|
|
Index: linux-3.6.x86_64/drivers/char/raw.c
|
|
===================================================================
|
|
--- linux-3.6.x86_64.orig/drivers/char/raw.c 2012-11-16 17:12:35.127010280 -0500
|
|
+++ linux-3.6.x86_64/drivers/char/raw.c 2012-11-16 17:12:37.381002516 -0500
|
|
@@ -285,7 +285,7 @@
|
|
|
|
static const struct file_operations raw_fops = {
|
|
.read = do_sync_read,
|
|
- .aio_read = generic_file_aio_read,
|
|
+ .aio_read = blkdev_aio_read,
|
|
.write = do_sync_write,
|
|
.aio_write = blkdev_aio_write,
|
|
.fsync = blkdev_fsync,
|
|
Index: linux-3.6.x86_64/fs/block_dev.c
|
|
===================================================================
|
|
--- linux-3.6.x86_64.orig/fs/block_dev.c 2012-11-16 17:12:35.127010280 -0500
|
|
+++ linux-3.6.x86_64/fs/block_dev.c 2012-11-16 17:12:37.381002516 -0500
|
|
@@ -116,6 +116,8 @@
|
|
|
|
int set_blocksize(struct block_device *bdev, int size)
|
|
{
|
|
+ struct address_space *mapping;
|
|
+
|
|
/* Size must be a power of two, and between 512 and PAGE_SIZE */
|
|
if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
|
|
return -EINVAL;
|
|
@@ -124,6 +126,20 @@
|
|
if (size < bdev_logical_block_size(bdev))
|
|
return -EINVAL;
|
|
|
|
+ /* Prevent starting I/O or mapping the device */
|
|
+ down_write(&bdev->bd_block_size_semaphore);
|
|
+
|
|
+ /* Check that the block device is not memory mapped */
|
|
+ mapping = bdev->bd_inode->i_mapping;
|
|
+ mutex_lock(&mapping->i_mmap_mutex);
|
|
+ if (!prio_tree_empty(&mapping->i_mmap) ||
|
|
+ !list_empty(&mapping->i_mmap_nonlinear)) {
|
|
+ mutex_unlock(&mapping->i_mmap_mutex);
|
|
+ up_write(&bdev->bd_block_size_semaphore);
|
|
+ return -EBUSY;
|
|
+ }
|
|
+ mutex_unlock(&mapping->i_mmap_mutex);
|
|
+
|
|
/* Don't change the size if it is same as current */
|
|
if (bdev->bd_block_size != size) {
|
|
sync_blockdev(bdev);
|
|
@@ -131,6 +147,9 @@
|
|
bdev->bd_inode->i_blkbits = blksize_bits(size);
|
|
kill_bdev(bdev);
|
|
}
|
|
+
|
|
+ up_write(&bdev->bd_block_size_semaphore);
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
@@ -472,6 +491,7 @@
|
|
inode_init_once(&ei->vfs_inode);
|
|
/* Initialize mutex for freeze. */
|
|
mutex_init(&bdev->bd_fsfreeze_mutex);
|
|
+ init_rwsem(&bdev->bd_block_size_semaphore);
|
|
}
|
|
|
|
static inline void __bd_forget(struct inode *inode)
|
|
@@ -1567,6 +1587,22 @@
|
|
return blkdev_ioctl(bdev, mode, cmd, arg);
|
|
}
|
|
|
|
+ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
|
|
+ unsigned long nr_segs, loff_t pos)
|
|
+{
|
|
+ ssize_t ret;
|
|
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
|
|
+
|
|
+ down_read(&bdev->bd_block_size_semaphore);
|
|
+
|
|
+ ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
|
|
+
|
|
+ up_read(&bdev->bd_block_size_semaphore);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(blkdev_aio_read);
|
|
+
|
|
/*
|
|
* Write data to the block device. Only intended for the block device itself
|
|
* and the raw driver which basically is a fake block device.
|
|
@@ -1578,12 +1614,16 @@
|
|
unsigned long nr_segs, loff_t pos)
|
|
{
|
|
struct file *file = iocb->ki_filp;
|
|
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
|
|
struct blk_plug plug;
|
|
ssize_t ret;
|
|
|
|
BUG_ON(iocb->ki_pos != pos);
|
|
|
|
blk_start_plug(&plug);
|
|
+
|
|
+ down_read(&bdev->bd_block_size_semaphore);
|
|
+
|
|
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
|
|
if (ret > 0 || ret == -EIOCBQUEUED) {
|
|
ssize_t err;
|
|
@@ -1592,11 +1632,29 @@
|
|
if (err < 0 && ret > 0)
|
|
ret = err;
|
|
}
|
|
+
|
|
+ up_read(&bdev->bd_block_size_semaphore);
|
|
+
|
|
blk_finish_plug(&plug);
|
|
+
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(blkdev_aio_write);
|
|
|
|
+int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
|
|
+{
|
|
+ int ret;
|
|
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
|
|
+
|
|
+ down_read(&bdev->bd_block_size_semaphore);
|
|
+
|
|
+ ret = generic_file_mmap(file, vma);
|
|
+
|
|
+ up_read(&bdev->bd_block_size_semaphore);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
/*
|
|
* Try to release a page associated with block device when the system
|
|
* is under memory pressure.
|
|
@@ -1627,9 +1685,9 @@
|
|
.llseek = block_llseek,
|
|
.read = do_sync_read,
|
|
.write = do_sync_write,
|
|
- .aio_read = generic_file_aio_read,
|
|
+ .aio_read = blkdev_aio_read,
|
|
.aio_write = blkdev_aio_write,
|
|
- .mmap = generic_file_mmap,
|
|
+ .mmap = blkdev_mmap,
|
|
.fsync = blkdev_fsync,
|
|
.unlocked_ioctl = block_ioctl,
|
|
#ifdef CONFIG_COMPAT
|
|
Index: linux-3.6.x86_64/include/linux/fs.h
|
|
===================================================================
|
|
--- linux-3.6.x86_64.orig/include/linux/fs.h 2012-11-16 17:12:35.127010280 -0500
|
|
+++ linux-3.6.x86_64/include/linux/fs.h 2012-11-16 17:12:37.424002387 -0500
|
|
@@ -724,6 +724,8 @@
|
|
int bd_fsfreeze_count;
|
|
/* Mutex for freeze */
|
|
struct mutex bd_fsfreeze_mutex;
|
|
+ /* A semaphore that prevents I/O while block size is being changed */
|
|
+ struct rw_semaphore bd_block_size_semaphore;
|
|
};
|
|
|
|
/*
|
|
@@ -2564,6 +2566,8 @@
|
|
unsigned long *nr_segs, size_t *count, int access_flags);
|
|
|
|
/* fs/block_dev.c */
|
|
+extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
|
|
+ unsigned long nr_segs, loff_t pos);
|
|
extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
|
|
unsigned long nr_segs, loff_t pos);
|
|
extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
|