kernel-ark/fs/reiserfs/resize.c
Frederic Weisbecker 8ebc423238 reiserfs: kill-the-BKL
This patch is an attempt to remove the Bkl based locking scheme from
reiserfs and is intended.

It is a bit inspired from an old attempt by Peter Zijlstra:

   http://lkml.indiana.edu/hypermail/linux/kernel/0704.2/2174.html

The bkl is heavily used in this filesystem to prevent from
concurrent write accesses on the filesystem.

Reiserfs makes a deep use of the specific properties of the Bkl:

- It can be acqquired recursively by a same task
- It is released on the schedule() calls and reacquired when schedule() returns

The two properties above are a roadmap for the reiserfs write locking so it's
very hard to simply replace it with a common mutex.

- We need a recursive-able locking unless we want to restructure several blocks
  of the code.
- We need to identify the sites where the bkl was implictly relaxed
  (schedule, wait, sync, etc...) so that we can in turn release and
  reacquire our new lock explicitly.
  Such implicit releases of the lock are often required to let other
  resources producer/consumer do their job or we can suffer unexpected
  starvations or deadlocks.

So the new lock that replaces the bkl here is a per superblock mutex with a
specific property: it can be acquired recursively by a same task, like the
bkl.

For such purpose, we integrate a lock owner and a lock depth field on the
superblock information structure.

The first axis on this patch is to turn reiserfs_write_(un)lock() function
into a wrapper to manage this mutex. Also some explicit calls to
lock_kernel() have been converted to reiserfs_write_lock() helpers.

The second axis is to find the important blocking sites (schedule...(),
wait_on_buffer(), sync_dirty_buffer(), etc...) and then apply an explicit
release of the write lock on these locations before blocking. Then we can
safely wait for those who can give us resources or those who need some.
Typically this is a fight between the current writer, the reiserfs workqueue
(aka the async commiter) and the pdflush threads.

The third axis is a consequence of the second. The write lock is usually
on top of a lock dependency chain which can include the journal lock, the
flush lock or the commit lock. So it's dangerous to release and trying to
reacquire the write lock while we still hold other locks.

This is fine with the bkl:

      T1                       T2

lock_kernel()
    mutex_lock(A)
    unlock_kernel()
    // do something
                            lock_kernel()
                                mutex_lock(A) -> already locked by T1
                                schedule() (and then unlock_kernel())
    lock_kernel()
    mutex_unlock(A)
    ....

This is not fine with a mutex:

      T1                       T2

mutex_lock(write)
    mutex_lock(A)
    mutex_unlock(write)
    // do something
                           mutex_lock(write)
                              mutex_lock(A) -> already locked by T1
                              schedule()

    mutex_lock(write) -> already locked by T2
    deadlock

The solution in this patch is to provide a helper which releases the write
lock and sleep a bit if we can't lock a mutex that depend on it. It's another
simulation of the bkl behaviour.

The last axis is to locate the fs callbacks that are called with the bkl held,
according to Documentation/filesystem/Locking.

Those are:

- reiserfs_remount
- reiserfs_fill_super
- reiserfs_put_super

Reiserfs didn't need to explicitly lock because of the context of these callbacks.
But now we must take care of that with the new locking.

After this patch, reiserfs suffers from a slight performance regression (for now).
On UP, a high volume write with dd reports an average of 27 MB/s instead
of 30 MB/s without the patch applied.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Bron Gondwana <brong@fastmail.fm>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
LKML-Reference: <1239070789-13354-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-14 07:17:59 +02:00

213 lines
6.2 KiB
C

/*
* Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
*/
/*
* Written by Alexander Zarochentcev.
*
* The kernel part of the (on-line) reiserfs resizer.
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_fs_sb.h>
#include <linux/buffer_head.h>
int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
{
int err = 0;
struct reiserfs_super_block *sb;
struct reiserfs_bitmap_info *bitmap;
struct reiserfs_bitmap_info *info;
struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s);
struct buffer_head *bh;
struct reiserfs_transaction_handle th;
unsigned int bmap_nr_new, bmap_nr;
unsigned int block_r_new, block_r;
struct reiserfs_list_bitmap *jb;
struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS];
unsigned long int block_count, free_blocks;
int i;
int copy_size;
sb = SB_DISK_SUPER_BLOCK(s);
if (SB_BLOCK_COUNT(s) >= block_count_new) {
printk("can\'t shrink filesystem on-line\n");
return -EINVAL;
}
/* check the device size */
bh = sb_bread(s, block_count_new - 1);
if (!bh) {
printk("reiserfs_resize: can\'t read last block\n");
return -EINVAL;
}
bforget(bh);
/* old disk layout detection; those partitions can be mounted, but
* cannot be resized */
if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
!= REISERFS_DISK_OFFSET_IN_BYTES) {
printk
("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n");
return -ENOTSUPP;
}
/* count used bits in last bitmap block */
block_r = SB_BLOCK_COUNT(s) -
(reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8;
/* count bitmap blocks in new fs */
bmap_nr_new = block_count_new / (s->s_blocksize * 8);
block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8;
if (block_r_new)
bmap_nr_new++;
else
block_r_new = s->s_blocksize * 8;
/* save old values */
block_count = SB_BLOCK_COUNT(s);
bmap_nr = reiserfs_bmap_count(s);
/* resizing of reiserfs bitmaps (journal and real), if needed */
if (bmap_nr_new > bmap_nr) {
/* reallocate journal bitmaps */
if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
printk
("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
return -ENOMEM;
}
/* the new journal bitmaps are zero filled, now we copy in the bitmap
** node pointers from the old journal bitmap structs, and then
** transfer the new data structures into the journal struct.
**
** using the copy_size var below allows this code to work for
** both shrinking and expanding the FS.
*/
copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr;
copy_size =
copy_size * sizeof(struct reiserfs_list_bitmap_node *);
for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
struct reiserfs_bitmap_node **node_tmp;
jb = SB_JOURNAL(s)->j_list_bitmap + i;
memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
/* just in case vfree schedules on us, copy the new
** pointer into the journal struct before freeing the
** old one
*/
node_tmp = jb->bitmaps;
jb->bitmaps = jbitmap[i].bitmaps;
vfree(node_tmp);
}
/* allocate additional bitmap blocks, reallocate array of bitmap
* block pointers */
bitmap =
vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
if (!bitmap) {
/* Journal bitmaps are still supersized, but the memory isn't
* leaked, so I guess it's ok */
printk("reiserfs_resize: unable to allocate memory.\n");
return -ENOMEM;
}
memset(bitmap, 0,
sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
for (i = 0; i < bmap_nr; i++)
bitmap[i] = old_bitmap[i];
/* This doesn't go through the journal, but it doesn't have to.
* The changes are still atomic: We're synced up when the journal
* transaction begins, and the new bitmaps don't matter if the
* transaction fails. */
for (i = bmap_nr; i < bmap_nr_new; i++) {
/* don't use read_bitmap_block since it will cache
* the uninitialized bitmap */
bh = sb_bread(s, i * s->s_blocksize * 8);
if (!bh) {
vfree(bitmap);
return -EIO;
}
memset(bh->b_data, 0, sb_blocksize(sb));
reiserfs_test_and_set_le_bit(0, bh->b_data);
reiserfs_cache_bitmap_metadata(s, bh, bitmap + i);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
reiserfs_write_unlock(s);
sync_dirty_buffer(bh);
reiserfs_write_lock(s);
// update bitmap_info stuff
bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
brelse(bh);
}
/* free old bitmap blocks array */
SB_AP_BITMAP(s) = bitmap;
vfree(old_bitmap);
}
/* begin transaction, if there was an error, it's fine. Yes, we have
* incorrect bitmaps now, but none of it is ever going to touch the
* disk anyway. */
err = journal_begin(&th, s, 10);
if (err)
return err;
/* Extend old last bitmap block - new blocks have been made available */
info = SB_AP_BITMAP(s) + bmap_nr - 1;
bh = reiserfs_read_bitmap_block(s, bmap_nr - 1);
if (!bh) {
int jerr = journal_end(&th, s, 10);
if (jerr)
return jerr;
return -EIO;
}
reiserfs_prepare_for_journal(s, bh, 1);
for (i = block_r; i < s->s_blocksize * 8; i++)
reiserfs_test_and_clear_le_bit(i, bh->b_data);
info->free_count += s->s_blocksize * 8 - block_r;
journal_mark_dirty(&th, s, bh);
brelse(bh);
/* Correct new last bitmap block - It may not be full */
info = SB_AP_BITMAP(s) + bmap_nr_new - 1;
bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1);
if (!bh) {
int jerr = journal_end(&th, s, 10);
if (jerr)
return jerr;
return -EIO;
}
reiserfs_prepare_for_journal(s, bh, 1);
for (i = block_r_new; i < s->s_blocksize * 8; i++)
reiserfs_test_and_set_le_bit(i, bh->b_data);
journal_mark_dirty(&th, s, bh);
brelse(bh);
info->free_count -= s->s_blocksize * 8 - block_r_new;
/* update super */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
free_blocks = SB_FREE_BLOCKS(s);
PUT_SB_FREE_BLOCKS(s,
free_blocks + (block_count_new - block_count -
(bmap_nr_new - bmap_nr)));
PUT_SB_BLOCK_COUNT(s, block_count_new);
PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
s->s_dirt = 1;
journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
SB_JOURNAL(s)->j_must_wait = 1;
return journal_end(&th, s, 10);
}