Add even more btrfs corruption/error fixes

This commit is contained in:
Josh Boyer 2014-10-17 13:27:18 -04:00
parent 8139576baf
commit e46472556a
14 changed files with 989 additions and 8 deletions

View File

@ -0,0 +1,54 @@
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 11 Sep 2014 11:44:49 +0100
Subject: [PATCH] Btrfs: add missing compression property remove in
btrfs_ioctl_setflags
(cherry picked from commit 78a017a2c92df9b571db0a55a016280f9019c65e)
The behaviour of a 'chattr -c' consists of getting the current flags,
clearing the FS_COMPR_FL bit and then sending the result to the set
flags ioctl - this means the bit FS_NOCOMP_FL isn't set in the flags
passed to the ioctl. This results in the compression property not being
cleared from the inode - it was cleared only if the bit FS_NOCOMP_FL
was set in the received flags.
Reproducer:
$ mkfs.btrfs -f /dev/sdd
$ mount /dev/sdd /mnt && cd /mnt
$ mkdir a
$ chattr +c a
$ touch a/file
$ lsattr a/file
--------c------- a/file
$ chattr -c a
$ touch a/file2
$ lsattr a/file2
--------c------- a/file2
$ lsattr -d a
---------------- a
Reported-by: Andreas Schneider <asn@cryptomilk.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/ioctl.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 13d577b39bfa..91269bd9ad05 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
goto out_drop;
} else {
+ ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+ if (ret && ret != -ENODATA)
+ goto out_drop;
ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
--
1.9.3

View File

@ -0,0 +1,215 @@
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 19 Sep 2014 10:40:00 -0400
Subject: [PATCH] Btrfs: cleanup error handling in build_backref_tree
(cherry picked from commit 75bfb9aff45e44625260f52a5fd581b92ace3e62)
When balance panics it tends to panic in the
BUG_ON(!upper->checked);
test, because it means it couldn't build the backref tree properly. This is
annoying to users and frankly a recoverable error, nothing in this function is
actually fatal since it is just an in-memory building of the backrefs for a
given bytenr. So go through and change all the BUG_ON()'s to ASSERT()'s, and
fix the BUG_ON(!upper->checked) thing to just return an error.
This patch also fixes the error handling so it tears down the work we've done
properly. This code was horribly broken since we always just panic'ed instead
of actually erroring out, so it needed to be completely re-worked. With this
patch my broken image no longer panics when I mount it. Thanks,
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/relocation.c | 88 ++++++++++++++++++++++++++++++++++-----------------
1 file changed, 59 insertions(+), 29 deletions(-)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 65245a07275b..3e227dccaecc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -736,7 +736,8 @@ again:
err = ret;
goto out;
}
- BUG_ON(!ret || !path1->slots[0]);
+ ASSERT(ret);
+ ASSERT(path1->slots[0]);
path1->slots[0]--;
@@ -746,10 +747,10 @@ again:
* the backref was added previously when processing
* backref of type BTRFS_TREE_BLOCK_REF_KEY
*/
- BUG_ON(!list_is_singular(&cur->upper));
+ ASSERT(list_is_singular(&cur->upper));
edge = list_entry(cur->upper.next, struct backref_edge,
list[LOWER]);
- BUG_ON(!list_empty(&edge->list[UPPER]));
+ ASSERT(list_empty(&edge->list[UPPER]));
exist = edge->node[UPPER];
/*
* add the upper level block to pending list if we need
@@ -831,7 +832,7 @@ again:
cur->cowonly = 1;
}
#else
- BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+ ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
#endif
if (key.objectid == key.offset) {
@@ -840,7 +841,7 @@ again:
* backref of this type.
*/
root = find_reloc_root(rc, cur->bytenr);
- BUG_ON(!root);
+ ASSERT(root);
cur->root = root;
break;
}
@@ -868,7 +869,7 @@ again:
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
- BUG_ON(!upper->checked);
+ ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
list_add_tail(&edge->list[LOWER], &cur->upper);
@@ -892,7 +893,7 @@ again:
if (btrfs_root_level(&root->root_item) == cur->level) {
/* tree root */
- BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
cur->bytenr);
if (should_ignore_root(root))
list_add(&cur->list, &useless);
@@ -927,7 +928,7 @@ again:
need_check = true;
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
- BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
lower->bytenr);
if (should_ignore_root(root))
list_add(&lower->list, &useless);
@@ -982,7 +983,7 @@ again:
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
- BUG_ON(!upper->checked);
+ ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
if (!upper->owner)
upper->owner = btrfs_header_owner(eb);
@@ -1026,7 +1027,7 @@ next:
* everything goes well, connect backref nodes and insert backref nodes
* into the cache.
*/
- BUG_ON(!node->checked);
+ ASSERT(node->checked);
cowonly = node->cowonly;
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, node->bytenr,
@@ -1062,8 +1063,21 @@ next:
continue;
}
- BUG_ON(!upper->checked);
- BUG_ON(cowonly != upper->cowonly);
+ if (!upper->checked) {
+ /*
+ * Still want to blow up for developers since this is a
+ * logic bug.
+ */
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+ if (cowonly != upper->cowonly) {
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, upper->bytenr,
&upper->rb_node);
@@ -1086,7 +1100,7 @@ next:
while (!list_empty(&useless)) {
upper = list_entry(useless.next, struct backref_node, list);
list_del_init(&upper->list);
- BUG_ON(!list_empty(&upper->upper));
+ ASSERT(list_empty(&upper->upper));
if (upper == node)
node = NULL;
if (upper->lowest) {
@@ -1119,29 +1133,45 @@ out:
if (err) {
while (!list_empty(&useless)) {
lower = list_entry(useless.next,
- struct backref_node, upper);
- list_del_init(&lower->upper);
+ struct backref_node, list);
+ list_del_init(&lower->list);
}
- upper = node;
- INIT_LIST_HEAD(&list);
- while (upper) {
- if (RB_EMPTY_NODE(&upper->rb_node)) {
- list_splice_tail(&upper->upper, &list);
- free_backref_node(cache, upper);
- }
-
- if (list_empty(&list))
- break;
-
- edge = list_entry(list.next, struct backref_edge,
- list[LOWER]);
+ while (!list_empty(&list)) {
+ edge = list_first_entry(&list, struct backref_edge,
+ list[UPPER]);
+ list_del(&edge->list[UPPER]);
list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
upper = edge->node[UPPER];
free_backref_edge(cache, edge);
+
+ /*
+ * Lower is no longer linked to any upper backref nodes
+ * and isn't in the cache, we can free it ourselves.
+ */
+ if (list_empty(&lower->upper) &&
+ RB_EMPTY_NODE(&lower->rb_node))
+ list_add(&lower->list, &useless);
+
+ if (!RB_EMPTY_NODE(&upper->rb_node))
+ continue;
+
+ /* Add this guy's upper edges to the list to proces */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+ if (list_empty(&upper->upper))
+ list_add(&upper->list, &useless);
+ }
+
+ while (!list_empty(&useless)) {
+ lower = list_entry(useless.next,
+ struct backref_node, list);
+ list_del_init(&lower->list);
+ free_backref_node(cache, lower);
}
return ERR_PTR(err);
}
- BUG_ON(node && node->detached);
+ ASSERT(!node || !node->detached);
return node;
}
--
1.9.3

View File

@ -0,0 +1,39 @@
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 18 Sep 2014 11:27:17 -0400
Subject: [PATCH] Btrfs: don't do async reclaim during log replay
(cherry picked from commit f6acfd50110b335c7af636cf1fc8e55319cae5fc)
Trying to reproduce a log enospc bug I hit a panic in the async reclaim code
during log replay. This is because we use fs_info->fs_root as our root for
shrinking and such. Technically we can use whatever root we want, but let's
just not allow async reclaim while we're doing log replay. Thanks,
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/extent-tree.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3efe1c3877bf..98042c1a48b4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4502,7 +4502,13 @@ again:
space_info->flush = 1;
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
- if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+ /*
+ * We will do the space reservation dance during log replay,
+ * which means we won't have fs_info->fs_root set, so don't do
+ * the async reclaim as we will panic.
+ */
+ if (!root->fs_info->log_root_recovering &&
+ need_do_async_reclaim(space_info, root->fs_info, used) &&
!work_busy(&root->fs_info->async_reclaim_work))
queue_work(system_unbound_wq,
&root->fs_info->async_reclaim_work);
--
1.9.3

View File

@ -0,0 +1,64 @@
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 19 Sep 2014 15:43:34 -0400
Subject: [PATCH] Btrfs: fix build_backref_tree issue with multiple shared
blocks
(cherry picked from commit bbe9051441effce51c9a533d2c56440df64db2d7)
Marc Merlin sent me a broken fs image months ago where it would blow up in the
upper->checked BUG_ON() in build_backref_tree. This is because we had a
scenario like this
block a -- level 4 (not shared)
|
block b -- level 3 (reloc block, shared)
|
block c -- level 2 (not shared)
|
block d -- level 1 (shared)
|
block e -- level 0 (shared)
We go to build a backref tree for block e, we notice block d is shared and add
it to the list of blocks to lookup it's backrefs for. Now when we loop around
we will check edges for the block, so we will see we looked up block c last
time. So we lookup block d and then see that the block that points to it is
block c and we can just skip that edge since we've already been up this path.
The problem is because we clear need_check when we see block d (as it is shared)
we never add block b as needing to be checked. And because block c is in our
path already we bail out before we walk up to block b and add it to the backref
check list.
To fix this we need to reset need_check if we trip over a block that doesn't
need to be checked. This will make sure that any subsequent blocks in the path
as we're walking up afterwards are added to the list to be processed. With this
patch I can now mount Marc's fs image and it'll complete the balance without
panicing. Thanks,
Reported-by: Marc MERLIN <marc@merlins.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/relocation.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3e227dccaecc..56fe6ec409ac 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -978,8 +978,11 @@ again:
need_check = false;
list_add_tail(&edge->list[UPPER],
&list);
- } else
+ } else {
+ if (upper->checked)
+ need_check = true;
INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
--
1.9.3

View File

@ -0,0 +1,56 @@
From: Sage Weil <sage@redhat.com>
Date: Fri, 26 Sep 2014 08:30:06 -0700
Subject: [PATCH] Btrfs: fix race in WAIT_SYNC ioctl
(cherry picked from commit 42383020beb1cfb05f5d330cc311931bc4917a97)
We check whether transid is already committed via last_trans_committed and
then search through trans_list for pending transactions. If
last_trans_committed is updated by btrfs_commit_transaction after we check
it (there is no locking), we will fail to find the committed transaction
and return EINVAL to the caller. This has been observed occasionally by
ceph-osd (which uses this ioctl heavily).
Fix by rechecking whether the provided transid <= last_trans_committed
after the search fails, and if so return 0.
Signed-off-by: Sage Weil <sage@redhat.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/transaction.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d89c6d3542ca..98a25df1c430 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -609,7 +609,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
if (transid <= root->fs_info->last_trans_committed)
goto out;
- ret = -EINVAL;
/* find specified transaction */
spin_lock(&root->fs_info->trans_lock);
list_for_each_entry(t, &root->fs_info->trans_list, list) {
@@ -625,9 +624,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
}
}
spin_unlock(&root->fs_info->trans_lock);
- /* The specified transaction doesn't exist */
- if (!cur_trans)
+
+ /*
+ * The specified transaction doesn't exist, or we
+ * raced with btrfs_commit_transaction
+ */
+ if (!cur_trans) {
+ if (transid > root->fs_info->last_trans_committed)
+ ret = -EINVAL;
goto out;
+ }
} else {
/* find newest transaction that is committing | committed */
spin_lock(&root->fs_info->trans_lock);
--
1.9.3

View File

@ -0,0 +1,88 @@
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 16 Sep 2014 17:49:30 +0800
Subject: [PATCH] Btrfs: fix up bounds checking in lseek
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
(cherry picked from commit 4d1a40c66bed0b3fa43b9da5fbd5cbe332e4eccf)
An user reported this, it is because that lseek's SEEK_SET/SEEK_CUR/SEEK_END
allow a negative value for @offset, but btrfs's SEEK_DATA/SEEK_HOLE don't
prepare for that and convert the negative @offset into unsigned type,
so we get (end < start) warning.
[ 1269.835374] ------------[ cut here ]------------
[ 1269.836809] WARNING: CPU: 0 PID: 1241 at fs/btrfs/extent_io.c:430 insert_state+0x11d/0x140()
[ 1269.838816] BTRFS: end < start 4094 18446744073709551615
[ 1269.840334] CPU: 0 PID: 1241 Comm: a.out Tainted: G W 3.16.0+ #306
[ 1269.858229] Call Trace:
[ 1269.858612] [<ffffffff81801a69>] dump_stack+0x4e/0x68
[ 1269.858952] [<ffffffff8107894c>] warn_slowpath_common+0x8c/0xc0
[ 1269.859416] [<ffffffff81078a36>] warn_slowpath_fmt+0x46/0x50
[ 1269.859929] [<ffffffff813b0fbd>] insert_state+0x11d/0x140
[ 1269.860409] [<ffffffff813b1396>] __set_extent_bit+0x3b6/0x4e0
[ 1269.860805] [<ffffffff813b21c7>] lock_extent_bits+0x87/0x200
[ 1269.861697] [<ffffffff813a5b28>] btrfs_file_llseek+0x148/0x2a0
[ 1269.862168] [<ffffffff811f201e>] SyS_lseek+0xae/0xc0
[ 1269.862620] [<ffffffff8180b212>] system_call_fastpath+0x16/0x1b
[ 1269.862970] ---[ end trace 4d33ea885832054b ]---
This assumes that btrfs starts finding DATA/HOLE from the beginning of file
if the assigned @offset is negative.
Also we add alignment for lock_extent_bits 's range.
Reported-by: Toralf Förster <toralf.foerster@gmx.de>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/file.c | 25 +++++++++++++++----------
1 file changed, 15 insertions(+), 10 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ff1cc0399b9a..68dd92cd7d54 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2621,23 +2621,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- u64 lockstart = *offset;
- u64 lockend = i_size_read(inode);
- u64 start = *offset;
- u64 len = i_size_read(inode);
+ u64 lockstart;
+ u64 lockend;
+ u64 start;
+ u64 len;
int ret = 0;
- lockend = max_t(u64, root->sectorsize, lockend);
+ if (inode->i_size == 0)
+ return -ENXIO;
+
+ /*
+ * *offset can be negative, in this case we start finding DATA/HOLE from
+ * the very start of the file.
+ */
+ start = max_t(loff_t, 0, *offset);
+
+ lockstart = round_down(start, root->sectorsize);
+ lockend = round_up(i_size_read(inode), root->sectorsize);
if (lockend <= lockstart)
lockend = lockstart + root->sectorsize;
-
lockend--;
len = lockend - lockstart + 1;
- len = max_t(u64, len, root->sectorsize);
- if (inode->i_size == 0)
- return -ENXIO;
-
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
&cached_state);
--
1.9.3

View File

@ -0,0 +1,40 @@
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 18 Sep 2014 11:30:44 -0400
Subject: [PATCH] Btrfs: try not to ENOSPC on log replay
(cherry picked from commit 1d52c78afbbf80b58299e076a159617d6b42fe3c)
When doing log replay we may have to update inodes, which traditionally goes
through our delayed inode stuff. This will try to move space over from the
trans handle, but we don't reserve space in our trans handle on replay since we
don't know how much we will need, so instead we try to flush. But because we
have a trans handle open we won't flush anything, so if we are out of reserve
space we will simply return ENOSPC. Since we know that if an operation made it
into the log then we definitely had space before the box bought the farm then we
don't need to worry about doing this space reservation. Use the
fs_info->log_root_recovering flag to skip the delayed inode stuff and update the
item directly. Thanks,
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/inode.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8039021353ab..1c1173e59adf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3662,7 +3662,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !root->fs_info->log_root_recovering) {
btrfs_update_root_times(trans, root);
ret = btrfs_delayed_update_inode(trans, root, inode);
--
1.9.3

View File

@ -1,19 +1,37 @@
From: Josh Boyer <jwboyer@fedoraproject.org>
Date: Wed, 15 Oct 2014 10:09:50 -0400
From: Chris Mason <clm@fb.com>
Date: Wed, 15 Oct 2014 13:50:56 -0700
Subject: [PATCH] Revert "Btrfs: race free update of commit root for ro
snapshots"
This reverts commit 9c3b306e1c9e6be4be09e99a8fe2227d1005effc.
(cherry picked from commit d37973082b453ba6b89ec07eb7b84305895d35e1)
Switching only one commit root during a transaction is wrong because it
leads the fs into an inconsistent state. All commit roots should be
switched at once, at transaction commit time, otherwise backref walking
can often miss important references that were only accessible through
the old commit root. Plus, the root item for the snapshot's root wasn't
getting updated and preventing the next transaction commit to do it.
This made several users get into random corruption issues after creation
of readonly snapshots.
A regression test for xfstests will follow soon.
Cc: stable@vger.kernel.org # 3.17
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/inode.c | 36 ------------------------------------
fs/btrfs/ioctl.c | 33 +++++++++++++++++++++++++++++++++
2 files changed, 33 insertions(+), 36 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 016c403bfe7e..46bd0303fadd 100644
index 16454b6efc55..886d8d42640d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5202,42 +5202,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
@@ -5203,42 +5203,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
iput(inode);
inode = ERR_PTR(ret);
}
@ -57,10 +75,10 @@ index 016c403bfe7e..46bd0303fadd 100644
return inode;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8a8e29878c34..f99f15e5e8cd 100644
index 91269bd9ad05..b765d412cbb6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -711,6 +711,39 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
@@ -714,6 +714,39 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto fail;

View File

@ -0,0 +1,67 @@
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Wed, 20 Aug 2014 16:10:15 +0800
Subject: [PATCH] btrfs: Fix a deadlock in btrfs_dev_replace_finishing()
(cherry picked from commit 12b894cb288d57292b01cf158177b6d5c89a6272)
btrfs-transacion:5657
[stack snip]
btrfs_bio_map()
btrfs_bio_counter_inc_blocked()
percpu_counter_inc(&fs_info->bio_counter) ###bio_counter > 0(A)
__btrfs_bio_map()
btrfs_dev_replace_lock()
mutex_lock(dev_replace->lock) ###wait mutex(B)
btrfs:32612
[stack snip]
btrfs_dev_replace_start()
btrfs_dev_replace_lock()
mutex_lock(dev_replace->lock) ###hold mutex(B)
btrfs_dev_replace_finishing()
btrfs_rm_dev_replace_blocked()
wait until percpu_counter_sum == 0 ###wait on bio_counter(A)
This bug can be triggered quite easily by the following test script:
http://pastebin.com/MQmb37Cy
This patch will fix the ABBA problem by calling
btrfs_dev_replace_unlock() before btrfs_rm_dev_replace_blocked().
The consistency of btrfs devices list and their superblocks is protected
by device_list_mutex, not btrfs_dev_replace_lock/unlock().
So it is safe the move btrfs_dev_replace_unlock() before
btrfs_rm_dev_replace_blocked().
Reported-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Cc: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/dev-replace.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index eea26e1b2fda..d738ff8ab81c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -567,6 +567,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_kobj_rm_device(fs_info, src_device);
btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_dev_replace_unlock(dev_replace);
+
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
@@ -580,7 +582,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* superblock is scratched out so that it is no longer marked to
* belong to this filesystem.
*/
- btrfs_dev_replace_unlock(dev_replace);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&root->fs_info->chunk_mutex);
--
1.9.3

View File

@ -0,0 +1,168 @@
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Wed, 17 Sep 2014 11:53:35 +0800
Subject: [PATCH] btrfs: Fix and enhance merge_extent_mapping() to insert best
fitted extent map
(cherry picked from commit e6c4efd87ab04e5ead363f24e6ac35ed3506d401)
The following commit enhanced the merge_extent_mapping() to reduce
fragment in extent map tree, but it can't handle case which existing
lies before map_start:
51f39 btrfs: Use right extent length when inserting overlap extent map.
[BUG]
When existing extent map's start is before map_start,
the em->len will be minus, which will corrupt the extent map and fail to
insert the new extent map.
This will happen when someone get a large extent map, but when it is
going to insert it into extent map tree, some one has already commit
some write and split the huge extent into small parts.
[REPRODUCER]
It is very easy to tiger using filebench with randomrw personality.
It is about 100% to reproduce when using 8G preallocated file in 60s
randonrw test.
[FIX]
This patch can now handle any existing extent position.
Since it does not directly use existing->start, now it will find the
previous and next extent around map_start.
So the old existing->start < map_start bug will never happen again.
[ENHANCE]
This patch will insert the best fitted extent map into extent map tree,
other than the oldest [map_start, map_start + sectorsize) or the
relatively newer but not perfect [map_start, existing->start).
The patch will first search existing extent that does not intersects with
the desired map range [map_start, map_start + len).
The existing extent will be either before or behind map_start, and based
on the existing extent, we can find out the previous and next extent
around map_start.
So the best fitted extent would be [prev->end, next->start).
For prev or next is not found, em->start would be prev->end and em->end
wold be next->start.
With this patch, the fragment in extent map tree should be reduced much
more than the 51f39 commit and reduce an unneeded extent map tree search.
Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/inode.c | 79 ++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 57 insertions(+), 22 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 016c403bfe7e..8039021353ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6191,21 +6191,60 @@ out_fail_inode:
goto out_fail;
}
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+ struct rb_node *next;
+
+ next = rb_next(&em->rb_node);
+ if (!next)
+ return NULL;
+ return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+ struct rb_node *prev;
+
+ prev = rb_prev(&em->rb_node);
+ if (!prev)
+ return NULL;
+ return container_of(prev, struct extent_map, rb_node);
+}
+
/* helper for btfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
* and an extent that you want to insert, deal with overlap and insert
- * the new extent into the tree.
+ * the best fitted new extent into the tree.
*/
static int merge_extent_mapping(struct extent_map_tree *em_tree,
struct extent_map *existing,
struct extent_map *em,
u64 map_start)
{
+ struct extent_map *prev;
+ struct extent_map *next;
+ u64 start;
+ u64 end;
u64 start_diff;
BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
- start_diff = map_start - em->start;
- em->start = map_start;
- em->len = existing->start - em->start;
+
+ if (existing->start > map_start) {
+ next = existing;
+ prev = prev_extent_map(next);
+ } else {
+ prev = existing;
+ next = next_extent_map(prev);
+ }
+
+ start = prev ? extent_map_end(prev) : em->start;
+ start = max_t(u64, start, em->start);
+ end = next ? next->start : extent_map_end(em);
+ end = min_t(u64, end, extent_map_end(em));
+ start_diff = start - em->start;
+ em->start = start;
+ em->len = end - start;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
em->block_start += start_diff;
@@ -6482,25 +6521,21 @@ insert:
ret = 0;
- existing = lookup_extent_mapping(em_tree, start, len);
- if (existing && (existing->start > start ||
- existing->start + existing->len <= start)) {
+ existing = search_extent_mapping(em_tree, start, len);
+ /*
+ * existing will always be non-NULL, since there must be
+ * extent causing the -EEXIST.
+ */
+ if (start >= extent_map_end(existing) ||
+ start + len <= existing->start) {
+ /*
+ * The existing extent map is the one nearest to
+ * the [start, start + len) range which overlaps
+ */
+ err = merge_extent_mapping(em_tree, existing,
+ em, start);
free_extent_map(existing);
- existing = NULL;
- }
- if (!existing) {
- existing = lookup_extent_mapping(em_tree, em->start,
- em->len);
- if (existing) {
- err = merge_extent_mapping(em_tree, existing,
- em, start);
- free_extent_map(existing);
- if (err) {
- free_extent_map(em);
- em = NULL;
- }
- } else {
- err = -EIO;
+ if (err) {
free_extent_map(em);
em = NULL;
}
--
1.9.3

View File

@ -0,0 +1,38 @@
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Mon, 22 Sep 2014 09:13:03 +0800
Subject: [PATCH] btrfs: Fix the wrong condition judgment about subset extent
map
(cherry picked from commit 32be3a1ac6d09576c57063c6c350ca36eaebdbd3)
Previous commit: btrfs: Fix and enhance merge_extent_mapping() to insert
best fitted extent map
is using wrong condition to judgement whether the range is a subset of a
existing extent map.
This may cause bug in btrfs no-holes mode.
This patch will correct the judgment and fix the bug.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/inode.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c1173e59adf..16454b6efc55 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6528,7 +6528,7 @@ insert:
* extent causing the -EEXIST.
*/
if (start >= extent_map_end(existing) ||
- start + len <= existing->start) {
+ start <= existing->start) {
/*
* The existing extent map is the one nearest to
* the [start, start + len) range which overlaps
--
1.9.3

View File

@ -0,0 +1,66 @@
From: Mark Fasheh <mfasheh@suse.de>
Date: Mon, 18 Aug 2014 14:01:17 -0700
Subject: [PATCH] btrfs: don't go readonly on existing qgroup items
(cherry picked from commit 0b4699dcb65c2cff793210b07f40b98c2d423a43)
btrfs_drop_snapshot() leaves subvolume qgroup items on disk after
completion. This can cause problems with snapshot creation. If a new
snapshot tries to claim the deleted subvolumes id, btrfs will get -EEXIST
from add_qgroup_item() and go read-only. The following commands will
reproduce this problem (assume btrfs is on /dev/sda and is mounted at
/btrfs)
mkfs.btrfs -f /dev/sda
mount -t btrfs /dev/sda /btrfs/
btrfs quota enable /btrfs/
btrfs su sna /btrfs/ /btrfs/snap
btrfs su de /btrfs/snap
sleep 45
umount /btrfs/
mount -t btrfs /dev/sda /btrfs/
We can fix this by catching -EEXIST in add_qgroup_item() and
initializing the existing items. We have the problem of orphaned
relation items being on disk from an old snapshot but that is outside
the scope of this patch.
Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/qgroup.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ded5c601d916..d094534c3b53 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -551,9 +551,15 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroupid;
+ /*
+ * Avoid a transaction abort by catching -EEXIST here. In that
+ * case, we proceed by re-initializing the existing structure
+ * on disk.
+ */
+
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_info));
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
leaf = path->nodes[0];
@@ -572,7 +578,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_QGROUP_LIMIT_KEY;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_limit));
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
leaf = path->nodes[0];
--
1.9.3

View File

@ -0,0 +1,39 @@
From: David Sterba <dsterba@suse.cz>
Date: Wed, 23 Jul 2014 14:39:35 +0200
Subject: [PATCH] btrfs: wake up transaction thread from SYNC_FS ioctl
(cherry picked from commit 2fad4e83e12591eb3bd213875b9edc2d18e93383)
The transaction thread may want to do more work, namely it pokes the
cleaner ktread that will start processing uncleaned subvols.
This can be triggered by user via the 'btrfs fi sync' command, otherwise
there was a delay up to 30 seconds before the cleaner started to clean
old snapshots.
Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
fs/btrfs/ioctl.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8a8e29878c34..13d577b39bfa 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5283,6 +5283,12 @@ long btrfs_ioctl(struct file *file, unsigned int
if (ret)
return ret;
ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ /*
+ * The transaction thread may want to do more work,
+ * namely it pokes the cleaner ktread that will start
+ * processing uncleaned subvols.
+ */
+ wake_up_process(root->fs_info->transaction_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
--
1.9.3

View File

@ -42,7 +42,7 @@ Summary: The Linux kernel
# For non-released -rc kernels, this will be appended after the rcX and
# gitX tags, so a 3 here would become part of release "0.rcX.gitX.3"
#
%global baserelease 300
%global baserelease 301
%global fedora_build %{baserelease}
# base_sublevel is the kernel version we're starting with and patching
@ -638,7 +638,20 @@ Patch26041: HID-usbhid-always-poll-quirk-for-Elan-Touchscreen-01.patch
#CVE-2014-7975 rhbz 1151108 1152025
Patch26042: fs-Add-a-missing-permission-check-to-do_umount.patch
Patch26043: Revert-Btrfs-race-free-update-of-commit-root-for-ro-.patch
# btrfs fixes queued for 3.17.y
Patch26043: btrfs-wake-up-transaction-thread-from-SYNC_FS-ioctl.patch
Patch26044: btrfs-don-t-go-readonly-on-existing-qgroup-items.patch
Patch26045: btrfs-Fix-a-deadlock-in-btrfs_dev_replace_finishing.patch
Patch26046: Btrfs-add-missing-compression-property-remove-in-btr.patch
Patch26047: Btrfs-fix-up-bounds-checking-in-lseek.patch
Patch26048: btrfs-Fix-and-enhance-merge_extent_mapping-to-insert.patch
Patch26049: Btrfs-don-t-do-async-reclaim-during-log-replay.patch
Patch26050: Btrfs-try-not-to-ENOSPC-on-log-replay.patch
Patch26051: Btrfs-cleanup-error-handling-in-build_backref_tree.patch
Patch26052: Btrfs-fix-build_backref_tree-issue-with-multiple-sha.patch
Patch26053: btrfs-Fix-the-wrong-condition-judgment-about-subset-.patch
Patch26054: Btrfs-fix-race-in-WAIT_SYNC-ioctl.patch
Patch26055: Revert-Btrfs-race-free-update-of-commit-root-for-ro-.patch
# git clone ssh://git.fedorahosted.org/git/kernel-arm64.git, git diff master...devel
Patch30000: kernel-arm64.patch
@ -1391,6 +1404,19 @@ ApplyPatch HID-usbhid-always-poll-quirk-for-Elan-Touchscreen-01.patch
#CVE-2014-7975 rhbz 1151108 1152025
ApplyPatch fs-Add-a-missing-permission-check-to-do_umount.patch
# btrfs fixes queued for 3.17.y
ApplyPatch btrfs-wake-up-transaction-thread-from-SYNC_FS-ioctl.patch
ApplyPatch btrfs-don-t-go-readonly-on-existing-qgroup-items.patch
ApplyPatch btrfs-Fix-a-deadlock-in-btrfs_dev_replace_finishing.patch
ApplyPatch Btrfs-add-missing-compression-property-remove-in-btr.patch
ApplyPatch Btrfs-fix-up-bounds-checking-in-lseek.patch
ApplyPatch btrfs-Fix-and-enhance-merge_extent_mapping-to-insert.patch
ApplyPatch Btrfs-don-t-do-async-reclaim-during-log-replay.patch
ApplyPatch Btrfs-try-not-to-ENOSPC-on-log-replay.patch
ApplyPatch Btrfs-cleanup-error-handling-in-build_backref_tree.patch
ApplyPatch Btrfs-fix-build_backref_tree-issue-with-multiple-sha.patch
ApplyPatch btrfs-Fix-the-wrong-condition-judgment-about-subset-.patch
ApplyPatch Btrfs-fix-race-in-WAIT_SYNC-ioctl.patch
ApplyPatch Revert-Btrfs-race-free-update-of-commit-root-for-ro-.patch
%if 0%{?aarch64patches}
@ -2261,6 +2287,9 @@ fi
# ||----w |
# || ||
%changelog
* Fri Oct 18 2014 Josh Boyer <jwboyer@fedoraproject.org> - 3.17.1-301
- Add even more btrfs corruption/error fixes
* Wed Oct 15 2014 Josh Boyer <jwboyer@fedoraproject.org> - 3.17.1-300
- Linux v3.17.1
- Revert Btrfs ro snapshot commit that causes filesystem corruption