kernel-ark/fs/btrfs/transaction.h
Filipe Manana 4fbcdf6694 Btrfs: fix -ENOSPC when finishing block group creation
While creating a block group, we often end up getting ENOSPC while updating
the chunk tree, which leads to a transaction abortion that produces a trace
like the following:

[30670.116368] WARNING: CPU: 4 PID: 20735 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]()
[30670.117777] BTRFS: Transaction aborted (error -28)
(...)
[30670.163567] Call Trace:
[30670.163906]  [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[30670.164522]  [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[30670.165171]  [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[30670.166323]  [<ffffffffa035daa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.167213]  [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[30670.167862]  [<ffffffffa035daa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.169116]  [<ffffffffa03743d7>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs]
[30670.170593]  [<ffffffffa038426a>] __btrfs_end_transaction+0x84/0x366 [btrfs]
[30670.171960]  [<ffffffffa038455c>] btrfs_end_transaction+0x10/0x12 [btrfs]
[30670.174649]  [<ffffffffa036eb6b>] btrfs_check_data_free_space+0x11f/0x27c [btrfs]
[30670.176092]  [<ffffffffa039450d>] btrfs_fallocate+0x7c8/0xb96 [btrfs]
[30670.177218]  [<ffffffff812459f2>] ? __this_cpu_preempt_check+0x13/0x15
[30670.178622]  [<ffffffff81152447>] vfs_fallocate+0x14c/0x1de
[30670.179642]  [<ffffffff8116b915>] ? __fget_light+0x2d/0x4f
[30670.180692]  [<ffffffff81152863>] SyS_fallocate+0x47/0x62
[30670.186737]  [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[30670.187792] ---[ end trace 0373e6b491c4a8cc ]---

This is because we don't do proper space reservation for the chunk block
reserve when we have multiple tasks allocating chunks in parallel.

So block group creation has 2 phases, and the first phase essentially
checks if there is enough space in the system space_info, allocating a
new system chunk if there isn't, while the second phase updates the
device, extent and chunk trees. However, because the updates to the
chunk tree happen in the second phase, if we have N tasks, each with
its own transaction handle, allocating new chunks in parallel and if
there is only enough space in the system space_info to allocate M chunks,
where M < N, none of the tasks ends up allocating a new system chunk in
the first phase and N - M tasks will get -ENOSPC when attempting to
update the chunk tree in phase 2 if they need to COW any nodes/leafs
from the chunk tree.

Fix this by doing proper reservation in the chunk block reserve.

The issue could be reproduced by running fstests generic/038 in a loop,
which eventually triggered the problem.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-06-03 04:03:04 -07:00

196 lines
6.3 KiB
C

/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef __BTRFS_TRANSACTION__
#define __BTRFS_TRANSACTION__
#include "btrfs_inode.h"
#include "delayed-ref.h"
#include "ctree.h"
enum btrfs_trans_state {
TRANS_STATE_RUNNING = 0,
TRANS_STATE_BLOCKED = 1,
TRANS_STATE_COMMIT_START = 2,
TRANS_STATE_COMMIT_DOING = 3,
TRANS_STATE_UNBLOCKED = 4,
TRANS_STATE_COMPLETED = 5,
TRANS_STATE_MAX = 6,
};
struct btrfs_transaction {
u64 transid;
/*
* total external writers(USERSPACE/START/ATTACH) in this
* transaction, it must be zero before the transaction is
* being committed
*/
atomic_t num_extwriters;
/*
* total writers in this transaction, it must be zero before the
* transaction can end
*/
atomic_t num_writers;
atomic_t use_count;
/*
* true if there is free bgs operations in this transaction
*/
int have_free_bgs;
/* Be protected by fs_info->trans_lock when we want to change it. */
enum btrfs_trans_state state;
struct list_head list;
struct extent_io_tree dirty_pages;
unsigned long start_time;
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
struct list_head pending_chunks;
struct list_head pending_ordered;
struct list_head switch_commits;
struct list_head dirty_bgs;
struct list_head io_bgs;
u64 num_dirty_bgs;
/*
* we need to make sure block group deletion doesn't race with
* free space cache writeout. This mutex keeps them from stomping
* on each other
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
int dirty_bg_run;
};
#define __TRANS_FREEZABLE (1U << 0)
#define __TRANS_USERSPACE (1U << 8)
#define __TRANS_START (1U << 9)
#define __TRANS_ATTACH (1U << 10)
#define __TRANS_JOIN (1U << 11)
#define __TRANS_JOIN_NOLOCK (1U << 12)
#define __TRANS_DUMMY (1U << 13)
#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
#define TRANS_ATTACH (__TRANS_ATTACH)
#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
__TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
u64 chunk_bytes_reserved;
u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
unsigned long blocks_used;
unsigned long delayed_ref_updates;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
short aborted;
short adding_csums;
bool allocating_chunk;
bool reloc_reserved;
bool sync;
unsigned int type;
/*
* this root is only needed to validate that the root passed to
* start_transaction is the same as the one passed to end_transaction.
* Subvolume quota depends on this
*/
struct btrfs_root *root;
struct seq_list delayed_ref_elem;
struct list_head ordered;
struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
struct btrfs_pending_snapshot {
struct dentry *dentry;
struct inode *dir;
struct btrfs_root *root;
struct btrfs_root *snap;
struct btrfs_qgroup_inherit *inherit;
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
u64 qgroup_reserved;
/* extra metadata reseration for relocation */
int error;
bool readonly;
struct list_head list;
};
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
struct inode *inode)
{
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_trans = trans->transaction->transid;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
spin_unlock(&BTRFS_I(inode)->lock);
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_items);
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_root *root, int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
int wait_for_unblock);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_throttle(struct btrfs_root *root);
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
#endif