kernel-ark/fs/xfs/xfs_log.h
Dave Chinner 110dc24ad2 xfs: log vector rounding leaks log space
The addition of direct formatting of log items into the CIL
linear buffer added alignment restrictions that the start of each
vector needed to be 64 bit aligned. Hence padding was added in
xlog_finish_iovec() to round up the vector length to ensure the next
vector started with the correct alignment.

This adds a small number of bytes to the size of
the linear buffer that is otherwise unused. The issue is that we
then use the linear buffer size to determine the log space used by
the log item, and this includes the unused space. Hence when we
account for space used by the log item, it's more than is actually
written into the iclogs, and hence we slowly leak this space.

This results on log hangs when reserving space, with threads getting
stuck with these stack traces:

Call Trace:
[<ffffffff81d15989>] schedule+0x29/0x70
[<ffffffff8150d3a2>] xlog_grant_head_wait+0xa2/0x1a0
[<ffffffff8150d55d>] xlog_grant_head_check+0xbd/0x140
[<ffffffff8150ee33>] xfs_log_reserve+0x103/0x220
[<ffffffff814b7f05>] xfs_trans_reserve+0x2f5/0x310
.....

The 4 bytes is significant. Brain Foster did all the hard work in
tracking down a reproducable leak to inode chunk allocation (it went
away with the ikeep mount option). His rough numbers were that
creating 50,000 inodes leaked 11 log blocks. This turns out to be
roughly 800 inode chunks or 1600 inode cluster buffers. That
works out at roughly 4 bytes per cluster buffer logged, and at that
I started looking for a 4 byte leak in the buffer logging code.

What I found was that a struct xfs_buf_log_format structure for an
inode cluster buffer is 28 bytes in length. This gets rounded up to
32 bytes, but the vector length remains 28 bytes. Hence the CIL
ticket reservation is decremented by 32 bytes (via lv->lv_buf_len)
for that vector rather than 28 bytes which are written into the log.

The fix for this problem is to separately track the bytes used by
the log vectors in the item and use that instead of the buffer
length when accounting for the log space that will be used by the
formatted log item.

Again, thanks to Brian Foster for doing all the hard work and long
hours to isolate this leak and make finding the bug relatively
simple.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-20 08:18:09 +10:00

194 lines
5.6 KiB
C

/*
* Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef __XFS_LOG_H__
#define __XFS_LOG_H__
struct xfs_log_vec {
struct xfs_log_vec *lv_next; /* next lv in build list */
int lv_niovecs; /* number of iovecs in lv */
struct xfs_log_iovec *lv_iovecp; /* iovec array */
struct xfs_log_item *lv_item; /* owner */
char *lv_buf; /* formatted buffer */
int lv_bytes; /* accounted space in buffer */
int lv_buf_len; /* aligned size of buffer */
int lv_size; /* size of allocated lv */
};
#define XFS_LOG_VEC_ORDERED (-1)
static inline void *
xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
uint type)
{
struct xfs_log_iovec *vec = *vecp;
if (vec) {
ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
vec++;
} else {
vec = &lv->lv_iovecp[0];
}
vec->i_type = type;
vec->i_addr = lv->lv_buf + lv->lv_buf_len;
ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
*vecp = vec;
return vec->i_addr;
}
/*
* We need to make sure the next buffer is naturally aligned for the biggest
* basic data type we put into it. We already accounted for this padding when
* sizing the buffer.
*
* However, this padding does not get written into the log, and hence we have to
* track the space used by the log vectors separately to prevent log space hangs
* due to inaccurate accounting (i.e. a leak) of the used log space through the
* CIL context ticket.
*/
static inline void
xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
{
lv->lv_buf_len += round_up(len, sizeof(uint64_t));
lv->lv_bytes += len;
vec->i_len = len;
}
static inline void *
xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
uint type, void *data, int len)
{
void *buf;
buf = xlog_prepare_iovec(lv, vecp, type);
memcpy(buf, data, len);
xlog_finish_iovec(lv, *vecp, len);
return buf;
}
/*
* Structure used to pass callback function and the function's argument
* to the log manager.
*/
typedef struct xfs_log_callback {
struct xfs_log_callback *cb_next;
void (*cb_func)(void *, int);
void *cb_arg;
} xfs_log_callback_t;
/*
* By comparing each component, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
*/
static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
{
if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999;
if (BLOCK_LSN(lsn1) != BLOCK_LSN(lsn2))
return (BLOCK_LSN(lsn1)<BLOCK_LSN(lsn2))? -999 : 999;
return 0;
}
#define XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
/*
* Macros, structures, prototypes for interface to the log manager.
*/
/*
* Flags to xfs_log_done()
*/
#define XFS_LOG_REL_PERM_RESERV 0x1
/*
* Flags to xfs_log_force()
*
* XFS_LOG_SYNC: Synchronous force in-core log to disk
*/
#define XFS_LOG_SYNC 0x1
/* Log manager interfaces */
struct xfs_mount;
struct xlog_in_core;
struct xlog_ticket;
struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
struct xfs_log_callback;
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
uint flags);
int _xfs_log_force(struct xfs_mount *mp,
uint flags,
int *log_forced);
void xfs_log_force(struct xfs_mount *mp,
uint flags);
int _xfs_log_force_lsn(struct xfs_mount *mp,
xfs_lsn_t lsn,
uint flags,
int *log_forced);
void xfs_log_force_lsn(struct xfs_mount *mp,
xfs_lsn_t lsn,
uint flags);
int xfs_log_mount(struct xfs_mount *mp,
struct xfs_buftarg *log_target,
xfs_daddr_t start_block,
int num_bblocks);
int xfs_log_mount_finish(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
int xfs_log_notify(struct xfs_mount *mp,
struct xlog_in_core *iclog,
struct xfs_log_callback *callback_entry);
int xfs_log_release_iclog(struct xfs_mount *mp,
struct xlog_in_core *iclog);
int xfs_log_reserve(struct xfs_mount *mp,
int length,
int count,
struct xlog_ticket **ticket,
__uint8_t clientid,
bool permanent,
uint t_type);
int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
int xfs_log_unmount_write(struct xfs_mount *mp);
void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
int xfs_log_need_covered(struct xfs_mount *mp);
void xlog_iodone(struct xfs_buf *);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, int flags);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
void xfs_log_worker(struct work_struct *work);
void xfs_log_quiesce(struct xfs_mount *mp);
#endif /* __XFS_LOG_H__ */