diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 84db76e0e3e3..fecd187fcf2c 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -157,6 +157,7 @@ __xfs_ag_resv_free( error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); resv->ar_reserved = 0; resv->ar_asked = 0; + resv->ar_orig_reserved = 0; if (error) trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, @@ -189,13 +190,34 @@ __xfs_ag_resv_init( struct xfs_mount *mp = pag->pag_mount; struct xfs_ag_resv *resv; int error; - xfs_extlen_t reserved; + xfs_extlen_t hidden_space; if (used > ask) ask = used; - reserved = ask - used; - error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true); + switch (type) { + case XFS_AG_RESV_RMAPBT: + /* + * Space taken by the rmapbt is not subtracted from fdblocks + * because the rmapbt lives in the free space. Here we must + * subtract the entire reservation from fdblocks so that we + * always have blocks available for rmapbt expansion. + */ + hidden_space = ask; + break; + case XFS_AG_RESV_METADATA: + /* + * Space taken by all other metadata btrees are accounted + * on-disk as used space. We therefore only hide the space + * that is reserved but not used by the trees. + */ + hidden_space = ask - used; + break; + default: + ASSERT(0); + return -EINVAL; + } + error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); @@ -216,7 +238,8 @@ __xfs_ag_resv_init( resv = xfs_perag_resv(pag, type); resv->ar_asked = ask; - resv->ar_reserved = resv->ar_orig_reserved = reserved; + resv->ar_orig_reserved = hidden_space; + resv->ar_reserved = ask - used; trace_xfs_ag_resv_init(pag, type, ask); return 0; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 01628f0c9a0c..7205268b30bc 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5780,6 +5780,32 @@ del_cursor: return error; } +/* Make sure we won't be right-shifting an extent past the maximum bound. */ +int +xfs_bmap_can_insert_extents( + struct xfs_inode *ip, + xfs_fileoff_t off, + xfs_fileoff_t shift) +{ + struct xfs_bmbt_irec got; + int is_empty; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_last_extent(NULL, ip, XFS_DATA_FORK, &got, &is_empty); + if (!error && !is_empty && got.br_startoff >= off && + ((got.br_startoff + shift) & BMBT_STARTOFF_MASK) < got.br_startoff) + error = -EINVAL; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + int xfs_bmap_insert_extents( struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 99dddbd0fcc6..9b49ddf99c41 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -227,6 +227,8 @@ int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops); +int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, + xfs_fileoff_t shift); int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 1c5a8aaf2bfc..059bc44c27e8 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -962,6 +962,9 @@ typedef enum xfs_dinode_fmt { XFS_DFORK_DSIZE(dip, mp) : \ XFS_DFORK_ASIZE(dip, mp)) +#define XFS_DFORK_MAXEXT(dip, mp, w) \ + (XFS_DFORK_SIZE(dip, mp, w) / sizeof(struct xfs_bmbt_rec)) + /* * Return pointers to the data or attribute forks. */ @@ -1526,6 +1529,8 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTBLOCK_BITLEN 52 #define BMBT_BLOCKCOUNT_BITLEN 21 +#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) + typedef struct xfs_bmbt_rec { __be64 l0, l1; } xfs_bmbt_rec_t; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d38d724534c4..33dc34655ac3 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -374,6 +374,47 @@ xfs_log_dinode_to_disk( } } +static xfs_failaddr_t +xfs_dinode_verify_fork( + struct xfs_dinode *dip, + struct xfs_mount *mp, + int whichfork) +{ + uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + + switch (XFS_DFORK_FORMAT(dip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (whichfork == XFS_DATA_FORK) { + if (S_ISREG(be16_to_cpu(dip->di_mode))) + return __this_address; + if (be64_to_cpu(dip->di_size) > + XFS_DFORK_SIZE(dip, mp, whichfork)) + return __this_address; + } + if (di_nextents) + return __this_address; + break; + case XFS_DINODE_FMT_EXTENTS: + if (di_nextents > XFS_DFORK_MAXEXT(dip, mp, whichfork)) + return __this_address; + break; + case XFS_DINODE_FMT_BTREE: + if (whichfork == XFS_ATTR_FORK) { + if (di_nextents > MAXAEXTNUM) + return __this_address; + } else if (di_nextents > MAXEXTNUM) { + return __this_address; + } + break; + default: + return __this_address; + } + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -441,24 +482,9 @@ xfs_dinode_verify( case S_IFREG: case S_IFLNK: case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: - /* - * no local regular files yet - */ - if (S_ISREG(mode)) - return __this_address; - if (di_size > XFS_DFORK_DSIZE(dip, mp)) - return __this_address; - if (dip->di_nextents) - return __this_address; - /* fall through */ - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - break; - default: - return __this_address; - } + fa = xfs_dinode_verify_fork(dip, mp, XFS_DATA_FORK); + if (fa) + return fa; break; case 0: /* Uninitialized inode ok. */ @@ -468,17 +494,9 @@ xfs_dinode_verify( } if (XFS_DFORK_Q(dip)) { - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - if (dip->di_anextents) - return __this_address; - /* fall through */ - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - break; - default: - return __this_address; - } + fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK); + if (fa) + return fa; } else { /* * If there is no fork offset, this may be a freshly-made inode diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 65fc4ed2e9a1..b228c821bae6 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1029,8 +1029,8 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - if (high_rec->ar_startext >= mp->m_sb.sb_rextents) - high_rec->ar_startext = mp->m_sb.sb_rextents - 1; + if (high_rec->ar_startext > mp->m_sb.sb_rextents) + high_rec->ar_startext = mp->m_sb.sb_rextents; /* Iterate the bitmap, looking for discrepancies. */ rtstart = low_rec->ar_startext; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c35009a86699..83b1e8c6c18f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -685,12 +685,10 @@ out_unlock_iolock: } /* - * dead simple method of punching delalyed allocation blocks from a range in - * the inode. Walks a block at a time so will be slow, but is only executed in - * rare error cases so the overhead is not critical. This will always punch out - * both the start and end blocks, even if the ranges only partially overlap - * them, so it is up to the caller to ensure that partial blocks are not - * passed in. + * Dead simple method of punching delalyed allocation blocks from a range in + * the inode. This will always punch out both the start and end blocks, even + * if the ranges only partially overlap them, so it is up to the caller to + * ensure that partial blocks are not passed in. */ int xfs_bmap_punch_delalloc_range( @@ -698,63 +696,44 @@ xfs_bmap_punch_delalloc_range( xfs_fileoff_t start_fsb, xfs_fileoff_t length) { - xfs_fileoff_t remaining = length; + struct xfs_ifork *ifp = &ip->i_df; + xfs_fileoff_t end_fsb = start_fsb + length; + struct xfs_bmbt_irec got, del; + struct xfs_iext_cursor icur; int error = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - do { - int done; - xfs_bmbt_irec_t imap; - int nimaps = 1; - xfs_fsblock_t firstblock; - struct xfs_defer_ops dfops; - - /* - * Map the range first and check that it is a delalloc extent - * before trying to unmap the range. Otherwise we will be - * trying to remove a real extent (which requires a - * transaction) or a hole, which is probably a bad idea... - */ - error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, - XFS_BMAPI_ENTIRE); - - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "Failed delalloc mapping lookup ino %lld fsb %lld.", - ip->i_ino, start_fsb); - } - break; - } - if (!nimaps) { - /* nothing there */ - goto next_block; - } - if (imap.br_startblock != DELAYSTARTBLOCK) { - /* been converted, ignore */ - goto next_block; - } - WARN_ON(imap.br_blockcount == 0); - - /* - * Note: while we initialise the firstblock/dfops pair, they - * should never be used because blocks should never be - * allocated or freed for a delalloc extent and hence we need - * don't cancel or finish them after the xfs_bunmapi() call. - */ - xfs_defer_init(&dfops, &firstblock); - error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, - &dfops, &done); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) - break; + return error; + } - ASSERT(!xfs_defer_has_unfinished_work(&dfops)); -next_block: - start_fsb++; - remaining--; - } while(remaining > 0); + if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) + return 0; + + while (got.br_startoff + got.br_blockcount > start_fsb) { + del = got; + xfs_trim_extent(&del, start_fsb, length); + + /* + * A delete can push the cursor forward. Step back to the + * previous extent on non-delalloc or extents outside the + * target range. + */ + if (!del.br_blockcount || + !isnullstartblock(del.br_startblock)) { + if (!xfs_iext_prev_extent(ifp, &icur, &got)) + break; + continue; + } + + error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, + &got, &del); + if (error || !xfs_iext_get_extent(ifp, &icur, &got)) + break; + } return error; } @@ -1208,7 +1187,22 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - return iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); + error = iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); + if (error) + return error; + + /* + * If we zeroed right up to EOF and EOF straddles a page boundary we + * must make sure that the post-EOF area is also zeroed because the + * page could be mmap'd and iomap_zero_range doesn't do that for us. + * Writeback of the eof page will do this, albeit clumsily. + */ + if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) { + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + (offset + len) & ~PAGE_MASK, LLONG_MAX); + } + + return error; } /* @@ -1404,6 +1398,10 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); + error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); + if (error) + return error; + error = xfs_prepare_shift(ip, offset); if (error) return error; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index c34fa9c342f2..c7157bc48bd1 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -513,8 +513,8 @@ xfs_getfsmap_rtdev_rtbitmap_query( struct xfs_trans *tp, struct xfs_getfsmap_info *info) { - struct xfs_rtalloc_rec alow; - struct xfs_rtalloc_rec ahigh; + struct xfs_rtalloc_rec alow = { 0 }; + struct xfs_rtalloc_rec ahigh = { 0 }; int error; xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a7afcad6b711..3f2bd6032cf8 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -387,7 +387,7 @@ xfs_reserve_blocks( do { free = percpu_counter_sum(&mp->m_fdblocks) - mp->m_alloc_set_aside; - if (!free) + if (free <= 0) break; delta = request - mp->m_resblks; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7a96c4e0ab5c..5df4de666cc1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3236,7 +3236,6 @@ xfs_iflush_cluster( struct xfs_inode *cip; int nr_found; int clcount = 0; - int bufwasdelwri; int i; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -3360,37 +3359,22 @@ cluster_corrupt_out: * inode buffer and shut down the filesystem. */ rcu_read_unlock(); - /* - * Clean up the buffer. If it was delwri, just release it -- - * brelse can handle it with no problems. If not, shut down the - * filesystem before releasing the buffer. - */ - bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); - if (bufwasdelwri) - xfs_buf_relse(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - if (!bufwasdelwri) { - /* - * Just like incore_relse: if we have b_iodone functions, - * mark the buffer as an error and call them. Otherwise - * mark it as stale and brelse. - */ - if (bp->b_iodone) { - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp); - } else { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - } - } - /* - * Unlocks the flush lock + * We'll always have an inode attached to the buffer for completion + * process by the time we are called from xfs_iflush(). Hence we have + * always need to do IO completion processing to abort the inodes + * attached to the buffer. handle them just like the shutdown case in + * xfs_buf_submit(). */ + ASSERT(bp->b_iodone); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_ioend(bp); + + /* abort the corrupt inode, as it was not attached to the buffer */ xfs_iflush_abort(cip, false); kmem_free(cilist); xfs_perag_put(pag); @@ -3486,12 +3470,17 @@ xfs_iflush( xfs_log_force(mp, 0); /* - * inode clustering: - * see if other inodes can be gathered into this write + * inode clustering: try to gather other inodes into this write + * + * Note: Any error during clustering will result in the filesystem + * being shut down and completion callbacks run on the cluster buffer. + * As we have already flushed and attached this inode to the buffer, + * it has already been aborted and released by xfs_iflush_cluster() and + * so we have no further error handling to do here. */ error = xfs_iflush_cluster(ip, bp); if (error) - goto cluster_corrupt_out; + return error; *bpp = bp; return 0; @@ -3500,12 +3489,8 @@ corrupt_out: if (bp) xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -cluster_corrupt_out: - error = -EFSCORRUPTED; abort_out: - /* - * Unlocks the flush lock - */ + /* abort the corrupt inode, as it was not attached to the buffer */ xfs_iflush_abort(ip, false); return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 49f5492eed3b..55876dd02f0c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -963,12 +963,13 @@ xfs_ilock_for_iomap( unsigned *lockmode) { unsigned mode = XFS_ILOCK_SHARED; + bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) { + if (xfs_is_reflink_inode(ip) && is_write) { /* * FIXME: It could still overwrite on unshared extents and not * need allocation. @@ -989,6 +990,7 @@ xfs_ilock_for_iomap( mode = XFS_ILOCK_EXCL; } +relock: if (flags & IOMAP_NOWAIT) { if (!xfs_ilock_nowait(ip, mode)) return -EAGAIN; @@ -996,6 +998,17 @@ xfs_ilock_for_iomap( xfs_ilock(ip, mode); } + /* + * The reflink iflag could have changed since the earlier unlocked + * check, so if we got ILOCK_SHARED for a write and but we're now a + * reflink inode we have to switch to ILOCK_EXCL and relock. + */ + if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { + xfs_iunlock(ip, mode); + mode = XFS_ILOCK_EXCL; + goto relock; + } + *lockmode = mode; return 0; } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index e040af120b69..524f543c5b82 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -258,7 +258,12 @@ xfs_trans_alloc( if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); - WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + /* + * Zero-reservation ("empty") transactions can't modify anything, so + * they're allowed to run while we're frozen. + */ + WARN_ON(resp->tr_logres > 0 && + mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); tp = kmem_zone_zalloc(xfs_trans_zone,