From 781c036b678c9ae978efb5f89311d0b5e6748b10 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Mon, 13 Apr 2020 12:24:22 +0800 Subject: [PATCH 01/58] ext4: remove unnecessary test_opt for DIOREAD_NOLOCK The DIOREAD_NOLOCK flag has been cleared when doing the test_opt that is meaningless, so remove the unnecessary test_opt for DIOREAD_NOLOCK. Signed-off-by: Kaixu Xia Link: https://lore.kernel.org/r/1586751862-19437-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..79e07e69cef9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3971,17 +3971,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); goto failed_mount; } - if (test_opt(sb, DIOREAD_NOLOCK)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dioread_nolock"); - goto failed_mount; - } if (test_opt(sb, DAX)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); From ef5fd681d5159d64c464715d657660f0151c7419 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Wed, 15 Apr 2020 15:25:42 +0800 Subject: [PATCH 02/58] ext4: remove redundant variable has_bigalloc in ext4_fill_super We can use the ext4_has_feature_bigalloc() function directly to check bigalloc feature and the variable has_bigalloc is reduncant, so remove it. Signed-off-by: Kaixu Xia Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/1586935542-29588-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 79e07e69cef9..49821d8a0910 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3679,7 +3679,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int blocksize, clustersize; unsigned int db_count; unsigned int i; - int needs_recovery, has_huge_files, has_bigalloc; + int needs_recovery, has_huge_files; __u64 blocks_count; int err = 0; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -4194,8 +4194,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); - has_bigalloc = ext4_has_feature_bigalloc(sb); - if (has_bigalloc) { + if (ext4_has_feature_bigalloc(sb)) { if (clustersize < blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " From 6b6aeffc932d5469c0dbb114bee59f34e8e02e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Guerrero=20=C3=81lvarez?= Date: Thu, 16 Apr 2020 16:14:56 +0200 Subject: [PATCH 03/58] ext4: fix a style issue in fs/ext4/acl.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed an if statement where braces were not needed. Link: https://lore.kernel.org/r/20200416141456.1089-1-carlosteniswarrior@gmail.com Signed-off-by: Carlos Guerrero Álvarez Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani --- fs/ext4/acl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 8c7bbf3e566d..b3eba92f38f5 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -215,9 +215,8 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, value, size, xattr_flags); kfree(value); - if (!error) { + if (!error) set_cached_acl(inode, type, acl); - } return error; } From 9e52484c713321e84e8834803a44ca0a001376d2 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Wed, 15 Apr 2020 16:31:39 -0400 Subject: [PATCH 04/58] ext4: remove EXT4_GET_BLOCKS_KEEP_SIZE flag The eofblocks code was removed in the 5.7 release by "ext4: remove EOFBLOCKS_FL and associated code" (4337ecd1fe99). The ext4_map_blocks() flag used to trigger it can now be removed as well. Signed-off-by: Eric Whitney Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200415203140.30349-2-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 -- fs/ext4/extents.c | 4 ---- fs/ext4/inode.c | 12 ++++-------- include/trace/events/ext4.h | 1 - 4 files changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 91eb4381cae5..c8d060627448 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -609,8 +609,6 @@ enum { #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 - /* Request will not result in inode size update (user for fallocate) */ -#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* Convert written extents to unwritten */ #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 /* Write zeros to newly created written extents */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f2b577b315a0..5809891eadab 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4507,8 +4507,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - if (mode & FALLOC_FL_KEEP_SIZE) - flags |= EXT4_GET_BLOCKS_KEEP_SIZE; /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); @@ -4647,8 +4645,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - if (mode & FALLOC_FL_KEEP_SIZE) - flags |= EXT4_GET_BLOCKS_KEEP_SIZE; inode_lock(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..693a3722337a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -432,11 +432,9 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -541,11 +539,9 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ind_map_blocks(handle, inode, map, 0); } if (retval > 0) { unsigned int status; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 19c87661eeec..40ff8a2fc763 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -45,7 +45,6 @@ struct partial_cluster; { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ - { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \ { EXT4_GET_BLOCKS_ZERO, "ZERO" }) /* From 493e83aafa02316bc79ec90041c378d7902194fa Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Wed, 15 Apr 2020 16:31:40 -0400 Subject: [PATCH 05/58] ext4: translate a few more map flags to strings in tracepoints As new ext4_map_blocks() flags have been added, not all have gotten flag bit to string translations to make tracepoint output more readable. Fix that, and go one step further by adding a translation for the EXT4_EX_NOCACHE flag as well. The EXT4_EX_FORCE_CACHE flag can never be set in a tracepoint in the current code, so there's no need to bother with a translation for it right now. Signed-off-by: Eric Whitney Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200415203140.30349-3-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- include/trace/events/ext4.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 40ff8a2fc763..280475c1cecc 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -45,7 +45,10 @@ struct partial_cluster; { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ - { EXT4_GET_BLOCKS_ZERO, "ZERO" }) + { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, "CONVERT_UNWRITTEN" }, \ + { EXT4_GET_BLOCKS_ZERO, "ZERO" }, \ + { EXT4_GET_BLOCKS_IO_SUBMIT, "IO_SUBMIT" }, \ + { EXT4_EX_NOCACHE, "EX_NOCACHE" }) /* * __print_flags() requires that all enum values be wrapped in the From 39c0ae163f3b3ae691e7cce226ba1984ef6456b1 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 20 Apr 2020 12:29:18 +0800 Subject: [PATCH 06/58] ext4: remove unnecessary comparisons to bool Fix the following coccicheck warning: fs/ext4/extents_status.c:1057:5-28: WARNING: Comparison to bool fs/ext4/inode.c:2314:18-24: WARNING: Comparison to bool Signed-off-by: Jason Yan Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20200420042918.19459-1-yanaijie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 2 +- fs/ext4/inode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d996b44d2265..e75171535375 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1054,7 +1054,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; /* record the first block of the first delonly extent seen */ - if (rc->first_do_lblk_found == false) { + if (!rc->first_do_lblk_found) { rc->first_do_lblk = i; rc->first_do_lblk_found = true; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 693a3722337a..4a3381eb1bbe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2307,7 +2307,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ - if (err < 0 || map_bh == true) + if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); From c36a71b4e35ab35340facdd6964a00956b9fef0a Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Mon, 20 Apr 2020 19:39:59 -0700 Subject: [PATCH 07/58] ext4: fix EXT_MAX_EXTENT/INDEX to check for zeroed eh_max If eh->eh_max is 0, EXT_MAX_EXTENT/INDEX would evaluate to unsigned (-1) resulting in illegal memory accesses. Although there is no consistent repro, we see that generic/019 sometimes crashes because of this bug. Ran gce-xfstests smoke and verified that there were no regressions. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20200421023959.20879-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4_extents.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 1c216fcc202a..44e59881a1f0 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -170,10 +170,13 @@ struct partial_cluster { (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_LAST_INDEX(__hdr__) \ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) -#define EXT_MAX_EXTENT(__hdr__) \ - (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + ((le16_to_cpu((__hdr__)->eh_max)) ? \ + ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ + : 0) #define EXT_MAX_INDEX(__hdr__) \ - (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + ((le16_to_cpu((__hdr__)->eh_max)) ? \ + ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { From 3bbd0ef26098d241dc59ee77ba14b7dab0df0786 Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Thu, 23 Apr 2020 13:09:27 +0800 Subject: [PATCH 08/58] ext4: fix buffer_head refcnt leak when ext4_iget() fails ext4_orphan_get() invokes ext4_read_inode_bitmap(), which returns a reference of the specified buffer_head object to "bitmap_bh" with increased refcnt. When ext4_orphan_get() returns, local variable "bitmap_bh" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in one exception handling path of ext4_orphan_get(). When ext4_iget() fails, the function forgets to decrease the refcnt increased by ext4_read_inode_bitmap(), causing a refcnt leak. Fix this issue by calling brelse() when ext4_iget() fails. Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Cc: stable@kernel.org Link: https://lore.kernel.org/r/1587618568-13418-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4b8c9a9bdf0c..011bcb8c4770 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1246,6 +1246,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) ext4_error_err(sb, -err, "couldn't read orphan inode %lu (err %d)", ino, err); + brelse(bitmap_bh); return inode; } From 4301efa4c7cca11556dd89899eee066d49b47bf7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Apr 2020 10:54:44 +0200 Subject: [PATCH 09/58] writeback: Export inode_io_list_del() Ext4 needs to remove inode from writeback lists after it is out of visibility of its journalling machinery (which can still dirty the inode). Export inode_io_list_del() for it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200421085445.5731-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/fs-writeback.c | 1 + fs/internal.h | 2 -- include/linux/writeback.h | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..e58bd5f758d0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1126,6 +1126,7 @@ void inode_io_list_del(struct inode *inode) inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } +EXPORT_SYMBOL(inode_io_list_del); /* * mark an inode as under writeback on the sb diff --git a/fs/internal.h b/fs/internal.h index aa5d45524e87..8819d0d58b03 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -143,8 +143,6 @@ extern int dentry_needs_remove_privs(struct dentry *dentry); /* * fs-writeback.c */ -extern void inode_io_list_del(struct inode *inode); - extern long get_nr_dirty_inodes(void); extern int invalidate_inodes(struct super_block *, bool); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a19d845dd7eb..902aa317621b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -197,6 +197,7 @@ void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); +void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) From ceff86fddae8748fe00d4f2d249cb02cae62ad84 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Apr 2020 10:54:45 +0200 Subject: [PATCH 10/58] ext4: Avoid freeing inodes on dirty list When we are evicting inode with journalled data, we may race with transaction commit in the following way: CPU0 CPU1 jbd2_journal_commit_transaction() evict(inode) inode_io_list_del() inode_wait_for_writeback() process BJ_Forget list __jbd2_journal_insert_checkpoint() __jbd2_journal_refile_buffer() __jbd2_journal_unfile_buffer() if (test_clear_buffer_jbddirty(bh)) mark_buffer_dirty(bh) __mark_inode_dirty(inode) ext4_evict_inode(inode) frees the inode This results in use-after-free issues in the writeback code (or the assertion added in the previous commit triggering). Fix the problem by removing inode from writeback lists once all the page cache is evicted and so inode cannot be added to writeback lists again. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200421085445.5731-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4a3381eb1bbe..a7087ff533bb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -220,6 +220,16 @@ void ext4_evict_inode(struct inode *inode) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); + /* + * For inodes with journalled data, transaction commit could have + * dirtied the inode. Flush worker is ignoring it because of I_FREEING + * flag but we still need to remove the inode from the writeback lists. + */ + if (!list_empty_careful(&inode->i_io_list)) { + WARN_ON_ONCE(!ext4_should_journal_data(inode)); + inode_io_list_del(inode); + } + /* * Protect us against freezing - iput() caller didn't have to have any * protection against it From 8418897f1bf87da0cb6936489d57a4320c32c0af Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 23 Apr 2020 15:46:44 +0800 Subject: [PATCH 11/58] ext4: fix error pointer dereference Don't pass error pointers to brelse(). commit 7159a986b420 ("ext4: fix some error pointer dereferences") has fixed some cases, fix the remaining one case. Once ext4_xattr_block_find()->ext4_sb_bread() failed, error pointer is stored in @bs->bh, which will be passed to brelse() in the cleanup routine of ext4_xattr_set_handle(). This will then cause a NULL panic crash in __brelse(). BUG: unable to handle kernel NULL pointer dereference at 000000000000005b RIP: 0010:__brelse+0x1b/0x50 Call Trace: ext4_xattr_set_handle+0x163/0x5d0 ext4_xattr_set+0x95/0x110 __vfs_setxattr+0x6b/0x80 __vfs_setxattr_noperm+0x68/0x1b0 vfs_setxattr+0xa0/0xb0 setxattr+0x12c/0x1a0 path_setxattr+0x8d/0xc0 __x64_sys_setxattr+0x27/0x30 do_syscall_64+0x60/0x250 entry_SYSCALL_64_after_hwframe+0x49/0xbe In this case, @bs->bh stores '-EIO' actually. Fixes: fb265c9cb49e ("ext4: add ext4_sb_bread() to disambiguate ENOMEM cases") Signed-off-by: Jeffle Xu Reviewed-by: Joseph Qi Cc: stable@kernel.org # 2.6.19 Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/1587628004-95123-1-git-send-email-jefflexu@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 21df43a25328..01ba66373e97 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1800,8 +1800,11 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); - if (IS_ERR(bs->bh)) - return PTR_ERR(bs->bh); + if (IS_ERR(bs->bh)) { + error = PTR_ERR(bs->bh); + bs->bh = NULL; + return error; + } ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); From 4209ae12b12265d475bba28634184423149bd14f Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Sun, 26 Apr 2020 18:34:37 -0700 Subject: [PATCH 12/58] ext4: handle ext4_mark_inode_dirty errors ext4_mark_inode_dirty() can fail for real reasons. Ignoring its return value may lead ext4 to ignore real failures that would result in corruption / crashes. Harden ext4_mark_inode_dirty error paths to fail as soon as possible and return errors to the caller whenever appropriate. One of the possible scnearios when this bug could affected is that while creating a new inode, its directory entry gets added successfully but while writing the inode itself mark_inode_dirty returns error which is ignored. This would result in inconsistency that the directory entry points to a non-existent inode. Ran gce-xfstests smoke tests and verified that there were no regressions. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20200427013438.219117-1-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/acl.c | 2 +- fs/ext4/ext4.h | 2 +- fs/ext4/ext4_jbd2.h | 5 ++- fs/ext4/extents.c | 34 +++++++++++--------- fs/ext4/file.c | 11 +++++-- fs/ext4/indirect.c | 4 ++- fs/ext4/inline.c | 6 ++-- fs/ext4/inode.c | 38 ++++++++++++++++------- fs/ext4/migrate.c | 12 ++++--- fs/ext4/namei.c | 76 +++++++++++++++++++++++++++++---------------- fs/ext4/super.c | 16 ++++++---- fs/ext4/xattr.c | 6 ++-- 12 files changed, 139 insertions(+), 73 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index b3eba92f38f5..76f634d185f1 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -255,7 +255,7 @@ retry: if (!error && update_mode) { inode->i_mode = mode; inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + error = ext4_mark_inode_dirty(handle, inode); } out_stop: ext4_journal_stop(handle); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c8d060627448..884ce3086486 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3352,7 +3352,7 @@ struct ext4_extent; */ #define EXT_MAX_BLOCKS 0xffffffff -extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 4b9002f0e84c..3bacf76d2609 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -222,7 +222,10 @@ ext4_mark_iloc_dirty(handle_t *handle, int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); -int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); +#define ext4_mark_inode_dirty(__h, __i) \ + __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__) +int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, + const char *func, unsigned int line); int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5809891eadab..49ea6973d65f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -816,7 +816,7 @@ ext4_ext_binsearch(struct inode *inode, } -int ext4_ext_tree_init(handle_t *handle, struct inode *inode) +void ext4_ext_tree_init(handle_t *handle, struct inode *inode) { struct ext4_extent_header *eh; @@ -826,7 +826,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); - return 0; } struct ext4_ext_path * @@ -1319,7 +1318,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_idx_pblock(EXT_FIRST_INDEX(neh))); le16_add_cpu(&neh->eh_depth, 1); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -4363,7 +4362,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, struct inode *inode = file_inode(file); handle_t *handle; int ret = 0; - int ret2 = 0; + int ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; @@ -4423,10 +4422,11 @@ retry: if (ext4_update_inode_size(inode, epos) & 0x1) inode->i_mtime = inode->i_ctime; } - ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); - ret2 = ext4_journal_stop(handle); - if (ret2) + ret3 = ext4_journal_stop(handle); + ret2 = ret3 ? ret3 : ret2; + if (unlikely(ret2)) break; } if (ret == -ENOSPC && @@ -4575,7 +4575,9 @@ static long ext4_zero_range(struct file *file, loff_t offset, inode->i_mtime = inode->i_ctime = current_time(inode); if (new_size) ext4_update_inode_size(inode, new_size); - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); @@ -4585,6 +4587,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (file->f_flags & O_SYNC) ext4_handle_sync(handle); +out_handle: ext4_journal_stop(handle); out_mutex: inode_unlock(inode); @@ -4696,8 +4699,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; - int ret = 0; - int ret2 = 0; + int ret = 0, ret2 = 0, ret3 = 0; struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; @@ -4730,9 +4732,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); - ext4_mark_inode_dirty(handle, inode); - if (credits) - ret2 = ext4_journal_stop(handle); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (credits) { + ret3 = ext4_journal_stop(handle); + if (unlikely(ret3)) + ret2 = ret3; + } + if (ret <= 0 || ret2) break; } @@ -5269,7 +5275,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (IS_SYNC(inode)) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0d624250a62b..b8e69f9e3858 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -287,6 +287,7 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, bool truncate = false; u8 blkbits = inode->i_blkbits; ext4_lblk_t written_blk, end_blk; + int ret; /* * Note that EXT4_I(inode)->i_disksize can get extended up to @@ -327,8 +328,14 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, goto truncate; } - if (ext4_update_inode_size(inode, offset + written)) - ext4_mark_inode_dirty(handle, inode); + if (ext4_update_inode_size(inode, offset + written)) { + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) { + written = ret; + ext4_journal_stop(handle); + goto truncate; + } + } /* * We may need to truncate allocated but not written blocks beyond EOF. diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 107f0043f67f..be2b66eb65f7 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -467,7 +467,9 @@ static int ext4_splice_branch(handle_t *handle, /* * OK, we spliced it into the inode itself on a direct block. */ - ext4_mark_inode_dirty(handle, ar->inode); + err = ext4_mark_inode_dirty(handle, ar->inode); + if (unlikely(err)) + goto err_out; jbd_debug(5, "splicing direct\n"); } return err; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f35e289e17aa..c3a1ad2db122 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1260,7 +1260,7 @@ out: int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - int ret, inline_size, no_expand; + int ret, ret2, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; @@ -1314,7 +1314,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, out: ext4_write_unlock_xattr(dir, &no_expand); - ext4_mark_inode_dirty(handle, dir); + ret2 = ext4_mark_inode_dirty(handle, dir); + if (unlikely(ret2 && !ret)) + ret = ret2; brelse(iloc.bh); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a7087ff533bb..456e8a6b4809 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1302,7 +1302,7 @@ static int ext4_write_end(struct file *file, * filesystems. */ if (i_size_changed || inline_data) - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied @@ -3083,7 +3083,7 @@ static int ext4_da_write_end(struct file *file, * new_i_size is less that inode->i_size * bu greater than i_disksize.(hint delalloc) */ - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); } } @@ -3100,7 +3100,7 @@ static int ext4_da_write_end(struct file *file, if (ret2 < 0) ret = ret2; ret2 = ext4_journal_stop(handle); - if (!ret) + if (unlikely(ret2 && !ret)) ret = ret2; return ret ? ret : copied; @@ -3892,6 +3892,8 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) { handle_t *handle; + int ret; + loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); @@ -3905,10 +3907,10 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, if (IS_ERR(handle)) return PTR_ERR(handle); ext4_update_i_disksize(inode, size); - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); - return 0; + return ret; } static void ext4_wait_dax_page(struct ext4_inode_info *ei) @@ -3960,7 +3962,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) loff_t first_block_offset, last_block_offset; handle_t *handle; unsigned int credits; - int ret = 0; + int ret = 0, ret2 = 0; trace_ext4_punch_hole(inode, offset, length, 0); @@ -4083,7 +4085,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret2)) + ret = ret2; if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: @@ -4152,7 +4156,7 @@ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; - int err = 0; + int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; @@ -4240,7 +4244,9 @@ out_stop: ext4_orphan_del(handle, inode); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2 && !err)) + err = err2; ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); @@ -5298,6 +5304,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) inode->i_gid = attr->ia_gid; error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); + if (unlikely(error)) + return error; } if (attr->ia_valid & ATTR_SIZE) { @@ -5783,7 +5791,8 @@ out_unlock: * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. */ -int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) +int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, + const char *func, unsigned int line) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -5793,13 +5802,18 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) - return err; + goto out; if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); - return ext4_mark_iloc_dirty(handle, inode, &iloc); + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +out: + if (unlikely(err)) + ext4_error_inode_err(inode, func, line, 0, err, + "mark_inode_dirty error"); + return err; } /* diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index fb6520f37135..c5e3fc998211 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -287,7 +287,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, struct inode *tmp_inode) { - int retval; + int retval, retval2 = 0; __le32 i_data[3]; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); @@ -342,7 +342,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * i_blocks when freeing the indirect meta-data blocks */ retval = free_ind_block(handle, inode, i_data); - ext4_mark_inode_dirty(handle, inode); + retval2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(retval2 && !retval)) + retval = retval2; err_out: return retval; @@ -601,7 +603,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_lblk_t start, end; ext4_fsblk_t blk; handle_t *handle; - int ret; + int ret, ret2 = 0; if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) @@ -655,7 +657,9 @@ int ext4_ind_migrate(struct inode *inode) memset(ei->i_data, 0, sizeof(ei->i_data)); for (i = start; i <= end; i++) ei->i_data[i] = cpu_to_le32(blk++); - ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret2 && !ret)) + ret = ret2; errout: ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a8aca4772aaa..56738b538ddf 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1993,7 +1993,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, { unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; - int err; + int err, err2; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); @@ -2028,12 +2028,12 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); - ext4_mark_inode_dirty(handle, dir); + err2 = ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); - return 0; + return err ? err : err2; } /* @@ -2223,7 +2223,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; - ext4_mark_inode_dirty(handle, dir); + retval = ext4_mark_inode_dirty(handle, dir); + if (unlikely(retval)) + goto out; } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { @@ -2576,12 +2578,12 @@ static int ext4_add_nondir(handle_t *handle, struct inode *inode = *inodep; int err = ext4_add_entry(handle, dentry, inode); if (!err) { - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); d_instantiate_new(dentry, inode); *inodep = NULL; - return 0; + return err; } drop_nlink(inode); ext4_orphan_add(handle, inode); @@ -2775,7 +2777,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; - int err, credits, retries = 0; + int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; @@ -2808,7 +2810,9 @@ out_clear_inode: clear_nlink(inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); - ext4_mark_inode_dirty(handle, inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2)) + err = err2; ext4_journal_stop(handle); iput(inode); goto out_retry; @@ -3148,10 +3152,12 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) inode->i_size = 0; ext4_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + retval = ext4_mark_inode_dirty(handle, inode); + if (retval) + goto end_rmdir; ext4_dec_count(handle, dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + retval = ext4_mark_inode_dirty(handle, dir); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and @@ -3221,7 +3227,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) goto end_unlink; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + retval = ext4_mark_inode_dirty(handle, dir); + if (retval) + goto end_unlink; if (inode->i_nlink == 0) ext4_warning_inode(inode, "Deleting file '%.*s' with no links", dentry->d_name.len, dentry->d_name.name); @@ -3230,7 +3238,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + retval = ext4_mark_inode_dirty(handle, inode); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and @@ -3419,7 +3427,7 @@ retry: err = ext4_add_entry(handle, dentry, inode); if (!err) { - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time */ @@ -3531,7 +3539,7 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, static int ext4_setent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { - int retval; + int retval, retval2; BUFFER_TRACE(ent->bh, "get write access"); retval = ext4_journal_get_write_access(handle, ent->bh); @@ -3543,19 +3551,19 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, inode_inc_iversion(ent->dir); ent->dir->i_ctime = ent->dir->i_mtime = current_time(ent->dir); - ext4_mark_inode_dirty(handle, ent->dir); + retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { - retval = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); - if (unlikely(retval)) { - ext4_std_error(ent->dir->i_sb, retval); - return retval; + retval2 = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); + if (unlikely(retval2)) { + ext4_std_error(ent->dir->i_sb, retval2); + return retval2; } } brelse(ent->bh); ent->bh = NULL; - return 0; + return retval; } static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, @@ -3790,7 +3798,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_FT_CHRDEV); if (retval) goto end_rename; - ext4_mark_inode_dirty(handle, whiteout); + retval = ext4_mark_inode_dirty(handle, whiteout); + if (unlikely(retval)) + goto end_rename; } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); @@ -3811,7 +3821,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * rename. */ old.inode->i_ctime = current_time(old.inode); - ext4_mark_inode_dirty(handle, old.inode); + retval = ext4_mark_inode_dirty(handle, old.inode); + if (unlikely(retval)) + goto end_rename; if (!whiteout) { /* @@ -3840,12 +3852,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, } else { ext4_inc_count(handle, new.dir); ext4_update_dx_flag(new.dir); - ext4_mark_inode_dirty(handle, new.dir); + retval = ext4_mark_inode_dirty(handle, new.dir); + if (unlikely(retval)) + goto end_rename; } } - ext4_mark_inode_dirty(handle, old.dir); + retval = ext4_mark_inode_dirty(handle, old.dir); + if (unlikely(retval)) + goto end_rename; if (new.inode) { - ext4_mark_inode_dirty(handle, new.inode); + retval = ext4_mark_inode_dirty(handle, new.inode); + if (unlikely(retval)) + goto end_rename; if (!new.inode->i_nlink) ext4_orphan_add(handle, new.inode); } @@ -3979,8 +3997,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, ctime = current_time(old.inode); old.inode->i_ctime = ctime; new.inode->i_ctime = ctime; - ext4_mark_inode_dirty(handle, old.inode); - ext4_mark_inode_dirty(handle, new.inode); + retval = ext4_mark_inode_dirty(handle, old.inode); + if (unlikely(retval)) + goto end_rename; + retval = ext4_mark_inode_dirty(handle, new.inode); + if (unlikely(retval)) + goto end_rename; if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 49821d8a0910..c3983f87587d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5880,7 +5880,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); @@ -5982,12 +5982,14 @@ static int ext4_quota_off(struct super_block *sb, int type) * this is not a hard failure and quotas are already disabled. */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + err = PTR_ERR(handle); goto out_unlock; + } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: inode_unlock(inode); @@ -6045,7 +6047,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err, offset = off & (sb->s_blocksize - 1); + int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -6093,9 +6095,11 @@ out: if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2 && !err)) + err = err2; } - return len; + return err ? err : len; } #endif diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 01ba66373e97..9b29a40738ac 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1327,7 +1327,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, int blocksize = ea_inode->i_sb->s_blocksize; int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; int csize, wsize = 0; - int ret = 0; + int ret = 0, ret2 = 0; int retries = 0; retry: @@ -1385,7 +1385,9 @@ retry: ext4_update_i_disksize(ea_inode, wsize); inode_unlock(ea_inode); - ext4_mark_inode_dirty(handle, ea_inode); + ret2 = ext4_mark_inode_dirty(handle, ea_inode); + if (unlikely(ret2 && !ret)) + ret = ret2; out: brelse(bh); From b60ca3343e78761c6ebe6ff52c628893c8297b73 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Sun, 26 Apr 2020 18:34:38 -0700 Subject: [PATCH 13/58] ext4: don't ignore return values from ext4_ext_dirty() Don't ignore return values from ext4_ext_dirty, since the errors indicate valid failures below Ext4. In all of the other instances of ext4_ext_dirty calls, the error return value is handled in some way. This patch makes those remaining couple of places to handle ext4_ext_dirty errors as well. In case of ext4_split_extent_at(), the ignorance of return value is intentional. The reason is that we are already in error path and there isn't much we can do if ext4_ext_dirty returns error. This patch adds a comment for that case explaining why we ignore the return value. In the longer run, we probably should make sure that errors from other mark_dirty routines are handled as well. Ran gce-xfstests smoke tests and verified that there were no regressions. Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200427013438.219117-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 49ea6973d65f..6befdecf977a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3243,6 +3243,10 @@ out: fix_extent_len: ex->ee_len = orig_ex.ee_len; + /* + * Ignore ext4_ext_dirty return value since we are already in error path + * and err is a non-zero error code. + */ ext4_ext_dirty(handle, inode, path + path->p_depth); return err; } @@ -3502,7 +3506,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } if (allocated) { /* Mark the block containing both extents as dirty */ - ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; From ee802f86899341d835c629a4ad9607135f1b71a2 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 30 Apr 2020 14:53:17 -0400 Subject: [PATCH 14/58] ext4: remove dead GET_BLOCKS_ZERO code There's no call to ext4_map_blocks() in the current ext4 code with a flags argument that combines EXT4_GET_BLOCKS_CONVERT and EXT4_GET_BLOCKS_ZERO. Remove the code that corresponds to this case from ext4_ext_handle_unwritten_extents(). Signed-off-by: Eric Whitney Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20200430185320.23001-2-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6befdecf977a..728f4a620fde 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3829,14 +3829,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { - if (flags & EXT4_GET_BLOCKS_ZERO) { - if (allocated > map->m_len) - allocated = map->m_len; - err = ext4_issue_zeroout(inode, map->m_lblk, newblock, - allocated); - if (err < 0) - goto out2; - } ret = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (ret >= 0) From bee6cf00c7f17df27c842c169db31d53bdd775ba Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 30 Apr 2020 14:53:18 -0400 Subject: [PATCH 15/58] ext4: remove redundant GET_BLOCKS_CONVERT code Remove the redundant code assigning values to ext4_map_blocks components in ext4_ext_handle_unwritten_extents() for the EXT4_GET_BLOCKS_CONVERT case, using the code at the function exit instead. Clean up and reorder that code to eliminate more redundancy and improve readability. Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20200430185320.23001-3-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 728f4a620fde..57f02e2d2f52 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3829,20 +3829,14 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { - ret = ext4_convert_unwritten_extents_endio(handle, inode, map, + err = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); - else - err = ret; - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = newblock; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; - goto out2; + if (err < 0) + goto out2; + ext4_update_inode_fsync_trans(handle, inode, 1); + goto map_out; } - /* buffered IO case */ + /* buffered IO cases */ /* * repeat fallocate creation request * we already have an unwritten extent @@ -3876,18 +3870,14 @@ out: } else allocated = ret; map->m_flags |= EXT4_MAP_NEW; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; - map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: + map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; - ext4_ext_show_leaf(inode, path); - map->m_pblk = newblock; map->m_len = allocated; + ext4_ext_show_leaf(inode, path); out2: return err ? err : allocated; } From 779e26517b3600830fe58933d5f97627711c9435 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 30 Apr 2020 14:53:19 -0400 Subject: [PATCH 16/58] ext4: clean up GET_BLOCKS_PRE_IO error handling If the call to ext4_split_convert_extents() fails in the EXT4_GET_BLOCKS_PRE_IO case within ext4_ext_handle_unwritten_extents(), error out through the exit point at function end rather than jumping through an intermediate point. Fix the error handling in the event ext4_split_convert_extents() returns 0, which it shouldn't do when splitting an existing extent. The current code returns the passed in value of allocated (which is likely non-zero) while failing to set m_flags, m_pblk, and m_len. Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20200430185320.23001-4-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 57f02e2d2f52..c63bc13f9a72 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3818,12 +3818,25 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); - /* get_block() before submit the IO, split the extent */ + /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { ret = ext4_split_convert_extents(handle, inode, map, ppath, flags | EXT4_GET_BLOCKS_CONVERT); - if (ret <= 0) - goto out; + if (ret < 0) { + err = ret; + goto out2; + } + /* + * shouldn't get a 0 return when splitting an extent unless + * m_len is 0 (bug) or extent has been corrupted + */ + if (unlikely(ret == 0)) { + EXT4_ERROR_INODE(inode, + "unexpected ret == 0, m_len = %u", + map->m_len); + err = -EFSCORRUPTED; + goto out2; + } map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } @@ -3863,12 +3876,13 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); -out: + if (ret <= 0) { err = ret; goto out2; - } else - allocated = ret; + } +out: + allocated = ret; map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; From be809e1274ebc043640eeeb287accb7b4a4bcbff Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 30 Apr 2020 14:53:20 -0400 Subject: [PATCH 17/58] ext4: clean up ext4_ext_convert_to_initialized() error handling If ext4_ext_convert_to_initialized() fails when called within ext4_ext_handle_unwritten_extents(), immediately error out through the exit point at function end. Fix the error handling in the event ext4_ext_convert_to_initialized() returns 0, which it shouldn't do when converting an existing extent. The current code returns the passed in value of allocated (which is likely non-zero) while failing to set m_flags, m_pblk, and m_len. Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20200430185320.23001-5-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c63bc13f9a72..03b94c6e90cc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3872,15 +3872,28 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, goto out1; } - /* buffered write, writepage time, convert*/ + /* + * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. + * For buffered writes, at writepage time, etc. Convert a + * discovered unwritten extent to written. + */ ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); - - if (ret <= 0) { + if (ret < 0) { err = ret; goto out2; } + ext4_update_inode_fsync_trans(handle, inode, 1); + /* + * shouldn't get a 0 return when converting an unwritten extent + * unless m_len is 0 (bug) or extent has been corrupted + */ + if (unlikely(ret == 0)) { + EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", + map->m_len); + err = -EFSCORRUPTED; + goto out2; + } + out: allocated = ret; map->m_flags |= EXT4_MAP_NEW; From 80dd4978ddd78b2fe5aec2d52b69cbc2d06b08d8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 3 May 2020 22:06:47 +0200 Subject: [PATCH 18/58] ext4: fix a typo in a comment s/extnets/extents/ Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20200503200647.154701-1-christophe.jaillet@wanadoo.fr Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 03b94c6e90cc..3ca797faa86b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4503,7 +4503,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, inode_lock(inode); /* - * Indirect files do not support unwritten extnets + * Indirect files do not support unwritten extents */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; From 08adf452e628b0e2ce9a01048cfbec52353703d7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 6 May 2020 11:31:40 -0700 Subject: [PATCH 19/58] ext4: fix race between ext4_sync_parent() and rename() 'igrab(d_inode(dentry->d_parent))' without holding dentry->d_lock is broken because without d_lock, d_parent can be concurrently changed due to a rename(). Then if the old directory is immediately deleted, old d_parent->inode can be NULL. That causes a NULL dereference in igrab(). To fix this, use dget_parent() to safely grab a reference to the parent dentry, which pins the inode. This also eliminates the need to use d_find_any_alias() other than for the initial inode, as we no longer throw away the dentry at each step. This is an extremely hard race to hit, but it is possible. Adding a udelay() in between the reads of ->d_parent and its ->d_inode makes it reproducible on a no-journal filesystem using the following program: #include #include int main() { if (fork()) { for (;;) { mkdir("dir1", 0700); int fd = open("dir1/file", O_RDWR|O_CREAT|O_SYNC); write(fd, "X", 1); close(fd); } } else { mkdir("dir2", 0700); for (;;) { rename("dir1/file", "dir2/file"); rmdir("dir1"); } } } Fixes: d59729f4e794 ("ext4: fix races in ext4_sync_parent()") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20200506183140.541194-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fsync.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e10206e7f4bb..093c359952cd 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -44,30 +44,28 @@ */ static int ext4_sync_parent(struct inode *inode) { - struct dentry *dentry = NULL; - struct inode *next; + struct dentry *dentry, *next; int ret = 0; if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) return 0; - inode = igrab(inode); + dentry = d_find_any_alias(inode); + if (!dentry) + return 0; while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); - dentry = d_find_any_alias(inode); - if (!dentry) - break; - next = igrab(d_inode(dentry->d_parent)); + + next = dget_parent(dentry); dput(dentry); - if (!next) - break; - iput(inode); - inode = next; + dentry = next; + inode = dentry->d_inode; + /* * The directory inode may have gone through rmdir by now. But * the inode itself and its blocks are still allocated (we hold - * a reference to the inode so it didn't go through - * ext4_evict_inode()) and so we are safe to flush metadata - * blocks and the inode. + * a reference to the inode via its dentry), so it didn't go + * through ext4_evict_inode()) and so we are safe to flush + * metadata blocks and the inode. */ ret = sync_mapping_buffers(inode->i_mapping); if (ret) @@ -76,7 +74,7 @@ static int ext4_sync_parent(struct inode *inode) if (ret) break; } - iput(inode); + dput(dentry); return ret; } From 73c384c0cdaa8ea9ca9ef2d0cff6a25930f1648e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 7 May 2020 10:50:28 -0700 Subject: [PATCH 20/58] ext4: avoid ext4_error()'s caused by ENOMEM in the truncate path We can't fail in the truncate path without requiring an fsck. Add work around for this by using a combination of retry loops and the __GFP_NOFAIL flag. From: Theodore Ts'o Signed-off-by: Theodore Ts'o Signed-off-by: Anna Pendleton Reviewed-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20200507175028.15061-1-pendleton@google.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 43 +++++++++++++++++++++++++++++++++---------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 884ce3086486..5d901bf92ce9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -630,6 +630,7 @@ enum { */ #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 +#define EXT4_EX_NOFAIL 0x10000000 /* * Flags used by ext4_free_blocks diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3ca797faa86b..ff7eeb5a77ef 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -297,11 +297,14 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, { struct ext4_ext_path *path = *ppath; int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); + int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; + + if (nofail) + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, - EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | - (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); + flags); } static int @@ -487,8 +490,12 @@ __read_extent_tree_block(const char *function, unsigned int line, { struct buffer_head *bh; int err; + gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; - bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; + + bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -837,6 +844,10 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_ext_path *path = orig_path ? *orig_path : NULL; short int depth, i, ppos = 0; int ret; + gfp_t gfp_flags = GFP_NOFS; + + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; eh = ext_inode_hdr(inode); depth = ext_depth(inode); @@ -857,7 +868,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, if (!path) { /* account possible depth increase */ path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), - GFP_NOFS); + gfp_flags); if (unlikely(!path)) return ERR_PTR(-ENOMEM); path[0].p_maxdepth = depth + 1; @@ -1007,9 +1018,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock, oldblock; __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ + gfp_t gfp_flags = GFP_NOFS; int err = 0; size_t ext_size = 0; + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; + /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ @@ -1043,7 +1058,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, * We need this to handle errors and free blocks * upon them. */ - ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), GFP_NOFS); + ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags); if (!ablocks) return -ENOMEM; @@ -2019,7 +2034,7 @@ prepend: if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_find_extent(inode, next, NULL, 0); + npath = ext4_find_extent(inode, next, NULL, gb_flags); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); @@ -2792,7 +2807,8 @@ again: ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ - path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, end, NULL, + EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2878,7 +2894,7 @@ again: le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), - GFP_NOFS); + GFP_NOFS | __GFP_NOFAIL); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; @@ -3303,7 +3319,7 @@ static int ext4_split_extent(handle_t *handle, * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. */ - path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + path = ext4_find_extent(inode, map->m_lblk, ppath, flags); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -4365,7 +4381,14 @@ retry: } if (err) return err; - return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); +retry_remove_space: + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + if (err == -ENOMEM) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_remove_space; + } + return err; } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, From 212da3ec6fafe458cd54c3a365cbcb4f9bd794e1 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:41 +0530 Subject: [PATCH 21/58] ext4: mballoc: print bb_free info even when it is 0 Improve the debugging msg by also printing even if bb_free is 0. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/c894f1d1d30f86ae38f4e3a861949665b6dc61cd.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 30d5d97548c4..bcfaaad62167 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4170,8 +4170,6 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) } ext4_unlock_group(sb, i); - if (grp->bb_free == 0) - continue; printk(KERN_ERR "%u: %d/%d \n", i, grp->bb_free, grp->bb_fragments); } From e68cf40c0d098a63bc571bc5981dee6c2013c494 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:42 +0530 Subject: [PATCH 22/58] ext4: mballoc: refactor ext4_mb_show_ac() This factors out ext4_mb_show_pa() function to show all the group's preallocation info. This could be useful info to be added in later patches. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/8f07d890b0038dcc935e9c10e6043ec9f3792721.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 65 +++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bcfaaad62167..d1464d9110ef 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4120,38 +4120,16 @@ repeat: } #ifdef CONFIG_EXT4_DEBUG -static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +static inline void ext4_mb_show_pa(struct super_block *sb) { - struct super_block *sb = ac->ac_sb; - ext4_group_t ngroups, i; + ext4_group_t i, ngroups; if (!ext4_mballoc_debug || (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; - ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" - " Allocation context details:"); - ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", - ac->ac_status, ac->ac_flags); - ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " - "goal %lu/%lu/%lu@%lu, " - "best %lu/%lu/%lu@%lu cr %d", - (unsigned long)ac->ac_o_ex.fe_group, - (unsigned long)ac->ac_o_ex.fe_start, - (unsigned long)ac->ac_o_ex.fe_len, - (unsigned long)ac->ac_o_ex.fe_logical, - (unsigned long)ac->ac_g_ex.fe_group, - (unsigned long)ac->ac_g_ex.fe_start, - (unsigned long)ac->ac_g_ex.fe_len, - (unsigned long)ac->ac_g_ex.fe_logical, - (unsigned long)ac->ac_b_ex.fe_group, - (unsigned long)ac->ac_b_ex.fe_start, - (unsigned long)ac->ac_b_ex.fe_len, - (unsigned long)ac->ac_b_ex.fe_logical, - (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); - ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); + ext4_msg(sb, KERN_ERR, "groups: "); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; @@ -4175,9 +4153,46 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) } printk(KERN_ERR "\n"); } + +static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + + if (!ext4_mballoc_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + return; + + ext4_msg(sb, KERN_ERR, "Can't allocate:" + " Allocation context details:"); + ext4_msg(sb, KERN_ERR, "status %d flags %d", + ac->ac_status, ac->ac_flags); + ext4_msg(sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", + (unsigned long)ac->ac_o_ex.fe_group, + (unsigned long)ac->ac_o_ex.fe_start, + (unsigned long)ac->ac_o_ex.fe_len, + (unsigned long)ac->ac_o_ex.fe_logical, + (unsigned long)ac->ac_g_ex.fe_group, + (unsigned long)ac->ac_g_ex.fe_start, + (unsigned long)ac->ac_g_ex.fe_len, + (unsigned long)ac->ac_g_ex.fe_logical, + (unsigned long)ac->ac_b_ex.fe_group, + (unsigned long)ac->ac_b_ex.fe_start, + (unsigned long)ac->ac_b_ex.fe_len, + (unsigned long)ac->ac_b_ex.fe_logical, + (int)ac->ac_criteria); + ext4_msg(sb, KERN_ERR, "%d found", ac->ac_found); + ext4_mb_show_pa(sb); +} #else +static inline void ext4_mb_show_pa(struct super_block *sb) +{ + return; +} static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { + ext4_mb_show_pa(ac->ac_sb); return; } #endif From bbc4ec77e9f9c7ac71aee15c6adbd1674fe66c60 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:43 +0530 Subject: [PATCH 23/58] ext4: mballoc: add more mb_debug() msgs This patch adds some more debugging mb_debug() msgs to help improve mballoc code debugging. Other than adding more mb_debug() msgs at few more places, there should be no other functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/5fc8e7788b924e211fcfa4a4c1d2f8503511661a.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d1464d9110ef..2e4697e7b945 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2108,7 +2108,7 @@ static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t ngroups, group, i; - int cr; + int cr = -1; int err = 0, first_err = 0; struct ext4_sb_info *sbi; struct super_block *sb; @@ -2260,6 +2260,10 @@ repeat: out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; + + mb_debug(1, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", + ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, + ac->ac_flags, cr, err); return err; } @@ -3918,7 +3922,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, mb_debug(1, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) - return 0; + goto out_dbg; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { @@ -3926,7 +3930,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); - return 0; + goto out_dbg; } err = ext4_mb_load_buddy(sb, group, &e4b); @@ -3934,7 +3938,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_warning(sb, "Error %d loading buddy information for %u", err, group); put_bh(bitmap_bh); - return 0; + goto out_dbg; } if (needed == 0) @@ -3979,6 +3983,8 @@ repeat: /* found anything to free? */ if (list_empty(&list)) { BUG_ON(free != 0); + mb_debug(1, "Someone else may have freed PA for this group %u\n", + group); goto out; } @@ -4003,6 +4009,9 @@ out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); +out_dbg: + mb_debug(1, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", + free, group, grp->bb_free); return free; } @@ -4538,6 +4547,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ar->len = ar->len >> 1; } if (!ar->len) { + ext4_mb_show_pa(sb); *errp = -ENOSPC; return 0; } From 36bad4233cc50e29022c1095666935cee1c978ad Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:44 +0530 Subject: [PATCH 24/58] ext4: mballoc: correct the mb_debug() format specifier for pa_len var pa->pa_len is an integer. Fix all of the format specifier used in mb_debug() for pa_len to %d instead of %u. As such no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/af4987f643c586f62bcc9961e43f0a67151d5551.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2e4697e7b945..49de715d04f9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3720,7 +3720,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; - mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); @@ -3780,7 +3780,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; - mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); @@ -3862,10 +3862,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, } if (free != pa->pa_free) { ext4_msg(e4b->bd_sb, KERN_CRIT, - "pa %p: logic %lu, phys. %lu, len %lu", + "pa %p: logic %lu, phys. %lu, len %d", pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); + pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* @@ -4152,7 +4152,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); - printk(KERN_ERR "PA:%u:%d:%u \n", i, + printk(KERN_ERR "PA:%u:%d:%d \n", i, start, pa->pa_len); } ext4_unlock_group(sb, i); From 004379d0b02ab8e8efbee1f2f878d5f578bed72c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:45 +0530 Subject: [PATCH 25/58] ext4: mballoc: fix few other format specifier in mb_debug() Fix few other format specifiers in mb_debug() msgs. As such no other functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/574fa7f833abf2dbf3b53a2fea3195e71f6cdbd8.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 49de715d04f9..4ada63cf425f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3280,8 +3280,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, - (unsigned) orig_size, (unsigned) start); + mb_debug(1, "goal: %lld(was %lld) blocks at %u\n", size, orig_size, + start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) @@ -3370,7 +3370,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); + mb_debug(1, "use %llu/%d from inode pa %p\n", start, len, pa); } /* @@ -3577,7 +3577,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_set_bits(bitmap, start, len); preallocated += len; } - mb_debug(1, "preallocated %u for group %u\n", preallocated, group); + mb_debug(1, "preallocated %d for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -4173,7 +4173,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) ext4_msg(sb, KERN_ERR, "Can't allocate:" " Allocation context details:"); - ext4_msg(sb, KERN_ERR, "status %d flags %d", + ext4_msg(sb, KERN_ERR, "status %u flags 0x%x", ac->ac_status, ac->ac_flags); ext4_msg(sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " @@ -4191,7 +4191,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(sb, KERN_ERR, "%d found", ac->ac_found); + ext4_msg(sb, KERN_ERR, "%u found", ac->ac_found); ext4_mb_show_pa(sb); } #else From f283529abac45d8c2b4d4b69d356cca9e6a2de43 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:46 +0530 Subject: [PATCH 26/58] ext4: mballoc: simplify error handling in ext4_init_mballoc() This patch simplifies error handling logic in ext4_init_mballoc(), by adding all the cleanups at one place at the end of that function. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/8621a7bc68f7107a9ac4292afeb784515333bd25.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4ada63cf425f..aaf43c6c08e1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2913,23 +2913,26 @@ int __init ext4_init_mballoc(void) ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) - return -ENOMEM; + goto out; ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, SLAB_RECLAIM_ACCOUNT); - if (ext4_ac_cachep == NULL) { - kmem_cache_destroy(ext4_pspace_cachep); - return -ENOMEM; - } + if (ext4_ac_cachep == NULL) + goto out_pa_free; ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT); - if (ext4_free_data_cachep == NULL) { - kmem_cache_destroy(ext4_pspace_cachep); - kmem_cache_destroy(ext4_ac_cachep); - return -ENOMEM; - } + if (ext4_free_data_cachep == NULL) + goto out_ac_free; + return 0; + +out_ac_free: + kmem_cache_destroy(ext4_ac_cachep); +out_pa_free: + kmem_cache_destroy(ext4_pspace_cachep); +out: + return -ENOMEM; } void ext4_exit_mballoc(void) From 4fca8f07790a62c2b3da028ae423cf4d71c1bacd Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:47 +0530 Subject: [PATCH 27/58] ext4: mballoc: make ext4_mb_use_preallocated() return type as bool Change return type of function ext4_mb_use_preallocated() to bool to better reflect what this function can return. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/7880cb6ef911465beafefcd7e9c3ea214688744b.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index aaf43c6c08e1..262a53f1d283 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3432,7 +3432,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, /* * search goal blocks in preallocated space */ -static noinline_for_stack int +static noinline_for_stack bool ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); @@ -3444,7 +3444,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) - return 0; + return false; /* first, try per-file preallocation */ rcu_read_lock(); @@ -3471,7 +3471,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) spin_unlock(&pa->pa_lock); ac->ac_criteria = 10; rcu_read_unlock(); - return 1; + return true; } spin_unlock(&pa->pa_lock); } @@ -3479,12 +3479,12 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) - return 0; + return false; /* inode may have no locality group for some reason */ lg = ac->ac_lg; if (lg == NULL) - return 0; + return false; order = fls(ac->ac_o_ex.fe_len) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ @@ -3513,9 +3513,9 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) if (cpa) { ext4_mb_use_group_pa(ac, cpa); ac->ac_criteria = 20; - return 1; + return true; } - return 0; + return false; } /* From a345021553f7e6343b05b1ad1c25ed931140b47c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:48 +0530 Subject: [PATCH 28/58] ext4: mballoc: refactor code inside DOUBLE_CHECK into separate function This patch implemets mb_group_bb_bitmap_alloc() and mb_group_bb_bitmap_free() function to remove #ifdef DOUBLE_CHECK macro and it's related code from inside ext4_mb_add_groupinfo()/ext4_mb_release(). There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/8c2095d74b779f0254a19b24982490dc6f07c4f9.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 50 +++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 262a53f1d283..3555e72f149c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -511,6 +511,26 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) } } +static void mb_group_bb_bitmap_alloc(struct super_block *sb, + struct ext4_group_info *grp, ext4_group_t group) +{ + struct buffer_head *bh; + + grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); + BUG_ON(grp->bb_bitmap == NULL); + + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(IS_ERR_OR_NULL(bh)); + + memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); + put_bh(bh); +} + +static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) +{ + kfree(grp->bb_bitmap); +} + #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) @@ -526,6 +546,17 @@ static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } + +static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, + struct ext4_group_info *grp, ext4_group_t group) +{ + return; +} + +static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) +{ + return; +} #endif #ifdef AGGRESSIVE_CHECK @@ -2456,20 +2487,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i]->bb_free_root = RB_ROOT; meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[i]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_NOFS); - BUG_ON(meta_group_info[i]->bb_bitmap == NULL); - bh = ext4_read_block_bitmap(sb, group); - BUG_ON(IS_ERR_OR_NULL(bh)); - memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, - sb->s_blocksize); - put_bh(bh); - } -#endif - + mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; exit_group_info: @@ -2736,9 +2754,7 @@ int ext4_mb_release(struct super_block *sb) for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); -#ifdef DOUBLE_CHECK - kfree(grinfo->bb_bitmap); -#endif + mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); ext4_mb_cleanup_pa(grinfo); ext4_unlock_group(sb, i); From eb2b8ebb867fa895d5c4768310998bc940f6506c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:49 +0530 Subject: [PATCH 29/58] ext4: mballoc: fix possible NULL ptr & remove BUG_ONs from DOUBLE_CHECK Make sure to check for e4b->bd_info->bb_bitmap == NULL, in mb_cmp_bitmaps() and return if NULL, to avoid possible NULL ptr dereference. Similar to how we do this in other ifdef DOUBLE_CHECK functions. Also remove the BUG_ON() logic if kmalloc() or ext4_read_block_bitmap() fails. We should simply mark grp->bb_bitmap as NULL if above happens. In fact ext4_read_block_bitmap() may even return an error in case of resize ioctl. Hence remove this BUG_ON logic (fstests ext4/032 may trigger this). Link: https://lore.kernel.org/r/9a54f8a696ff17c057cd571be3d15ac3ec1407f1.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3555e72f149c..c713d06e70b7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -493,6 +493,8 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; @@ -517,10 +519,15 @@ static void mb_group_bb_bitmap_alloc(struct super_block *sb, struct buffer_head *bh; grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); - BUG_ON(grp->bb_bitmap == NULL); + if (!grp->bb_bitmap) + return; bh = ext4_read_block_bitmap(sb, group); - BUG_ON(IS_ERR_OR_NULL(bh)); + if (IS_ERR_OR_NULL(bh)) { + kfree(grp->bb_bitmap); + grp->bb_bitmap = NULL; + return; + } memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); From 9bee5779ee26d6debc84e0f1e4e54daa93f13ebc Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:50 +0530 Subject: [PATCH 30/58] ext4: balloc: use task_pid_nr() helper Use task_pid_nr() function instead of current->pid. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/4b58403e15e9c8deb34a1b93deb3fc9cd153ab84.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a32e5f7b5385..1ba46d87cdf1 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -903,10 +903,11 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) return bg_start; if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * + colour = (task_pid_nr(current) % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); + colour = (task_pid_nr(current) % 16) * + ((last_block - bg_start) / 16); return bg_start + colour; } From 6db074618969dc6fac4978e8043945fd440b310a Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:51 +0530 Subject: [PATCH 31/58] ext4: use BIT() macro for BH_** state bits Simply use BIT() macro for all BH_** state bits instead of open coding it. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/57667689f51a3f9dba2fcef7d3425187fa3ba69f.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 8 ++++---- fs/ext4/inode.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5d901bf92ce9..89cac4e32018 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -171,10 +171,10 @@ struct ext4_allocation_request { * well as to store the information returned by ext4_map_blocks(). It * takes less room on the stack than a struct buffer_head. */ -#define EXT4_MAP_NEW (1 << BH_New) -#define EXT4_MAP_MAPPED (1 << BH_Mapped) -#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) -#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_NEW BIT(BH_New) +#define EXT4_MAP_MAPPED BIT(BH_Mapped) +#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) +#define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 456e8a6b4809..043ee7efce5f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2084,7 +2084,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) return err; } -#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) +#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... @@ -2364,7 +2364,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (map->m_flags & (1 << BH_Delay)) + if (map->m_flags & BIT(BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); From ec8c60be96d6de74be601fbca56342efb9a1e039 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:52 +0530 Subject: [PATCH 32/58] ext4: improve ext_debug() msg in case of block allocation failure ext4_map_blocks() has ext_debug msg early at the start of function. We also get ext_debug msg if we could allocate a block from ext4_ext_map_blocks(). But there is no ext_debug() msg in case of block allocation failure. So add one along with error code. Also add more info in ext_debug() msg like how many blocks were allocated v/s how many were requested in ext4_ext_map_blocks(). Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/1610ec2aa932396be00f9d552fe29da473ead176.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 4 ++-- fs/ext4/inode.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ff7eeb5a77ef..7b4b0c0110ac 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4246,10 +4246,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; - ext_debug("allocate new block: goal %llu, found %llu/%u\n", - ar.goal, newblock, allocated); allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; + ext_debug("allocate new block: goal %llu, found %llu/%u, requested %u\n", + ar.goal, newblock, ar.len, allocated); if (ar.len > allocated) ar.len = allocated; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 043ee7efce5f..0a52f98512d7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -732,6 +732,10 @@ out_sem: return ret; } } + + if (retval < 0) + ext_debug("failed for inode %lu with err %d\n", + inode->i_ino, retval); return retval; } From 8ec2d31b27f683e3deeaf5d562b534a695052de3 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:53 +0530 Subject: [PATCH 33/58] ext4: replace EXT_DEBUG with __maybe_unused in ext4_ext_handle_unwritten_extents() Replace EXT_DEBUG with __maybe_unused from inside ext4_ext_handle_unwritten_extents() function. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/ae335b94506cd9db9d2648c1f4dd25a80f9f3ce2.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7b4b0c0110ac..2f711cc3cdce 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3813,9 +3813,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { -#ifdef EXT_DEBUG - struct ext4_ext_path *path = *ppath; -#endif + struct ext4_ext_path __maybe_unused *path = *ppath; int ret = 0; int err = 0; From d3df14535f4a5b5af58ef12b4263202df3155356 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:54 +0530 Subject: [PATCH 34/58] ext4: mballoc: make mb_debug() implementation to use pr_debug() mb_debug() msg had only 1 control level for all type of msgs. And if we enable mballoc_debug then all of those msgs would be enabled. Instead of adding multiple debug levels for mb_debug() msgs, use pr_debug() with which we could have finer control to print msgs at all of different levels (i.e. at file, func, line no.). Also add process name/pid, superblk id, and other info in mb_debug() msg. This also kills the mballoc_debug module parameter, since it is not needed any more. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/f0c660cbde9e2edbe95c67942ca9ad80dd2231eb.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/Kconfig | 3 +- fs/ext4/mballoc.c | 104 +++++++++++++++++++++------------------------- fs/ext4/mballoc.h | 16 +++---- 3 files changed, 55 insertions(+), 68 deletions(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 2a592e38cdfe..02376ddb0cb5 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -99,8 +99,7 @@ config EXT4_DEBUG Enables run-time debugging support for the ext4 filesystem. If you select Y here, then you will be able to turn on debugging - with a command such as: - echo 1 > /sys/module/ext4/parameters/mballoc_debug + using dynamic debug control for mb_debug() msgs. config EXT4_KUNIT_TESTS tristate "KUnit tests for ext4" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c713d06e70b7..33a69424942c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -18,13 +18,6 @@ #include #include -#ifdef CONFIG_EXT4_DEBUG -ushort ext4_mballoc_debug __read_mostly; - -module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); -MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); -#endif - /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -858,14 +851,14 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) char *bitmap; struct ext4_group_info *grinfo; - mb_debug(1, "init page %lu\n", page->index); - inode = page->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; + mb_debug(sb, "init page %lu\n", page->index); + groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) groups_per_page = 1; @@ -905,7 +898,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) bh[i] = NULL; goto out; } - mb_debug(1, "read bitmap for group %u\n", group); + mb_debug(sb, "read bitmap for group %u\n", group); } /* wait for I/O completion */ @@ -950,7 +943,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug(1, "put buddy for group %u in page %lu/%x\n", + mb_debug(sb, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); @@ -970,7 +963,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug(1, "put bitmap for group %u in page %lu/%x\n", + mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); @@ -1076,7 +1069,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) int ret = 0; might_sleep(); - mb_debug(1, "init group %u\n", group); + mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); /* * This ensures that we don't reinit the buddy cache @@ -1148,7 +1141,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct inode *inode = sbi->s_buddy_cache; might_sleep(); - mb_debug(1, "load group %u\n", group); + mb_debug(sb, "load group %u\n", group); blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); @@ -2299,7 +2292,7 @@ out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; - mb_debug(1, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", + mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, ac->ac_flags, cr, err); return err; @@ -2731,7 +2724,7 @@ out: } /* need to called with the ext4 group lock held */ -static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) +static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; struct list_head *cur, *tmp; @@ -2743,9 +2736,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) count++; kmem_cache_free(ext4_pspace_cachep, pa); } - if (count) - mb_debug(1, "mballoc: %u PAs left\n", count); - + return count; } int ext4_mb_release(struct super_block *sb) @@ -2756,6 +2747,7 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo, ***group_info; struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + int count; if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { @@ -2763,7 +2755,10 @@ int ext4_mb_release(struct super_block *sb) grinfo = ext4_get_group_info(sb, i); mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); - ext4_mb_cleanup_pa(grinfo); + count = ext4_mb_cleanup_pa(grinfo); + if (count) + mb_debug(sb, "mballoc: %d PAs left\n", + count); ext4_unlock_group(sb, i); kmem_cache_free(cachep, grinfo); } @@ -2836,7 +2831,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, struct ext4_group_info *db; int err, count = 0, count2 = 0; - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); @@ -2876,7 +2871,8 @@ static void ext4_free_data_in_buddy(struct super_block *sb, kmem_cache_free(ext4_free_data_cachep, entry); ext4_mb_unload_buddy(&e4b); - mb_debug(1, "freed %u blocks in %u structures\n", count, count2); + mb_debug(sb, "freed %d blocks in %d structures\n", count, + count2); } /* @@ -3107,8 +3103,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) BUG_ON(lg == NULL); ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; - mb_debug(1, "#%u: goal %u blocks for locality group\n", - current->pid, ac->ac_g_ex.fe_len); + mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); } /* @@ -3306,8 +3301,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - mb_debug(1, "goal: %lld(was %lld) blocks at %u\n", size, orig_size, - start); + mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, + orig_size, start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) @@ -3396,7 +3391,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug(1, "use %llu/%d from inode pa %p\n", start, len, pa); + mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); } /* @@ -3420,7 +3415,8 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ - mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); + mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", + pa->pa_lstart-len, len, pa); } /* @@ -3603,7 +3599,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_set_bits(bitmap, start, len); preallocated += len; } - mb_debug(1, "preallocated %d for group %u\n", preallocated, group); + mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -3746,8 +3742,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; - mb_debug(1, "new inode pa %p: %llu/%d for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, + pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); @@ -3806,8 +3802,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; - mb_debug(1, "new group pa %p: %llu/%d for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, + pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); @@ -3874,7 +3870,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - mb_debug(1, " free preallocated %u/%u in group %u\n", + mb_debug(sb, "free preallocated %u/%u in group %u\n", (unsigned) ext4_group_first_block_no(sb, group) + bit, (unsigned) next - bit, (unsigned) group); free += next - bit; @@ -3945,8 +3941,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, int busy = 0; int free = 0; - mb_debug(1, "discard preallocation for group %u\n", group); - + mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) goto out_dbg; @@ -4009,7 +4004,7 @@ repeat: /* found anything to free? */ if (list_empty(&list)) { BUG_ON(free != 0); - mb_debug(1, "Someone else may have freed PA for this group %u\n", + mb_debug(sb, "Someone else may have freed PA for this group %u\n", group); goto out; } @@ -4036,7 +4031,7 @@ out: ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); out_dbg: - mb_debug(1, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", + mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", free, group, grp->bb_free); return free; } @@ -4066,7 +4061,8 @@ void ext4_discard_preallocations(struct inode *inode) return; } - mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); + mb_debug(sb, "discard preallocation for inode %lu\n", + inode->i_ino); trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); @@ -4159,12 +4155,11 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (!ext4_mballoc_debug || - (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) return; ngroups = ext4_get_groups_count(sb); - ext4_msg(sb, KERN_ERR, "groups: "); + mb_debug(sb, "groups: "); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; @@ -4178,30 +4173,27 @@ static inline void ext4_mb_show_pa(struct super_block *sb) ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); - printk(KERN_ERR "PA:%u:%d:%d \n", i, - start, pa->pa_len); + mb_debug(sb, "PA:%u:%d:%d\n", i, start, + pa->pa_len); } ext4_unlock_group(sb, i); - - printk(KERN_ERR "%u: %d/%d \n", - i, grp->bb_free, grp->bb_fragments); + mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, + grp->bb_fragments); } - printk(KERN_ERR "\n"); } static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (!ext4_mballoc_debug || - (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) return; - ext4_msg(sb, KERN_ERR, "Can't allocate:" + mb_debug(sb, "Can't allocate:" " Allocation context details:"); - ext4_msg(sb, KERN_ERR, "status %u flags 0x%x", + mb_debug(sb, "status %u flags 0x%x", ac->ac_status, ac->ac_flags); - ext4_msg(sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " + mb_debug(sb, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, @@ -4217,7 +4209,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(sb, KERN_ERR, "%u found", ac->ac_found); + mb_debug(sb, "%u found", ac->ac_found); ext4_mb_show_pa(sb); } #else @@ -4330,7 +4322,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); - mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, @@ -4351,7 +4343,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, struct list_head discard_list; struct ext4_prealloc_space *pa, *tmp; - mb_debug(1, "discard locality group preallocation\n"); + mb_debug(sb, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 88c98f17e3d9..6b4d17c2935d 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -24,19 +24,15 @@ #include "ext4.h" /* + * mb_debug() dynamic printk msgs could be used to debug mballoc code. */ #ifdef CONFIG_EXT4_DEBUG -extern ushort ext4_mballoc_debug; - -#define mb_debug(n, fmt, ...) \ -do { \ - if ((n) <= ext4_mballoc_debug) { \ - printk(KERN_DEBUG "(%s, %d): %s: " fmt, \ - __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ - } \ -} while (0) +#define mb_debug(sb, fmt, ...) \ + pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \ + current->comm, task_pid_nr(current), sb->s_id, \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) #else -#define mb_debug(n, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ From 70aa1554b01474ab08d08e5a18b0215a7ff1e8dc Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 10 May 2020 11:54:55 +0530 Subject: [PATCH 35/58] ext4: make ext_debug() implementation to use pr_debug() ext_debug() msgs could be helpful, provided those could be enabled without recompiling kernel and also if we could selectively enable only required prints for case by case debugging. So make ext_debug() implementation use pr_debug(). Also change ext_debug() to be defined with CONFIG_EXT4_DEBUG. So EXT_DEBUG macro now mostly remain for below 3 functions. ext4_ext_show_path/leaf/move() (whose print msgs use ext_debug() which again could be dynamically enabled using pr_debug()) This also changes the ext_debug() to take inode as a parameter to add inode no. in all of it's msgs. Prints additional info like process name / pid, superblock id etc. This also removes any explicit function names passed in ext_debug(). Since ext_debug() on it's own prints file, func and line no. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/d31dc189b0aeda9384fe7665e36da7cd8c61571f.1589086800.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/Kconfig | 2 +- fs/ext4/ext4.h | 20 +++++-- fs/ext4/extents.c | 144 ++++++++++++++++++++++------------------------ fs/ext4/inode.c | 11 ++-- 4 files changed, 88 insertions(+), 89 deletions(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 02376ddb0cb5..cf9e430514c4 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -99,7 +99,7 @@ config EXT4_DEBUG Enables run-time debugging support for the ext4 filesystem. If you select Y here, then you will be able to turn on debugging - using dynamic debug control for mb_debug() msgs. + using dynamic debug control for mb_debug() / ext_debug() msgs. config EXT4_KUNIT_TESTS tristate "KUnit tests for ext4" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 89cac4e32018..80866f124b9a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -80,14 +80,22 @@ #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -/* - * Turn on EXT_DEBUG to get lots of info about extents operations. - */ + /* + * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c + */ #define EXT_DEBUG__ -#ifdef EXT_DEBUG -#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) + +/* + * Dynamic printk for controlled extents debugging. + */ +#ifdef CONFIG_EXT4_DEBUG +#define ext_debug(ino, fmt, ...) \ + pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \ + current->comm, task_pid_nr(current), \ + ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ + __func__, ##__VA_ARGS__) #else -#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* data type for block offset of block group */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2f711cc3cdce..969f4c030cf0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -607,22 +607,22 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { int k, l = path->p_depth; - ext_debug("path:"); + ext_debug(inode, "path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { - ext_debug(" %d->%llu", + ext_debug(inode, " %d->%llu", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { - ext_debug(" %d:[%d]%d:%llu ", + ext_debug(inode, " %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else - ext_debug(" []"); + ext_debug(inode, " []"); } - ext_debug("\n"); + ext_debug(inode, "\n"); } static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) @@ -638,14 +638,14 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); - ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + ext_debug(inode, "Displaying leaf extents\n"); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { - ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } - ext_debug("\n"); + ext_debug(inode, "\n"); } static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, @@ -658,10 +658,9 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent_idx *idx; idx = path[level].p_idx; while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { - ext_debug("%d: move %d:%llu in new index %llu\n", level, - le32_to_cpu(idx->ei_block), - ext4_idx_pblock(idx), - newblock); + ext_debug(inode, "%d: move %d:%llu in new index %llu\n", + level, le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), newblock); idx++; } @@ -670,7 +669,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_is_unwritten(ex), @@ -714,7 +713,7 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_extent_idx *r, *l, *m; - ext_debug("binsearch for %u(idx): ", block); + ext_debug(inode, "binsearch for %u(idx): ", block); l = EXT_FIRST_INDEX(eh) + 1; r = EXT_LAST_INDEX(eh); @@ -724,13 +723,13 @@ ext4_ext_binsearch_idx(struct inode *inode, r = m - 1; else l = m + 1; - ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), - m, le32_to_cpu(m->ei_block), - r, le32_to_cpu(r->ei_block)); + ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, + le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), + r, le32_to_cpu(r->ei_block)); } path->p_idx = l - 1; - ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH @@ -781,7 +780,7 @@ ext4_ext_binsearch(struct inode *inode, return; } - ext_debug("binsearch for %u: ", block); + ext_debug(inode, "binsearch for %u: ", block); l = EXT_FIRST_EXTENT(eh) + 1; r = EXT_LAST_EXTENT(eh); @@ -792,13 +791,13 @@ ext4_ext_binsearch(struct inode *inode, r = m - 1; else l = m + 1; - ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), - m, le32_to_cpu(m->ee_block), - r, le32_to_cpu(r->ee_block)); + ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, + le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), + r, le32_to_cpu(r->ee_block)); } path->p_ext = l - 1; - ext_debug(" -> %d:%llu:[%d]%d ", + ext_debug(inode, " -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), ext4_ext_is_unwritten(path->p_ext), @@ -881,7 +880,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { - ext_debug("depth %d: num %d, max %d\n", + ext_debug(inode, "depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); @@ -958,18 +957,20 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ - ext_debug("insert new index %d after: %llu\n", logical, ptr); + ext_debug(inode, "insert new index %d after: %llu\n", + logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ - ext_debug("insert new index %d before: %llu\n", logical, ptr); + ext_debug(inode, "insert new index %d before: %llu\n", + logical, ptr); ix = curp->p_idx; } len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { - ext_debug("insert new index %d: " + ext_debug(inode, "insert new index %d: " "move %d indices from 0x%p to 0x%p\n", logical, len, ix, ix + 1); memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); @@ -1036,12 +1037,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; - ext_debug("leaf will be split." + ext_debug(inode, "leaf will be split." " next leaf starts at %d\n", le32_to_cpu(border)); } else { border = newext->ee_block; - ext_debug("leaf will be added." + ext_debug(inode, "leaf will be added." " next leaf starts at %d\n", le32_to_cpu(border)); } @@ -1063,7 +1064,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, return -ENOMEM; /* allocate all needed blocks */ - ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); + ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err, flags); @@ -1149,7 +1150,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, goto cleanup; } if (k) - ext_debug("create %d intermediate indices\n", k); + ext_debug(inode, "create %d intermediate indices\n", k); /* insert new index into current index block */ /* current depth stored in i var */ i = depth - 1; @@ -1176,7 +1177,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); - ext_debug("int.index at %d (block %llu): %u -> %llu\n", + ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); /* move remainder of path[i] to the new index block */ @@ -1190,7 +1191,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } /* start copy indexes */ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; - ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); ext4_ext_show_move(inode, path, newblock, i); if (m) { @@ -1327,7 +1328,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, EXT_FIRST_INDEX(neh)->ei_block = EXT_FIRST_EXTENT(neh)->ee_block; } - ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", + ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); @@ -1969,7 +1970,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* Try to append newex to the ex */ if (ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %u:[%d]%d" + ext_debug(inode, "append [%d]%d block to %u:[%d]%d" "(from %llu)\n", ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), @@ -1994,7 +1995,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, prepend: /* Try to prepend newex to the ex */ if (ext4_can_extents_be_merged(inode, newext, ex)) { - ext_debug("prepend %u[%d]%d block to %u:[%d]%d" + ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), ext4_ext_is_unwritten(newext), @@ -2032,7 +2033,7 @@ prepend: if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { - ext_debug("next leaf block - %u\n", next); + ext_debug(inode, "next leaf block - %u\n", next); BUG_ON(npath != NULL); npath = ext4_find_extent(inode, next, NULL, gb_flags); if (IS_ERR(npath)) @@ -2040,12 +2041,12 @@ prepend: BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { - ext_debug("next leaf isn't full(%d)\n", + ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; goto has_space; } - ext_debug("next leaf has no free space(%d,%d)\n", + ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); } @@ -2071,7 +2072,7 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", + ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), @@ -2081,7 +2082,7 @@ has_space: if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ - ext_debug("insert %u:%llu:[%d]%d before: " + ext_debug(inode, "insert %u:%llu:[%d]%d before: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), @@ -2092,7 +2093,7 @@ has_space: } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); - ext_debug("insert %u:%llu:[%d]%d after: " + ext_debug(inode, "insert %u:%llu:[%d]%d after: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), @@ -2102,7 +2103,7 @@ has_space: } len = EXT_LAST_EXTENT(eh) - nearex + 1; if (len > 0) { - ext_debug("insert %u:%llu:[%d]%d: " + ext_debug(inode, "insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), @@ -2246,7 +2247,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, return; hole_len = min(es.es_lblk - hole_start, hole_len); } - ext_debug(" -> %u:%u\n", hole_start, hole_len); + ext_debug(inode, " -> %u:%u\n", hole_start, hole_len); ext4_es_insert_extent(inode, hole_start, hole_len, ~0, EXTENT_STATUS_HOLE); } @@ -2283,7 +2284,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, err = ext4_ext_dirty(handle, inode, path); if (err) return err; - ext_debug("index is empty, remove it, free block %llu\n", leaf); + ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); trace_ext4_ext_rm_idx(inode, leaf); ext4_free_blocks(handle, inode, NULL, leaf, 1, @@ -2562,7 +2563,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ - ext_debug("truncate since %u in leaf to %u\n", start, end); + ext_debug(inode, "truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; @@ -2588,7 +2589,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, else unwritten = 0; - ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block, unwritten, ex_ee_len); path[depth].p_ext = ex; @@ -2596,7 +2597,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, b = ex_ee_block+ex_ee_len - 1 < end ? ex_ee_block+ex_ee_len - 1 : end; - ext_debug(" border %u:%u\n", a, b); + ext_debug(inode, " border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { @@ -2705,7 +2706,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, + ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2782,7 +2783,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, partial.lblk = 0; partial.state = initial; - ext_debug("truncate since %u to %u\n", start, end); + ext_debug(inode, "truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, @@ -2924,7 +2925,7 @@ again: /* this is index block */ if (!path[i].p_hdr) { - ext_debug("initialize header\n"); + ext_debug(inode, "initialize header\n"); path[i].p_hdr = ext_block_hdr(path[i].p_bh); } @@ -2932,7 +2933,7 @@ again: /* this level hasn't been touched yet */ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; - ext_debug("init index ptr: hdr 0x%p, num %d\n", + ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", path[i].p_hdr, le16_to_cpu(path[i].p_hdr->eh_entries)); } else { @@ -2940,13 +2941,13 @@ again: path[i].p_idx--; } - ext_debug("level %d - index, first 0x%p, cur 0x%p\n", + ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", i, EXT_FIRST_INDEX(path[i].p_hdr), path[i].p_idx); if (ext4_ext_more_to_rm(path + i)) { struct buffer_head *bh; /* go to the next level */ - ext_debug("move to level %d (block %llu)\n", + ext_debug(inode, "move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, @@ -2982,7 +2983,7 @@ again: brelse(path[i].p_bh); path[i].p_bh = NULL; i--; - ext_debug("return to level %d\n", i); + ext_debug(inode, "return to level %d\n", i); } } @@ -3150,8 +3151,7 @@ static int ext4_split_extent_at(handle_t *handle, BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); - ext_debug("ext4_split_extents_at: inode %lu, logical" - "block %llu\n", inode->i_ino, (unsigned long long)split); + ext_debug(inode, "logical block %llu\n", (unsigned long long)split); ext4_ext_show_leaf(inode, path); @@ -3388,9 +3388,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; - ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map_len); + ext_debug(inode, "logical block %llu, max_blocks %u\n", + (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) @@ -3642,8 +3641,7 @@ static int ext4_split_convert_extents(handle_t *handle, unsigned int ee_len; int split_flag = 0, depth; - ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", - __func__, inode->i_ino, + ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) @@ -3689,8 +3687,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, + ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); /* If extent is larger than requested it is a clear sign that we still @@ -3760,8 +3757,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - ext_debug("%s: inode %lu, logical" - "block %llu, max_blocks %u\n", __func__, inode->i_ino, + ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { @@ -3817,10 +3813,9 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, int ret = 0; int err = 0; - ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " - "block %llu, max_blocks %u, flags %x, allocated %u\n", - inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, - flags, allocated); + ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", + (unsigned long long)map->m_lblk, map->m_len, flags, + allocated); ext4_ext_show_leaf(inode, path); /* @@ -4057,8 +4052,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_allocation_request ar; ext4_lblk_t cluster_offset; - ext_debug("blocks %u/%u requested for inode %lu\n", - map->m_lblk, map->m_len, inode->i_ino); + ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ @@ -4105,8 +4099,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); - ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, - ee_block, ee_len, newblock); + ext_debug(inode, "%u fit into %u:%d -> %llu\n", + map->m_lblk, ee_block, ee_len, newblock); /* * If the extent is initialized check whether the @@ -4246,7 +4240,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; - ext_debug("allocate new block: goal %llu, found %llu/%u, requested %u\n", + ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", ar.goal, newblock, ar.len, allocated); if (ar.len > allocated) ar.len = allocated; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0a52f98512d7..e7bf9388538b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -501,9 +501,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, #endif map->m_flags = 0; - ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," - "logical block %lu\n", inode->i_ino, flags, map->m_len, - (unsigned long) map->m_lblk); + ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", + flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int @@ -734,8 +733,7 @@ out_sem: } if (retval < 0) - ext_debug("failed for inode %lu with err %d\n", - inode->i_ino, retval); + ext_debug(inode, "failed with err %d\n", retval); return retval; } @@ -1691,8 +1689,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, invalid_block = ~0; map->m_flags = 0; - ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," - "logical block %lu\n", inode->i_ino, map->m_len, + ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ From 8ad8d710035edf8d14f8ecc5fa15f8e1a27ecb59 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Sun, 10 May 2020 11:58:05 -0400 Subject: [PATCH 36/58] ext4: rework map struct instantiation in ext4_ext_map_blocks() The path performing block allocations in ext4_ext_map_blocks() contains code trimming the length of a new extent that is repeated later in the function. This code is both redundant and unnecessary as the exact length of the new extent has already been calculated. Rewrite the instantiation of the map struct in this case to use the available values, avoiding the overhead of unnecessary conversions and improving clarity. Add another map struct instantiation tailored specifically to the separate case for an existing written extent. Remove an old comment that no longer appears applicable to the current code. Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20200510155805.18808-1-enwlinux@gmail.com Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani --- fs/ext4/extents.c | 50 +++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 969f4c030cf0..ce394706c61a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4045,7 +4045,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_ext_path *path = NULL; struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_fsblk_t newblock = 0; + ext4_fsblk_t newblock = 0, pblk; int err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; @@ -4060,7 +4060,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; - goto out2; + goto out; } depth = ext_depth(inode); @@ -4076,7 +4076,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (unsigned long) map->m_lblk, depth, path[depth].p_block); err = -EFSCORRUPTED; - goto out2; + goto out; } ex = path[depth].p_ext; @@ -4110,8 +4110,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { err = convert_initialized_extent(handle, inode, map, &path, &allocated); - goto out2; + goto out; } else if (!ext4_ext_is_unwritten(ex)) { + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + ext4_ext_show_leaf(inode, path); goto out; } @@ -4122,7 +4128,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, err = ret; else allocated = ret; - goto out2; + goto out; } } @@ -4147,7 +4153,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, hole_len); - goto out2; + goto out; } /* @@ -4171,12 +4177,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) - goto out2; + goto out; ar.lright = map->m_lblk; ex2 = NULL; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err) - goto out2; + goto out; /* Check if the extent after searching to the right implies a * cluster we can use. */ @@ -4237,7 +4243,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) - goto out2; + goto out; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", @@ -4247,7 +4253,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, got_allocated_blocks: /* try to insert new extent into found leaf and return */ - ext4_ext_store_pblock(&newex, newblock + offset); + pblk = newblock + offset; + ext4_ext_store_pblock(&newex, pblk); newex.ee_len = cpu_to_le16(ar.len); /* Mark unwritten */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { @@ -4272,16 +4279,9 @@ got_allocated_blocks: EXT4_C2B(sbi, allocated_clusters), fb_flags); } - goto out2; + goto out; } - /* previous routine could use block we allocated */ - newblock = ext4_ext_pblock(&newex); - allocated = ext4_ext_get_actual_len(&newex); - if (allocated > map->m_len) - allocated = map->m_len; - map->m_flags |= EXT4_MAP_NEW; - /* * Reduce the reserved cluster count to reflect successful deferred * allocation of delayed allocated clusters or direct allocation of @@ -4327,14 +4327,14 @@ got_allocated_blocks: ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); -out: - if (allocated > map->m_len) - allocated = map->m_len; + + map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); + map->m_pblk = pblk; + map->m_len = ar.len; + allocated = map->m_len; ext4_ext_show_leaf(inode, path); - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = newblock; - map->m_len = allocated; -out2: + +out: ext4_ext_drop_refs(path); kfree(path); From de8ff14cab998f51a3a289d2b58d6d440782f77e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 May 2020 14:52:52 -0700 Subject: [PATCH 37/58] ext4: add casefold flag to EXT4_INODE_* flags No one currently needs EXT4_INODE_CASEFOLD, but add it to keep the EXT4_INODE_* definitions in sync with the EXT4_*_FL definitions. Also make it clearer that the casefold flag is only for directories. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20200510215252.87833-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 80866f124b9a..af60be906528 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -425,7 +425,7 @@ struct flex_groups { /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ -#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ @@ -498,6 +498,7 @@ enum { /* 22 was formerly EXT4_INODE_EOFBLOCKS */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; @@ -543,6 +544,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(CASEFOLD); CHECK_FLAG_VALUE(RESERVED); } From 53f86b170dfa8d50b8b3fb1c5cf17c33b2327db2 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 20 May 2020 12:10:32 +0530 Subject: [PATCH 38/58] ext4: mballoc: add blocks to PA list under same spinlock after allocating blocks ext4_mb_discard_preallocations() only checks for grp->bb_prealloc_list of every group to discard the group's PA to free up the space if allocation request fails. Consider below race:- Process A Process B 1. allocate blocks 1. Fails block allocation from ext4_mb_regular_allocator() ext4_lock_group() allocated blocks more than ac_o_ex.fe_len ext4_unlock_group() 2. Scans the grp->bb_prealloc_list (under ext4_lock_group()) and find nothing and thus return -ENOSPC. 2. Add the additional blocks to PA list ext4_lock_group() add blocks to grp->bb_prealloc_list ext4_unlock_group() Above race could be avoided if we add those additional blocks to grp->bb_prealloc_list at the same time with block allocation when ext4_lock_group() was still held. With this discard-PA will know if there are actually any blocks which could be freed from the PA Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/a2217dd782585b42328981832e6d396abaaccb80.1589955723.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 97 ++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 35 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 33a69424942c..decc5168d126 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -349,6 +349,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); +static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -1701,6 +1702,14 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, sbi->s_mb_last_start = ac->ac_f_ex.fe_start; spin_unlock(&sbi->s_md_lock); } + /* + * As we've just preallocated more space than + * user requested originally, we store allocated + * space in a special descriptor. + */ + if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + ext4_mb_new_preallocation(ac); + } /* @@ -1949,7 +1958,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); - BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); + BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); @@ -3675,7 +3684,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, /* * creates new preallocated space for given inode */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; @@ -3688,10 +3697,9 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + BUG_ON(ac->ac_pa == NULL); - pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); - if (pa == NULL) - return -ENOMEM; + pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { int winl; @@ -3735,7 +3743,6 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; - atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); @@ -3755,21 +3762,17 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_obj_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; - ext4_lock_group(sb, ac->ac_b_ex.fe_group); list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); spin_lock(pa->pa_obj_lock); list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); spin_unlock(pa->pa_obj_lock); - - return 0; } /* * creates new preallocated space for locality group inodes belongs to */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; @@ -3781,11 +3784,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + BUG_ON(ac->ac_pa == NULL); - BUG_ON(ext4_pspace_cachep == NULL); - pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); - if (pa == NULL) - return -ENOMEM; + pa = ac->ac_pa; /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ @@ -3795,7 +3796,6 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; - atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); @@ -3816,26 +3816,20 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_obj_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; - ext4_lock_group(sb, ac->ac_b_ex.fe_group); list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); /* * We will later add the new pa to the right bucket * after updating the pa_free in ext4_mb_release_context */ - return 0; } -static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) +static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) { - int err; - if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) - err = ext4_mb_new_group_pa(ac); + ext4_mb_new_group_pa(ac); else - err = ext4_mb_new_inode_pa(ac); - return err; + ext4_mb_new_inode_pa(ac); } /* @@ -4150,6 +4144,29 @@ repeat: } } +static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa; + + BUG_ON(ext4_pspace_cachep == NULL); + pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); + if (!pa) + return -ENOMEM; + atomic_set(&pa->pa_count, 1); + ac->ac_pa = pa; + return 0; +} + +static void ext4_mb_pa_free(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + + BUG_ON(!pa); + ac->ac_pa = NULL; + WARN_ON(!atomic_dec_and_test(&pa->pa_count)); + kmem_cache_free(ext4_pspace_cachep, pa); +} + #ifdef CONFIG_EXT4_DEBUG static inline void ext4_mb_show_pa(struct super_block *sb) { @@ -4606,23 +4623,28 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); + + *errp = ext4_mb_pa_alloc(ac); + if (*errp) + goto errout; repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); - if (*errp) - goto discard_and_exit; - - /* as we've just preallocated more space than - * user requested originally, we store allocated - * space in a special descriptor */ - if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) - *errp = ext4_mb_new_preallocation(ac); + /* + * pa allocated above is added to grp->bb_prealloc_list only + * when we were able to allocate some block i.e. when + * ac->ac_status == AC_STATUS_FOUND. + * And error from above mean ac->ac_status != AC_STATUS_FOUND + * So we have to free this pa here itself. + */ if (*errp) { - discard_and_exit: + ext4_mb_pa_free(ac); ext4_discard_allocated_blocks(ac); goto errout; } + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) + ext4_mb_pa_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); @@ -4637,6 +4659,11 @@ repeat: freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) goto repeat; + /* + * If block allocation fails then the pa allocated above + * needs to be freed here itself. + */ + ext4_mb_pa_free(ac); *errp = -ENOSPC; } From cf5e2ca6c99077d128e971149f0c262e808ca831 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 20 May 2020 12:10:33 +0530 Subject: [PATCH 39/58] ext4: mballoc: refactor ext4_mb_discard_preallocations() Implement ext4_mb_discard_preallocations_should_retry() which we will need in later patches to add more logic like check for sequence number match to see if we should retry for block allocation or not. There should be no functionality change in this patch. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/1cfae0098d2aa9afbeb59331401258182868c8f2.1589955723.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index decc5168d126..b75408d72773 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4543,6 +4543,17 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) return freed; } +static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, + struct ext4_allocation_context *ac) +{ + int freed; + + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); + if (freed) + return true; + return false; +} + /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back @@ -4551,7 +4562,6 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { - int freed; struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; @@ -4656,8 +4666,7 @@ repeat: ar->len = ac->ac_b_ex.fe_len; } } else { - freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); - if (freed) + if (ext4_mb_discard_preallocations_should_retry(sb, ac)) goto repeat; /* * If block allocation fails then the pa allocated above From 07b5b8e1ac4004b7db1065a301df65cd434c31c9 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 20 May 2020 12:10:34 +0530 Subject: [PATCH 40/58] ext4: mballoc: introduce pcpu seqcnt for freeing PA to improve ENOSPC handling There could be a race in function ext4_mb_discard_group_preallocations() where the 1st thread may iterate through group's bb_prealloc_list and remove all the PAs and add to function's local list head. Now if the 2nd thread comes in to discard the group preallocations, it will see that the group->bb_prealloc_list is empty and will return 0. Consider for a case where we have less number of groups (for e.g. just group 0), this may even return an -ENOSPC error from ext4_mb_new_blocks() (where we call for ext4_mb_discard_group_preallocations()). But that is wrong, since 2nd thread should have waited for 1st thread to release all the PAs and should have retried for allocation. Since 1st thread was anyway going to discard the PAs. The algorithm using this percpu seq counter goes below: 1. We sample the percpu discard_pa_seq counter before trying for block allocation in ext4_mb_new_blocks(). 2. We increment this percpu discard_pa_seq counter when we either allocate or free these blocks i.e. while marking those blocks as used/free in mb_mark_used()/mb_free_blocks(). 3. We also increment this percpu seq counter when we successfully identify that the bb_prealloc_list is not empty and hence proceed for discarding of those PAs inside ext4_mb_discard_group_preallocations(). Now to make sure that the regular fast path of block allocation is not affected, as a small optimization we only sample the percpu seq counter on that cpu. Only when the block allocation fails and when freed blocks found were 0, that is when we sample percpu seq counter for all cpus using below function ext4_get_discard_pa_seq_sum(). This happens after making sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. It can be well argued that why don't just check for grp->bb_free to see if there are any free blocks to be allocated. So here are the two concerns which were discussed:- 1. If for some reason the blocks available in the group are not appropriate for allocation logic (say for e.g. EXT4_MB_HINT_GOAL_ONLY, although this is not yet implemented), then the retry logic may result into infinte looping since grp->bb_free is non-zero. 2. Also before preallocation was clubbed with block allocation with the same ext4_lock_group() held, there were lot of races where grp->bb_free could not be reliably relied upon. Due to above, this patch considers discard_pa_seq logic to determine if we should retry for block allocation. Say if there are are n threads trying for block allocation and none of those could allocate or discard any of the blocks, then all of those n threads will fail the block allocation and return -ENOSPC error. (Since the seq counter for all of those will match as no block allocation/discard was done during that duration). Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/7f254686903b87c419d798742fd9a1be34f0657b.1589955723.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 56 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b75408d72773..754ff9f65199 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -351,6 +351,35 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); +/* + * The algorithm using this percpu seq counter goes below: + * 1. We sample the percpu discard_pa_seq counter before trying for block + * allocation in ext4_mb_new_blocks(). + * 2. We increment this percpu discard_pa_seq counter when we either allocate + * or free these blocks i.e. while marking those blocks as used/free in + * mb_mark_used()/mb_free_blocks(). + * 3. We also increment this percpu seq counter when we successfully identify + * that the bb_prealloc_list is not empty and hence proceed for discarding + * of those PAs inside ext4_mb_discard_group_preallocations(). + * + * Now to make sure that the regular fast path of block allocation is not + * affected, as a small optimization we only sample the percpu seq counter + * on that cpu. Only when the block allocation fails and when freed blocks + * found were 0, that is when we sample percpu seq counter for all cpus using + * below function ext4_get_discard_pa_seq_sum(). This happens after making + * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. + */ +static DEFINE_PER_CPU(u64, discard_pa_seq); +static inline u64 ext4_get_discard_pa_seq_sum(void) +{ + int __cpu; + u64 __seq = 0; + + for_each_possible_cpu(__cpu) + __seq += per_cpu(discard_pa_seq, __cpu); + return __seq; +} + static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 @@ -1462,6 +1491,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); + this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; @@ -1603,6 +1633,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); + this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; @@ -3962,6 +3993,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, INIT_LIST_HEAD(&list); repeat: ext4_lock_group(sb, group); + this_cpu_inc(discard_pa_seq); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); @@ -4544,14 +4576,26 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) } static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, - struct ext4_allocation_context *ac) + struct ext4_allocation_context *ac, u64 *seq) { int freed; + u64 seq_retry = 0; + bool ret = false; freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); - if (freed) - return true; - return false; + if (freed) { + ret = true; + goto out_dbg; + } + seq_retry = ext4_get_discard_pa_seq_sum(); + if (seq_retry != *seq) { + *seq = seq_retry; + ret = true; + } + +out_dbg: + mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); + return ret; } /* @@ -4568,6 +4612,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + u64 seq; might_sleep(); sb = ar->inode->i_sb; @@ -4630,6 +4675,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; + seq = *this_cpu_ptr(&discard_pa_seq); if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); @@ -4666,7 +4712,7 @@ repeat: ar->len = ac->ac_b_ex.fe_len; } } else { - if (ext4_mb_discard_preallocations_should_retry(sb, ac)) + if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; /* * If block allocation fails then the pa allocated above From 8ef123fe02ca0923b01b57bdf639800a23a2faa8 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 20 May 2020 12:10:35 +0530 Subject: [PATCH 41/58] ext4: mballoc: refactor ext4_mb_good_group() ext4_mb_good_group() definition was changed some time back and now it even initializes the buddy cache (via ext4_mb_init_group()), if in case the EXT4_MB_GRP_NEED_INIT() is true for a group. Note that ext4_mb_init_group() could sleep and so should not be called under a spinlock held. This is fine as of now because ext4_mb_good_group() is called before loading the buddy bitmap without ext4_lock_group() held and again called after loading the bitmap, only this time with ext4_lock_group() held. But still this whole thing is confusing. So this patch refactors out ext4_mb_good_group_nolock() which should be called when without holding ext4_lock_group(). Also in further patches we hold the spinlock (ext4_lock_group()) while doing any calculations which involves grp->bb_free or grp->bb_fragments. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/d9f7d031a5fbe1c943fae6bf1ff5cdf0604ae722.1589955723.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 78 ++++++++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 754ff9f65199..c9297c878a90 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2106,15 +2106,14 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } /* - * This is now called BEFORE we load the buddy bitmap. + * This is also called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable - * for the allocation or not. In addition it can also return negative - * error code when something goes wrong. + * for the allocation or not. */ -static int ext4_mb_good_group(struct ext4_allocation_context *ac, +static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { - unsigned free, fragments; + ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); @@ -2122,23 +2121,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) - return 0; + return false; if (cr <= 2 && free < ac->ac_g_ex.fe_len) - return 0; + return false; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) - return 0; - - /* We only do this if the grp has never been initialized */ - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); - if (ret) - return ret; - } + return false; fragments = grp->bb_fragments; if (fragments == 0) - return 0; + return false; switch (cr) { case 0: @@ -2148,31 +2140,63 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) - return 0; + return false; if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || (free / fragments) >= ac->ac_g_ex.fe_len) - return 1; + return true; if (grp->bb_largest_free_order < ac->ac_2order) - return 0; + return false; - return 1; + return true; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) - return 1; + return true; break; case 2: if (free >= ac->ac_g_ex.fe_len) - return 1; + return true; break; case 3: - return 1; + return true; default: BUG(); } - return 0; + return false; +} + +/* + * This could return negative error code if something goes wrong + * during ext4_mb_init_group(). This should not be called with + * ext4_lock_group() held. + */ +static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, + ext4_group_t group, int cr) +{ + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + ext4_grpblk_t free; + int ret = 0; + + free = grp->bb_free; + if (free == 0) + goto out; + if (cr <= 2 && free < ac->ac_g_ex.fe_len) + goto out; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + goto out; + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); + if (ret) + return ret; + } + + ret = ext4_mb_good_group(ac, group, cr); +out: + return ret; } static noinline_for_stack int @@ -2260,7 +2284,7 @@ repeat: group = 0; /* This now checks without needing the buddy page */ - ret = ext4_mb_good_group(ac, group, cr); + ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!first_err) first_err = ret; @@ -2278,11 +2302,9 @@ repeat: * block group */ ret = ext4_mb_good_group(ac, group, cr); - if (ret <= 0) { + if (ret == 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); - if (!first_err) - first_err = ret; continue; } From 993778306e7901a7286322f25c7c681dd47bede6 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 20 May 2020 12:10:36 +0530 Subject: [PATCH 42/58] ext4: mballoc: use lock for checking free blocks while retrying Currently while doing block allocation grp->bb_free may be getting modified if discard is happening in parallel. For e.g. consider a case where there are lot of threads who have preallocated lot of blocks and there is a thread which is trying to discard all of this group's PA. Now it could happen that we see all of those group's bb_free is zero and fail the allocation while there is sufficient space if we free up all the PA. So this patch adds another flag "EXT4_MB_STRICT_CHECK" which will be set if we are unable to allocate any blocks in the first try (since we may not have considered blocks about to be discarded from PA lists). So during retry attempt to allocate blocks we will use ext4_lock_group() for checking if the group is good or not. Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/9cb740a117c958c36596f167b12af1beae9a68b7.1589955723.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/mballoc.c | 13 ++++++++++++- include/trace/events/ext4.h | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index af60be906528..dbc36e377eb0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -150,6 +150,8 @@ enum SHIFT_DIRECTION { #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 /* Use blocks from reserved pool */ #define EXT4_MB_USE_RESERVED 0x2000 +/* Do strict check for free blocks while retrying block allocation */ +#define EXT4_MB_STRICT_CHECK 0x4000 struct ext4_allocation_request { /* target inode for block we're allocating */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c9297c878a90..a9083113a8c0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2176,9 +2176,13 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + struct super_block *sb = ac->ac_sb; + bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; ext4_grpblk_t free; int ret = 0; + if (should_lock) + ext4_lock_group(sb, group); free = grp->bb_free; if (free == 0) goto out; @@ -2186,6 +2190,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; + if (should_lock) + ext4_unlock_group(sb, group); /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { @@ -2194,8 +2200,12 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, return ret; } + if (should_lock) + ext4_lock_group(sb, group); ret = ext4_mb_good_group(ac, group, cr); out: + if (should_lock) + ext4_unlock_group(sb, group); return ret; } @@ -4610,7 +4620,8 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, goto out_dbg; } seq_retry = ext4_get_discard_pa_seq_sum(); - if (seq_retry != *seq) { + if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { + ac->ac_flags |= EXT4_MB_STRICT_CHECK; *seq = seq_retry; ret = true; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 280475c1cecc..cc41d692ae8e 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -35,7 +35,8 @@ struct partial_cluster; { EXT4_MB_DELALLOC_RESERVED, "DELALLOC_RESV" }, \ { EXT4_MB_STREAM_ALLOC, "STREAM_ALLOC" }, \ { EXT4_MB_USE_ROOT_BLOCKS, "USE_ROOT_BLKS" }, \ - { EXT4_MB_USE_RESERVED, "USE_RESV" }) + { EXT4_MB_USE_RESERVED, "USE_RESV" }, \ + { EXT4_MB_STRICT_CHECK, "STRICT_CHECK" }) #define show_map_flags(flags) __print_flags(flags, "|", \ { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ From dfcd4489e270282d984cd06c00f3a45d52a3f0a7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 20 May 2020 15:31:18 +0200 Subject: [PATCH 43/58] ext4: drop ext4_journal_free_reserved() Remove ext4_journal_free_reserved() function. It is never used. Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20200520133119.1383-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 3bacf76d2609..00dc668e052b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -338,12 +338,6 @@ static inline handle_t *__ext4_journal_start(struct inode *inode, handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type); -static inline void ext4_journal_free_reserved(handle_t *handle) -{ - if (ext4_handle_valid(handle)) - jbd2_journal_free_reserved(handle); -} - static inline handle_t *ext4_journal_current_handle(void) { return journal_current_handle(); From 14ff6286309e2853aed50083c9a83328423fdd8c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 20 May 2020 15:31:19 +0200 Subject: [PATCH 44/58] jbd2: avoid leaking transaction credits when unreserving handle When reserved transaction handle is unused, we subtract its reserved credits in __jbd2_journal_unreserve_handle() called from jbd2_journal_stop(). However this function forgets to remove reserved credits from transaction->t_outstanding_credits and thus the transaction space that was reserved remains effectively leaked. The leaked transaction space can be quite significant in some cases and leads to unnecessarily small transactions and thus reducing throughput of the journalling machinery. E.g. fsmark workload creating lots of 4k files was observed to have about 20% lower throughput due to this when ext4 is mounted with dioread_nolock mount option. Subtract reserved credits from t_outstanding_credits as well. CC: stable@vger.kernel.org Fixes: 8f7d89f36829 ("jbd2: transaction reservation support") Reviewed-by: Andreas Dilger Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200520133119.1383-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3dccc23cf010..e91aad3637a2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -541,17 +541,24 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) } EXPORT_SYMBOL(jbd2_journal_start); -static void __jbd2_journal_unreserve_handle(handle_t *handle) +static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t) { journal_t *journal = handle->h_journal; WARN_ON(!handle->h_reserved); sub_reserved_credits(journal, handle->h_total_credits); + if (t) + atomic_sub(handle->h_total_credits, &t->t_outstanding_credits); } void jbd2_journal_free_reserved(handle_t *handle) { - __jbd2_journal_unreserve_handle(handle); + journal_t *journal = handle->h_journal; + + /* Get j_state_lock to pin running transaction if it exists */ + read_lock(&journal->j_state_lock); + __jbd2_journal_unreserve_handle(handle, journal->j_running_transaction); + read_unlock(&journal->j_state_lock); jbd2_free_handle(handle); } EXPORT_SYMBOL(jbd2_journal_free_reserved); @@ -722,7 +729,8 @@ static void stop_this_handle(handle_t *handle) atomic_sub(handle->h_total_credits, &transaction->t_outstanding_credits); if (handle->h_rsv_handle) - __jbd2_journal_unreserve_handle(handle->h_rsv_handle); + __jbd2_journal_unreserve_handle(handle->h_rsv_handle, + transaction); if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); From 9f364e1d9537d44fdf58deeeddb51277d8327ce5 Mon Sep 17 00:00:00 2001 From: Jonathan Grant Date: Fri, 22 May 2020 16:07:58 +0100 Subject: [PATCH 45/58] add comment for ext4_dir_entry_2 file_type member Signed-off-by: Jonathan Grant Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/ad3290d5-86af-99c1-f9d5-cd1bab710429@jguk.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index dbc36e377eb0..a94e7aaea6e6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2061,7 +2061,7 @@ struct ext4_dir_entry_2 { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ - __u8 file_type; + __u8 file_type; /* See file type macros EXT4_FT_* below */ char name[EXT4_NAME_LEN]; /* File name */ }; From 175efa81feb8405676e0136d97b10380179c92e0 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Tue, 5 May 2020 17:43:14 +0200 Subject: [PATCH 46/58] ext4: fix EXT4_MAX_LOGICAL_BLOCK macro ext4 supports max number of logical blocks in a file to be 0xffffffff. (This is since ext4_extent's ee_block is __le32). This means that EXT4_MAX_LOGICAL_BLOCK should be 0xfffffffe (starting from 0 logical offset). This patch fixes this. The issue was seen when ext4 moved to iomap_fiemap API and when overlayfs was mounted on top of ext4. Since overlayfs was missing filemap_check_ranges(), so it could pass a arbitrary huge length which lead to overflow of map.m_len logic. This patch fixes that. Fixes: d3b6f23f7167 ("ext4: move ext4_fiemap to use iomap framework") Reported-by: syzbot+77fa5bdb65cc39711820@syzkaller.appspotmail.com Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20200505154324.3226743-2-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a94e7aaea6e6..1eb07ca91fca 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -733,7 +733,7 @@ enum { #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* Max logical block we can support */ -#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE /* * Structure of an inode on the disk From 328e24ae14aeb8ef624ec181e0d546b05c34f031 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 May 2020 17:43:15 +0200 Subject: [PATCH 47/58] ext4: fix fiemap size checks for bitmap files Add an extra validation of the len parameter, as for ext4 some files might have smaller file size limits than others. This also means the redundant size check in ext4_ioctl_get_es_cache can go away, as all size checking is done in the shared fiemap handler. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200505154324.3226743-3-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 31 +++++++++++++++++++++++++++++++ fs/ext4/ioctl.c | 33 ++------------------------------- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ce394706c61a..844773d3b64b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4862,6 +4862,28 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { .iomap_begin = ext4_iomap_xattr_begin, }; +static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) +{ + u64 maxbytes; + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + maxbytes = inode->i_sb->s_maxbytes; + else + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + + if (*len == 0) + return -EINVAL; + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (*len > maxbytes || (maxbytes - *len) < start) + *len = maxbytes - start; + return 0; +} + static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, bool from_es_cache) { @@ -4882,6 +4904,15 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) return -EBADR; + /* + * For bitmap files the maximum size limit could be smaller than + * s_maxbytes, so check len here manually instead of just relying on the + * generic check. + */ + error = ext4_fiemap_check_ranges(inode, start, &len); + if (error) + return error; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; error = iomap_fiemap(inode, fieinfo, start, len, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bfc1281fc4cb..0746532ba463 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -733,29 +733,6 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); } -/* copied from fs/ioctl.c */ -static int fiemap_check_ranges(struct super_block *sb, - u64 start, u64 len, u64 *new_len) -{ - u64 maxbytes = (u64) sb->s_maxbytes; - - *new_len = len; - - if (len == 0) - return -EINVAL; - - if (start > maxbytes) - return -EFBIG; - - /* - * Shrink request scope to what the fs can actually handle. - */ - if (len > maxbytes || (maxbytes - len) < start) - *new_len = maxbytes - start; - - return 0; -} - /* So that the fiemap access checks can't overflow on 32 bit machines. */ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) @@ -765,8 +742,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); - struct super_block *sb = inode->i_sb; - u64 len; int error; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) @@ -775,11 +750,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; - error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, - &len); - if (error) - return error; - fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; @@ -792,7 +762,8 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) filemap_write_and_wait(inode->i_mapping); - error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len); + error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, + fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) From 03a5ed24c9b8f0180a59ba7b7b9b9517fcf4335b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:08 +0200 Subject: [PATCH 48/58] ext4: split _ext4_fiemap The fiemap and EXT4_IOC_GET_ES_CACHE cases share almost no code, so split them into entirely separate functions. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200523073016.2944131-2-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 72 +++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 844773d3b64b..9e90f324d75f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4884,11 +4884,9 @@ static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) return 0; } -static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, bool from_es_cache) +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) { - ext4_lblk_t start_blk; - u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR; int error = 0; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { @@ -4898,10 +4896,7 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - if (from_es_cache) - ext4_fiemap_flags &= FIEMAP_FLAG_XATTR; - - if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) + if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)) return -EBADR; /* @@ -4915,40 +4910,20 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; - error = iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_xattr_ops); - } else if (!from_es_cache) { - error = iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_report_ops); - } else { - ext4_lblk_t len_blks; - __u64 last_blk; - - start_blk = start >> inode->i_sb->s_blocksize_bits; - last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; - if (last_blk >= EXT_MAX_BLOCKS) - last_blk = EXT_MAX_BLOCKS-1; - len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; - - /* - * Walk the extent tree gathering extent information - * and pushing extents back to the user. - */ - error = ext4_fill_es_cache_info(inode, start_blk, len_blks, - fieinfo); + return iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); } - return error; -} -int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) -{ - return _ext4_fiemap(inode, fieinfo, start, len, false); + return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { + ext4_lblk_t start_blk, len_blks; + __u64 last_blk; + int error = 0; + if (ext4_has_inline_data(inode)) { int has_inline; @@ -4959,9 +4934,32 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, return 0; } - return _ext4_fiemap(inode, fieinfo, start, len, true); -} + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; + } + if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)) + return -EBADR; + + error = ext4_fiemap_check_ranges(inode, start, &len); + if (error) + return error; + + start_blk = start >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; + + /* + * Walk the extent tree gathering extent information + * and pushing extents back to the user. + */ + return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); +} /* * ext4_access_path: From da565e792be540a5726af7f8cd50b282af0358b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:09 +0200 Subject: [PATCH 49/58] ext4: remove the call to fiemap_check_flags in ext4_fiemap iomap_fiemap already calls fiemap_check_flags first thing, so this additional check is redundant. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200523073016.2944131-3-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9e90f324d75f..b365c8b407c4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4896,9 +4896,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)) - return -EBADR; - /* * For bitmap files the maximum size limit could be smaller than * s_maxbytes, so check len here manually instead of just relying on the From 44ebcd06bbb3ab3ee446b933800aca32fc4ca9b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:10 +0200 Subject: [PATCH 50/58] fs: mark __generic_block_fiemap static There is no caller left outside of ioctl.c. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-4-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ioctl.c | 4 +--- include/linux/fs.h | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/ioctl.c b/fs/ioctl.c index 5e80b40bc1b5..8fe5131b1dee 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -307,8 +307,7 @@ static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) * If you use this function directly, you need to do your own locking. Use * generic_block_fiemap if you want the locking done for you. */ - -int __generic_block_fiemap(struct inode *inode, +static int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, get_block_t *get_block) { @@ -453,7 +452,6 @@ int __generic_block_fiemap(struct inode *inode, return ret; } -EXPORT_SYMBOL(__generic_block_fiemap); /** * generic_block_fiemap - FIEMAP for block based inodes diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..3104c6f7527b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3299,10 +3299,6 @@ static inline int vfs_fstat(int fd, struct kstat *stat) extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); -extern int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - loff_t start, loff_t len, - get_block_t *get_block); extern int generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, get_block_t *get_block); From 10c5db286452b8c60e8f58e9a4c1cbc5a91e4e5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:11 +0200 Subject: [PATCH 51/58] fs: move the fiemap definitions out of fs.h No need to pull the fiemap definitions into almost every file in the kernel build. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-5-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/bad_inode.c | 1 + fs/btrfs/extent_io.h | 1 + fs/cifs/inode.c | 1 + fs/cifs/smb2ops.c | 1 + fs/ext2/inode.c | 1 + fs/ext4/ext4.h | 1 + fs/f2fs/data.c | 1 + fs/f2fs/inline.c | 1 + fs/gfs2/inode.c | 1 + fs/hpfs/file.c | 1 + fs/ioctl.c | 1 + fs/iomap/fiemap.c | 1 + fs/nilfs2/inode.c | 1 + fs/overlayfs/inode.c | 1 + fs/xfs/xfs_iops.c | 1 + include/linux/fiemap.h | 24 ++++++++++++++++++++++++ include/linux/fs.h | 19 +------------------ include/uapi/linux/fiemap.h | 6 +++--- 18 files changed, 43 insertions(+), 21 deletions(-) create mode 100644 include/linux/fiemap.h diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 8035d2a44561..54f0ce444272 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -15,6 +15,7 @@ #include #include #include +#include static int bad_file_open(struct inode *inode, struct file *filp) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2ed65bd0760e..817698bc0669 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -5,6 +5,7 @@ #include #include +#include #include "ulist.h" /* diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 390d2b15ef6e..3f276eb8ca68 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "cifsfs.h" diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f829f4165d38..09047f1ddfb6 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "cifsfs.h" #include "cifsglob.h" #include "smb2pdu.h" diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c885cf7d724b..0f12a0e8a8d9 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1eb07ca91fca..9e5c332a2b94 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include #endif diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdf2f626bea7..25abbbb65ba0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4167e5408151..9686ffea177e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -8,6 +8,7 @@ #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 70b2d3a1e866..4842f313a808 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "gfs2.h" diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b36abf9cb345..62959a8e43ad 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include +#include #define BLOCKS(size) (((size) + 511) >> 9) diff --git a/fs/ioctl.c b/fs/ioctl.c index 8fe5131b1dee..3f300cc07dee 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index d55e8f491a5e..0a807bbb2b4a 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -6,6 +6,7 @@ #include #include #include +#include struct fiemap_ctx { struct fiemap_extent_info *fi; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 671085512e0f..6e1aca38931f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b0d42ece4d7c..b5fec3410556 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "overlayfs.h" diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f7a99b3bbcf7..44c353998ac5 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * Directories have different lock order w.r.t. mmap_sem compared to regular diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h new file mode 100644 index 000000000000..240d4f7d9116 --- /dev/null +++ b/include/linux/fiemap.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FIEMAP_H +#define _LINUX_FIEMAP_H 1 + +#include +#include + +struct fiemap_extent_info { + unsigned int fi_flags; /* Flags as passed from user */ + unsigned int fi_extents_mapped; /* Number of mapped extents */ + unsigned int fi_extents_max; /* Size of fiemap_extent array */ + struct fiemap_extent __user *fi_extents_start; /* Start of + fiemap_extent array */ +}; + +int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, + u64 phys, u64 len, u32 flags); +int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); + +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, u64 len, + get_block_t *get_block); + +#endif /* _LINUX_FIEMAP_H 1 */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3104c6f7527b..09bcd329c062 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ struct backing_dev_info; struct bdi_writeback; struct bio; struct export_operations; +struct fiemap_extent_info; struct hd_geometry; struct iovec; struct kiocb; @@ -1745,19 +1745,6 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, extern void inode_init_owner(struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); -/* - * VFS FS_IOC_FIEMAP helper definitions. - */ -struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ -}; -int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, - u64 phys, u64 len, u32 flags); -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); /* * This is the "filldir" function type, used by readdir() to let @@ -3299,10 +3286,6 @@ static inline int vfs_fstat(int fd, struct kstat *stat) extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); -extern int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block); - extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 7a900b2377b6..24ca0c00cae3 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -9,8 +9,8 @@ * Andreas Dilger */ -#ifndef _LINUX_FIEMAP_H -#define _LINUX_FIEMAP_H +#ifndef _UAPI_LINUX_FIEMAP_H +#define _UAPI_LINUX_FIEMAP_H #include @@ -67,4 +67,4 @@ struct fiemap { #define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other * files. */ -#endif /* _LINUX_FIEMAP_H */ +#endif /* _UAPI_LINUX_FIEMAP_H */ From 2732881894714f545ffac42dad7ba7730069874d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:12 +0200 Subject: [PATCH 52/58] iomap: fix the iomap_fiemap prototype iomap_fiemap should take u64 start and len arguments, just like the ->fiemap prototype. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-6-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/iomap/fiemap.c | 2 +- include/linux/iomap.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 0a807bbb2b4a..449705575acf 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -66,7 +66,7 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, - loff_t start, loff_t len, const struct iomap_ops *ops) + u64 start, u64 len, const struct iomap_ops *ops) { struct fiemap_ctx ctx; loff_t ret; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b09463dae0d..63db02528b70 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -178,7 +178,7 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - loff_t start, loff_t len, const struct iomap_ops *ops); + u64 start, u64 len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops); loff_t iomap_seek_data(struct inode *inode, loff_t offset, From cddf8a2c4a8286ae60fc866eab59c8bc524e93a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:13 +0200 Subject: [PATCH 53/58] fs: move fiemap range validation into the file systems instances Replace fiemap_check_flags with a fiemap_prep helper that also takes the inode and mapped range, and performs the sanity check and truncation previously done in fiemap_check_range. This way the validation is inside the file system itself and thus properly works for the stacked overlayfs case as well. Signed-off-by: Christoph Hellwig Reviewed-by: Amir Goldstein Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-7-hch@lst.de Signed-off-by: Theodore Ts'o --- Documentation/filesystems/fiemap.txt | 12 +++--- fs/btrfs/inode.c | 2 +- fs/cifs/smb2ops.c | 6 ++- fs/ext4/extents.c | 5 ++- fs/f2fs/data.c | 3 +- fs/ioctl.c | 63 +++++++++++----------------- fs/iomap/fiemap.c | 2 +- fs/nilfs2/inode.c | 2 +- fs/ocfs2/extent_map.c | 3 +- include/linux/fiemap.h | 3 +- 10 files changed, 47 insertions(+), 54 deletions(-) diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt index ac87e6fda842..35c8571eccb6 100644 --- a/Documentation/filesystems/fiemap.txt +++ b/Documentation/filesystems/fiemap.txt @@ -203,16 +203,18 @@ EINTR once fatal signal received. Flag checking should be done at the beginning of the ->fiemap callback via the -fiemap_check_flags() helper: +fiemap_prep() helper: -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags); The struct fieinfo should be passed in as received from ioctl_fiemap(). The set of fiemap flags which the fs understands should be passed via fs_flags. If -fiemap_check_flags finds invalid user flags, it will place the bad values in +fiemap_prep finds invalid user flags, it will place the bad values in fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from -fiemap_check_flags(), it should immediately exit, returning that error back to -ioctl_fiemap(). +fiemap_prep(), it should immediately exit, returning that error back to +ioctl_fiemap(). Additionally the range is validate against the supported +maximum file size. For each extent in the request range, the file system should call diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 320d1062068d..1f1ec361089b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8250,7 +8250,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { int ret; - ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, start, &len, BTRFS_FIEMAP_FLAGS); if (ret) return ret; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 09047f1ddfb6..828e53e795c6 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3408,8 +3408,10 @@ static int smb3_fiemap(struct cifs_tcon *tcon, int i, num, rc, flags, last_blob; u64 next; - if (fiemap_check_flags(fei, FIEMAP_FLAG_SYNC)) - return -EBADR; + rc = fiemap_prep(d_inode(cfile->dentry), fei, start, &len, + FIEMAP_FLAG_SYNC); + if (rc) + return rc; xid = get_xid(); again: diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b365c8b407c4..cea083efb650 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4938,8 +4938,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)) - return -EBADR; + error = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); + if (error) + return error; error = ext4_fiemap_check_ranges(inode, start, &len); if (error) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 25abbbb65ba0..03faafc591b1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1825,7 +1825,8 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); + ret = fiemap_prep(inode, fieinfo, start, &len, + FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); if (ret) return ret; diff --git a/fs/ioctl.c b/fs/ioctl.c index 3f300cc07dee..56bbf02209ae 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -149,61 +149,50 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, EXPORT_SYMBOL(fiemap_fill_next_extent); /** - * fiemap_check_flags - check validity of requested flags for fiemap + * fiemap_prep - check validity of requested flags for fiemap + * @inode: Inode to operate on * @fieinfo: Fiemap context passed into ->fiemap - * @fs_flags: Set of fiemap flags that the file system understands + * @start: Start of the mapped range + * @len: Length of the mapped range, can be truncated by this function. + * @supported_flags: Set of fiemap flags that the file system understands * - * Called from file system ->fiemap callback. This will compute the - * intersection of valid fiemap flags and those that the fs supports. That - * value is then compared against the user supplied flags. In case of bad user - * flags, the invalid values will be written into the fieinfo structure, and - * -EBADR is returned, which tells ioctl_fiemap() to return those values to - * userspace. For this reason, a return code of -EBADR should be preserved. + * This function must be called from each ->fiemap instance to validate the + * fiemap request against the file system parameters. * - * Returns 0 on success, -EBADR on bad flags. + * Returns 0 on success, or a negative error on failure. */ -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags) { + u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; - incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); - if (incompat_flags) { - fieinfo->fi_flags = incompat_flags; - return -EBADR; - } - return 0; -} -EXPORT_SYMBOL(fiemap_check_flags); - -static int fiemap_check_ranges(struct super_block *sb, - u64 start, u64 len, u64 *new_len) -{ - u64 maxbytes = (u64) sb->s_maxbytes; - - *new_len = len; - - if (len == 0) + if (*len == 0) return -EINVAL; - if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ - if (len > maxbytes || (maxbytes - len) < start) - *new_len = maxbytes - start; + if (*len > maxbytes || (maxbytes - *len) < start) + *len = maxbytes - start; + supported_flags &= FIEMAP_FLAGS_COMPAT; + incompat_flags = fieinfo->fi_flags & ~supported_flags; + if (incompat_flags) { + fieinfo->fi_flags = incompat_flags; + return -EBADR; + } return 0; } +EXPORT_SYMBOL(fiemap_prep); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); - struct super_block *sb = inode->i_sb; - u64 len; int error; if (!inode->i_op->fiemap) @@ -215,11 +204,6 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; - error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, - &len); - if (error) - return error; - fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; @@ -232,7 +216,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) filemap_write_and_wait(inode->i_mapping); - error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); + error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, + fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) @@ -320,7 +305,7 @@ static int __generic_block_fiemap(struct inode *inode, bool past_eof = false, whole_file = false; int ret = 0; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 449705575acf..89dca4a97e4a 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -75,7 +75,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, ctx.fi = fi; ctx.prev.type = IOMAP_HOLE; - ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fi, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6e1aca38931f..052c2da11e4d 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1006,7 +1006,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, unsigned int blkbits = inode->i_blkbits; int ret, n; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e3e2d1b2af51..3744179b73fa 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -746,7 +746,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; - ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, map_start, &map_len, + OCFS2_FIEMAP_FLAGS); if (ret) return ret; diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index 240d4f7d9116..4e624c466583 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -13,9 +13,10 @@ struct fiemap_extent_info { fiemap_extent array */ }; +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags); int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, u64 phys, u64 len, u32 flags); -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); int generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, From 45dd052e67ad17c7a24874a783f41aeab15bc294 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:14 +0200 Subject: [PATCH 54/58] fs: handle FIEMAP_FLAG_SYNC in fiemap_prep By moving FIEMAP_FLAG_SYNC handling to fiemap_prep we ensure it is handled once instead of duplicated, but can still be done under fs locks, like xfs/iomap intended with its duplicate handling. Also make sure the error value of filemap_write_and_wait is propagated to user space. Signed-off-by: Christoph Hellwig Reviewed-by: Amir Goldstein Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-8-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/btrfs/inode.c | 4 +--- fs/cifs/smb2ops.c | 3 +-- fs/ext4/extents.c | 2 +- fs/ext4/ioctl.c | 3 --- fs/f2fs/data.c | 3 +-- fs/ioctl.c | 10 ++++++---- fs/iomap/fiemap.c | 8 +------- fs/nilfs2/inode.c | 2 +- fs/ocfs2/extent_map.c | 5 +---- fs/overlayfs/inode.c | 4 ---- 10 files changed, 13 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f1ec361089b..529ffa5e7b45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8243,14 +8243,12 @@ out: return ret; } -#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) - static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { int ret; - ret = fiemap_prep(inode, fieinfo, start, &len, BTRFS_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 828e53e795c6..300ade2acc41 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3408,8 +3408,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon, int i, num, rc, flags, last_blob; u64 next; - rc = fiemap_prep(d_inode(cfile->dentry), fei, start, &len, - FIEMAP_FLAG_SYNC); + rc = fiemap_prep(d_inode(cfile->dentry), fei, start, &len, 0); if (rc) return rc; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cea083efb650..7d088ff1e902 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4938,7 +4938,7 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - error = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); + error = fiemap_prep(inode, fieinfo, start, &len, 0); if (error) return error; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0746532ba463..f81acbbb1b12 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -759,9 +759,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) return -EFAULT; - if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) - filemap_write_and_wait(inode->i_mapping); - error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 03faafc591b1..9de7dc476ed1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1825,8 +1825,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - ret = fiemap_prep(inode, fieinfo, start, &len, - FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_XATTR); if (ret) return ret; diff --git a/fs/ioctl.c b/fs/ioctl.c index 56bbf02209ae..b16e962340db 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -166,6 +166,7 @@ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, { u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; + int ret = 0; if (*len == 0) return -EINVAL; @@ -178,13 +179,17 @@ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; + supported_flags |= FIEMAP_FLAG_SYNC; supported_flags &= FIEMAP_FLAGS_COMPAT; incompat_flags = fieinfo->fi_flags & ~supported_flags; if (incompat_flags) { fieinfo->fi_flags = incompat_flags; return -EBADR; } - return 0; + + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) + ret = filemap_write_and_wait(inode->i_mapping); + return ret; } EXPORT_SYMBOL(fiemap_prep); @@ -213,9 +218,6 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) return -EFAULT; - if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) - filemap_write_and_wait(inode->i_mapping); - error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 89dca4a97e4a..aab070df4a21 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -75,16 +75,10 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, ctx.fi = fi; ctx.prev.type = IOMAP_HOLE; - ret = fiemap_prep(inode, fi, start, &len, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fi, start, &len, 0); if (ret) return ret; - if (fi->fi_flags & FIEMAP_FLAG_SYNC) { - ret = filemap_write_and_wait(inode->i_mapping); - if (ret) - return ret; - } - while (len > 0) { ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, iomap_fiemap_actor); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 052c2da11e4d..25b0d368ecdb 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1006,7 +1006,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, unsigned int blkbits = inode->i_blkbits; int ret, n; - ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 3744179b73fa..a94852af5510 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -733,8 +733,6 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, return 0; } -#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) - int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len) { @@ -746,8 +744,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; - ret = fiemap_prep(inode, fieinfo, map_start, &map_len, - OCFS2_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, map_start, &map_len, 0); if (ret) return ret; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b5fec3410556..c7cb883c47b8 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -462,10 +462,6 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -EOPNOTSUPP; old_cred = ovl_override_creds(inode->i_sb); - - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) - filemap_write_and_wait(realinode->i_mapping); - err = realinode->i_op->fiemap(realinode, fieinfo, start, len); revert_creds(old_cred); From c7d216e8c44cfc0b680d8c0de4f9bdafd92f7ef6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:15 +0200 Subject: [PATCH 55/58] fs: remove the access_ok() check in ioctl_fiemap access_ok just checks we are fed a proper user pointer. We also do that in copy_to_user itself, so no need to do this early. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-9-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ioctl.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ioctl.c b/fs/ioctl.c index b16e962340db..d69786d1dd91 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -213,13 +213,9 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; - if (fiemap.fm_extent_count != 0 && - !access_ok(fieinfo.fi_extents_start, - fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) - return -EFAULT; - error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); + fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) From ba988903937c1b1ce5d54567b50f2ad9604b3bfe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:16 +0200 Subject: [PATCH 56/58] ext4: remove the access_ok() check in ext4_ioctl_get_es_cache access_ok just checks we are fed a proper user pointer. We also do that in copy_to_user itself, so no need to do this early. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200523073016.2944131-10-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f81acbbb1b12..2162db0c747d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -754,11 +754,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; - if (fiemap.fm_extent_count != 0 && - !access_ok(fieinfo.fi_extents_start, - fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) - return -EFAULT; - error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; From 6e014c621e7271649f0d51e54dbe1db4c10486c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 24 May 2020 16:53:16 -0600 Subject: [PATCH 57/58] ext4: don't block for O_DIRECT if IOCB_NOWAIT is set Running with some debug patches to detect illegal blocking triggered the extend/unaligned condition in ext4. If ext4 needs to extend the file (and hence go to buffered IO), or if the app is doing unaligned IO, then ext4 asks the iomap code to wait for IO completion. If the caller asked for no-wait semantics by setting IOCB_NOWAIT, then ext4 should return -EAGAIN instead. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/76152096-2bbb-7682-8fce-4cb498bcd909@kernel.dk Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b8e69f9e3858..2a01e31a032c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -502,6 +502,12 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) return ret; + /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */ + if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) { + ret = -EAGAIN; + goto out; + } + offset = iocb->ki_pos; count = ret; From 6b8ed62008a49751fc71fefd2a4f89202a7c2d4d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 25 May 2020 10:12:15 +0200 Subject: [PATCH 58/58] ext4: avoid unnecessary transaction starts during writeback ext4_writepages() currently works in a loop like: start a transaction scan inode for pages to write map and submit these pages stop the transaction This loop results in starting transaction once more than is needed because in the last iteration we start a transaction only to scan the inode and find there are no pages to write. This can be significant increase in number of transaction starts for single-extent files or files that have all blocks already mapped. Furthermore we already know from previous iteration whether there are more pages to write or not. So propagate the information from mpage_prepare_extent_to_map() and avoid unnecessary looping in case there are no more pages to write. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20200525081215.29451-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e7bf9388538b..6694f0c8e0f7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1534,6 +1534,7 @@ struct mpage_da_data { struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; + unsigned int scanned_until_end:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -1549,6 +1550,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, if (mpd->first_page >= mpd->next_page) return; + mpd->scanned_until_end = 0; index = mpd->first_page; end = mpd->next_page - 1; if (invalidate) { @@ -2195,7 +2197,11 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, if (err < 0) return err; } - return lblk < blocks; + if (lblk >= blocks) { + mpd->scanned_until_end = 1; + return 0; + } + return 1; } /* @@ -2553,7 +2559,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, tag); if (nr_pages == 0) - goto out; + break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -2608,6 +2614,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) pagevec_release(&pvec); cond_resched(); } + mpd->scanned_until_end = 1; return 0; out: pagevec_release(&pvec); @@ -2626,7 +2633,6 @@ static int ext4_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - bool done; struct blk_plug plug; bool give_up_on_write = false; @@ -2712,7 +2718,6 @@ static int ext4_writepages(struct address_space *mapping, retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); - done = false; blk_start_plug(&plug); /* @@ -2722,6 +2727,7 @@ retry: * started. */ mpd.do_map = 0; + mpd.scanned_until_end = 0; mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd.io_submit.io_end) { ret = -ENOMEM; @@ -2737,7 +2743,7 @@ retry: if (ret < 0) goto unplug; - while (!done && mpd.first_page <= mpd.last_page) { + while (!mpd.scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd.io_submit.io_end) { @@ -2772,20 +2778,9 @@ retry: trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); ret = mpage_prepare_extent_to_map(&mpd); - if (!ret) { - if (mpd.map.m_len) - ret = mpage_map_and_submit_extent(handle, &mpd, + if (!ret && mpd.map.m_len) + ret = mpage_map_and_submit_extent(handle, &mpd, &give_up_on_write); - else { - /* - * We scanned the whole range (or exhausted - * nr_to_write), submitted what was mapped and - * didn't find anything needing mapping. We are - * done. - */ - done = true; - } - } /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit