From e54514aaeab6be64679787e78679e8eb700740f6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 2 Feb 2024 11:29:21 +1030 Subject: [PATCH] btrfs-progs: fix stray fd close in open_ctree_fs_info() [BUG] Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors open during whole time") is making sure we're only closing the writeable fds after the fs is properly created, there is still a missing fd not following the requirement. And this explains the issue why sometimes after mkfs.btrfs, lsblk still doesn't give a valid uuid. Shown by the strace output (the command is "mkfs.btrfs -f /dev/test/scratch1"): openat(AT_FDCWD, "/dev/test/scratch1", O_RDWR) = 5 <<< Writeable open fadvise64(5, 0, 0, POSIX_FADV_DONTNEED) = 0 sysinfo({uptime=2529, loads=[8704, 6272, 2496], totalram=4104548352, freeram=3376611328, sharedram=9211904, bufferram=43016192, totalswap=3221221376, freeswap=3221221376, procs=190, totalhigh=0, freehigh=0, mem_unit=1}) = 0 lseek(5, 0, SEEK_END) = 10737418240 lseek(5, 0, SEEK_SET) = 0 ...... close(5) = 0 <<< Closed now pwrite64(6, "O\250\22\261\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 1163264) = 16384 pwrite64(6, "\201\316\272\342\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 1179648) = 16384 pwrite64(6, "K}S\t\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 1196032) = 16384 pwrite64(6, "\207j$\265\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 1212416) = 16384 pwrite64(6, "q\267;\336\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 5242880) = 16384 fsync(6) <<< But we're still writing into the disk. [CAUSE] After more digging, it turns out we have a very obvious escape in open_ctree_fs_info(): open_ctree_fs_info() |- fp = open(oca->filename, flags); |- info = __open_ctree_fd(); |- close(fp); As later we only do IO using the device fd, this close() seems fine. But the truth is, for mkfs usage, this fs_info is a temporary one, with a special magic number for the disk. And since mkfs is doing writeable operations, this close() would immediately trigger udev scan. And since at this stage, the fs is not yet fully created, udev can race with mkfs, and may get the invalid temporary superblock. [FIX] Introduce a new btrfs_fs_info member, initial_fd, for open_ctree_fs_info() to record the fd. And on close_ctree(), if we find fs_info::initial_fd is a valid fd, then close it. By this, we make sure all writeable fds are only closed after we have written valid super blocks into the disk. Issue: #734 Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- kernel-shared/ctree.h | 9 +++++++++ kernel-shared/disk-io.c | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel-shared/ctree.h b/kernel-shared/ctree.h index bcf11d87..94463222 100644 --- a/kernel-shared/ctree.h +++ b/kernel-shared/ctree.h @@ -404,6 +404,15 @@ struct btrfs_fs_info { u32 sectorsize; u32 stripesize; u32 leaf_data_size; + + /* + * For open_ctree_fs_info() to hold the initial fd until close. + * + * For writeable open_ctree_fs_info() call, we should not close + * the fd until the fs_info is properly closed, or it will trigger + * udev scan while our fs is not properly initialized. + */ + int initial_fd; u16 csum_type; u16 csum_size; diff --git a/kernel-shared/disk-io.c b/kernel-shared/disk-io.c index c0533192..05323b2c 100644 --- a/kernel-shared/disk-io.c +++ b/kernel-shared/disk-io.c @@ -913,6 +913,7 @@ struct btrfs_fs_info *btrfs_new_fs_info(int writable, u64 sb_bytenr) fs_info->metadata_alloc_profile = (u64)-1; fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; fs_info->nr_global_roots = 1; + fs_info->initial_fd = -1; return fs_info; @@ -1690,7 +1691,10 @@ struct btrfs_fs_info *open_ctree_fs_info(struct open_ctree_args *oca) return NULL; } info = __open_ctree_fd(fp, oca); - close(fp); + if (info) + info->initial_fd = fp; + else + close(fp); return info; } @@ -2297,6 +2301,8 @@ skip_commit: btrfs_release_all_roots(fs_info); ret = btrfs_close_devices(fs_info->fs_devices); + if (fs_info->initial_fd >= 0) + close(fs_info->initial_fd); btrfs_cleanup_all_caches(fs_info); btrfs_free_fs_info(fs_info); if (!err)