btrfs-progs: fix stray fd close in open_ctree_fs_info()

[BUG]
Although commit b2a1be83b8 ("btrfs-progs: mkfs: keep file descriptors
open during whole time") is making sure we're only closing the writeable
fds after the fs is properly created, there is still a missing fd not
following the requirement.

And this explains the issue why sometimes after mkfs.btrfs, lsblk still
doesn't give a valid uuid.

Shown by the strace output (the command is "mkfs.btrfs -f
/dev/test/scratch1"):

  openat(AT_FDCWD, "/dev/test/scratch1", O_RDWR) = 5 <<< Writeable open
  fadvise64(5, 0, 0, POSIX_FADV_DONTNEED) = 0
  sysinfo({uptime=2529, loads=[8704, 6272, 2496], totalram=4104548352, freeram=3376611328, sharedram=9211904, bufferram=43016192, totalswap=3221221376, freeswap=3221221376, procs=190, totalhigh=0, freehigh=0, mem_unit=1}) = 0
  lseek(5, 0, SEEK_END)                   = 10737418240
  lseek(5, 0, SEEK_SET)                   = 0
  ......
  close(5)                                = 0 <<< Closed now
  pwrite64(6, "O\250\22\261\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 1163264) = 16384
  pwrite64(6, "\201\316\272\342\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 1179648) = 16384
  pwrite64(6, "K}S\t\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 1196032) = 16384
  pwrite64(6, "\207j$\265\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 1212416) = 16384
  pwrite64(6, "q\267;\336\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 5242880) = 16384
  fsync(6) <<< But we're still writing into the disk.

[CAUSE]
After more digging, it turns out we have a very obvious escape in
open_ctree_fs_info():

	open_ctree_fs_info()
	|- fp = open(oca->filename, flags);
	|- info = __open_ctree_fd();
	|- close(fp);

As later we only do IO using the device fd, this close() seems fine.

But the truth is, for mkfs usage, this fs_info is a temporary one, with
a special magic number for the disk.  And since mkfs is doing writeable
operations, this close() would immediately trigger udev scan.

And since at this stage, the fs is not yet fully created, udev can race
with mkfs, and may get the invalid temporary superblock.

[FIX]
Introduce a new btrfs_fs_info member, initial_fd, for
open_ctree_fs_info() to record the fd.

And on close_ctree(), if we find fs_info::initial_fd is a valid fd, then
close it.

By this, we make sure all writeable fds are only closed after we have
written valid super blocks into the disk.

Issue: #734
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Qu Wenruo 2024-02-02 11:29:21 +10:30 committed by David Sterba
parent ace7d241cc
commit e54514aaea
2 changed files with 16 additions and 1 deletions

View File

@ -404,6 +404,15 @@ struct btrfs_fs_info {
u32 sectorsize;
u32 stripesize;
u32 leaf_data_size;
/*
* For open_ctree_fs_info() to hold the initial fd until close.
*
* For writeable open_ctree_fs_info() call, we should not close
* the fd until the fs_info is properly closed, or it will trigger
* udev scan while our fs is not properly initialized.
*/
int initial_fd;
u16 csum_type;
u16 csum_size;

View File

@ -913,6 +913,7 @@ struct btrfs_fs_info *btrfs_new_fs_info(int writable, u64 sb_bytenr)
fs_info->metadata_alloc_profile = (u64)-1;
fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
fs_info->nr_global_roots = 1;
fs_info->initial_fd = -1;
return fs_info;
@ -1690,7 +1691,10 @@ struct btrfs_fs_info *open_ctree_fs_info(struct open_ctree_args *oca)
return NULL;
}
info = __open_ctree_fd(fp, oca);
close(fp);
if (info)
info->initial_fd = fp;
else
close(fp);
return info;
}
@ -2297,6 +2301,8 @@ skip_commit:
btrfs_release_all_roots(fs_info);
ret = btrfs_close_devices(fs_info->fs_devices);
if (fs_info->initial_fd >= 0)
close(fs_info->initial_fd);
btrfs_cleanup_all_caches(fs_info);
btrfs_free_fs_info(fs_info);
if (!err)