btrfs-progs/mkfs/main.c

2094 lines
55 KiB
C
Raw Normal View History

2007-06-12 13:07:11 +00:00
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include "kerncompat.h"
#include <sys/stat.h>
2007-02-20 21:41:09 +00:00
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdbool.h>
#include <string.h>
#include <pthread.h>
#include <uuid/uuid.h>
#include <blkid/blkid.h>
#include "kernel-lib/list.h"
#include "kernel-lib/list_sort.h"
#include "kernel-lib/rbtree.h"
#include "kernel-lib/sizes.h"
#include "kernel-shared/accessors.h"
#include "kernel-shared/extent_io.h"
#include "kernel-shared/uapi/btrfs_tree.h"
#include "kernel-shared/ctree.h"
#include "kernel-shared/disk-io.h"
#include "kernel-shared/volumes.h"
#include "kernel-shared/transaction.h"
#include "kernel-shared/zoned.h"
#include "crypto/hash.h"
#include "common/defs.h"
#include "common/internal.h"
#include "common/messages.h"
#include "common/cpu-utils.h"
#include "common/utils.h"
#include "common/path-utils.h"
#include "common/device-utils.h"
#include "common/device-scan.h"
#include "common/help.h"
#include "common/rbtree-utils.h"
#include "common/parse-utils.h"
#include "common/fsfeatures.h"
#include "common/box.h"
#include "common/units.h"
#include "common/string-utils.h"
#include "common/string-table.h"
#include "cmds/commands.h"
#include "check/qgroup-verify.h"
#include "mkfs/common.h"
#include "mkfs/rootdir.h"
2007-02-20 21:41:09 +00:00
#include "libbtrfs/ctree.h"
/*
 * Running totals of the chunk space set aside per block group type while
 * the filesystem is being created. The helpers in this file add the size of
 * each chunk they create to the matching counter.
 */
struct mkfs_allocation {
/* Total size of data-only chunks */
u64 data;
/* Total size of metadata-only chunks */
u64 metadata;
/* Total size of mixed data+metadata chunks */
u64 mixed;
/* Total size of system chunks */
u64 system;
};
/*
 * File-scope option state and its defaults.
 * NOTE(review): the code that reads and updates these is not visible here;
 * the names suggest they mirror command line options - confirm.
 */
static bool opt_zero_end = true;
static bool opt_discard = true;
static bool opt_zoned = true;
/* Open flags, read-write by default; presumably for the target devices - TODO confirm */
static int opt_oflags = O_RDWR;
struct prepare_device_progress {
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs there is a chance that no /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If at the close() of phase A udev event is triggered, while doing udev scan we go into phase C (but before the new valid super blocks written), udev would only see no superblock or invalid superblock. Then phase C finished, udev resumes its inotify listening, but at this time mkfs is finished, while udev only sees the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and close all the fds until close_ctree() is called. By this, although we may still have race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks. 
To compensate the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd, if we go O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
int fd;
char *file;
u64 dev_byte_count;
u64 byte_count;
int ret;
};
static int create_metadata_block_groups(struct btrfs_root *root, bool mixed,
Revert "btrfs-progs: mkfs: create only desired block groups for single device" This reverts commit 5f8232e5c8f0b0de0ef426274911385b0e877392. This commit causes a regression: $ mkfs.btrfs -f /dev/sda6 $ btrfsck /dev/sda6 Checking filesystem on /dev/sda6 UUID: 2ebb483c-1986-4610-802a-c6f3e6ab4b76 checking extents Chunk[256, 228, 0]: length(4194304), offset(0), type(2) mismatch with block group[0, 192, 4194304]: offset(4194304), objectid(0), flags(34) Chunk[256, 228, 4194304]: length(8388608), offset(4194304), type(4) mismatch with block group[4194304, 192, 8388608]: offset(8388608), objectid(4194304), flags(36) Block group[0, 4194304] (flags = 34) didn't find the relative chunk. Block group[4194304, 8388608] (flags = 36) didn't find the relative chunk. ...... The commit has the following bug causing the problem. 1) Typo forgets to add meta/data_profile for alloc_chunk. Only meta/data_profile is added to allocate a block group, but not chunk. 2) Type for the first system chunk is impossible to modify yet. The type for the first chunk and its stripe is hard coded into make_btrfs() function. So even we try to modify the type of the block group, we are unable to change the type of the first chunk. Causing the chunk type mismatch problem. The 1st bug can be fixed quite easily but the second is not. The good news is, the last patch "btrfs-progs: mkfs: Cleanup temporary chunk to avoid strange balance behavior." from my patchset can handle it quite well alone. So just revert the patch. New bug fix for btrfsck(err is 0 even chunk/extent tree is corrupted) and new test cases for mkfs will follow soon. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: David Sterba <dsterba@suse.com>
2015-07-14 02:13:01 +00:00
struct mkfs_allocation *allocation)
{
struct btrfs_fs_info *fs_info = root->fs_info;
2007-04-06 19:39:12 +00:00
struct btrfs_trans_handle *trans;
btrfs-progs: Fix false ENOSPC alert by tracking used space correctly [BUG] There is a bug report of unexpected ENOSPC from btrfs-convert, issue #123. After some debugging, even when we have enough unallocated space, we still hit ENOSPC at btrfs_reserve_extent(). [CAUSE] Btrfs-progs relies on chunk preallocator to make enough space for data/metadata. However after the introduction of delayed-ref, it's no longer reliable to rely on btrfs_space_info::bytes_used and btrfs_space_info::bytes_pinned to calculate used metadata space. For a running transaction with a lot of allocated tree blocks, btrfs_space_info::bytes_used stays its original value, and will only be updated when running delayed ref. This makes btrfs-progs chunk preallocator completely useless. And for btrfs-convert/mkfs.btrfs --rootdir, if we're going to have enough metadata to fill a metadata block group in one transaction, we will hit ENOSPC no matter whether we have enough unallocated space. [FIX] This patch will introduce btrfs_space_info::bytes_reserved to track how many space we have reserved but not yet committed to extent tree. To support this change, this commit also introduces the following modification: - More comment on btrfs_space_info::bytes_* To make code a little easier to read - Export update_space_info() to preallocate empty data/metadata space info for mkfs. For mkfs, we only have a temporary fs image with SYSTEM chunk only. Export update_space_info() so that we can preallocate empty data/metadata space info before we start a transaction. - Proper btrfs_space_info::bytes_reserved update The timing is the as kernel (except we don't need to update bytes_reserved for data extents) * Increase bytes_reserved when call alloc_reserved_tree_block() * Decrease bytes_reserved when running delayed refs With the help of head->must_insert_reserved to determine whether we need to decrease. Issue: #123 Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-05-24 23:32:43 +00:00
struct btrfs_space_info *sinfo;
u64 flags = BTRFS_BLOCK_GROUP_METADATA;
u64 chunk_start = 0;
u64 chunk_size = 0;
u64 system_group_size = BTRFS_MKFS_SYSTEM_GROUP_SIZE;
int ret;
2007-04-06 19:39:12 +00:00
if (btrfs_is_zoned(fs_info)) {
/* Two zones are reserved for superblock */
system_group_size = fs_info->zone_size;
}
if (mixed)
flags |= BTRFS_BLOCK_GROUP_DATA;
btrfs-progs: Fix false ENOSPC alert by tracking used space correctly [BUG] There is a bug report of unexpected ENOSPC from btrfs-convert, issue #123. After some debugging, even when we have enough unallocated space, we still hit ENOSPC at btrfs_reserve_extent(). [CAUSE] Btrfs-progs relies on chunk preallocator to make enough space for data/metadata. However after the introduction of delayed-ref, it's no longer reliable to rely on btrfs_space_info::bytes_used and btrfs_space_info::bytes_pinned to calculate used metadata space. For a running transaction with a lot of allocated tree blocks, btrfs_space_info::bytes_used stays its original value, and will only be updated when running delayed ref. This makes btrfs-progs chunk preallocator completely useless. And for btrfs-convert/mkfs.btrfs --rootdir, if we're going to have enough metadata to fill a metadata block group in one transaction, we will hit ENOSPC no matter whether we have enough unallocated space. [FIX] This patch will introduce btrfs_space_info::bytes_reserved to track how many space we have reserved but not yet committed to extent tree. To support this change, this commit also introduces the following modification: - More comment on btrfs_space_info::bytes_* To make code a little easier to read - Export update_space_info() to preallocate empty data/metadata space info for mkfs. For mkfs, we only have a temporary fs image with SYSTEM chunk only. Export update_space_info() so that we can preallocate empty data/metadata space info before we start a transaction. - Proper btrfs_space_info::bytes_reserved update The timing is the as kernel (except we don't need to update bytes_reserved for data extents) * Increase bytes_reserved when call alloc_reserved_tree_block() * Decrease bytes_reserved when running delayed refs With the help of head->must_insert_reserved to determine whether we need to decrease. Issue: #123 Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-05-24 23:32:43 +00:00
/* Create needed space info to trace extents reservation */
ret = update_space_info(fs_info, flags, 0, 0, &sinfo);
btrfs-progs: Fix false ENOSPC alert by tracking used space correctly [BUG] There is a bug report of unexpected ENOSPC from btrfs-convert, issue #123. After some debugging, even when we have enough unallocated space, we still hit ENOSPC at btrfs_reserve_extent(). [CAUSE] Btrfs-progs relies on chunk preallocator to make enough space for data/metadata. However after the introduction of delayed-ref, it's no longer reliable to rely on btrfs_space_info::bytes_used and btrfs_space_info::bytes_pinned to calculate used metadata space. For a running transaction with a lot of allocated tree blocks, btrfs_space_info::bytes_used stays its original value, and will only be updated when running delayed ref. This makes btrfs-progs chunk preallocator completely useless. And for btrfs-convert/mkfs.btrfs --rootdir, if we're going to have enough metadata to fill a metadata block group in one transaction, we will hit ENOSPC no matter whether we have enough unallocated space. [FIX] This patch will introduce btrfs_space_info::bytes_reserved to track how many space we have reserved but not yet committed to extent tree. To support this change, this commit also introduces the following modification: - More comment on btrfs_space_info::bytes_* To make code a little easier to read - Export update_space_info() to preallocate empty data/metadata space info for mkfs. For mkfs, we only have a temporary fs image with SYSTEM chunk only. Export update_space_info() so that we can preallocate empty data/metadata space info before we start a transaction. - Proper btrfs_space_info::bytes_reserved update The timing is the as kernel (except we don't need to update bytes_reserved for data extents) * Increase bytes_reserved when call alloc_reserved_tree_block() * Decrease bytes_reserved when running delayed refs With the help of head->must_insert_reserved to determine whether we need to decrease. Issue: #123 Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-05-24 23:32:43 +00:00
if (ret < 0)
return ret;
2007-04-06 19:39:12 +00:00
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
errno = -ret;
error_msg(ERROR_MSG_START_TRANS, "%m");
return ret;
}
root->fs_info->system_allocs = 1;
/*
* We already created the block group item for our temporary system
* chunk in make_btrfs(), so account for the size here.
*/
allocation->system += system_group_size;
if (ret)
return ret;
if (mixed) {
ret = btrfs_alloc_chunk(trans, fs_info,
&chunk_start, &chunk_size,
BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DATA);
if (ret == -ENOSPC) {
error("no space to allocate data/metadata chunk");
goto err;
}
if (ret)
return ret;
ret = btrfs_make_block_group(trans, fs_info, 0,
BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DATA,
chunk_start, chunk_size);
if (ret)
return ret;
allocation->mixed += chunk_size;
} else {
ret = btrfs_alloc_chunk(trans, fs_info,
&chunk_start, &chunk_size,
BTRFS_BLOCK_GROUP_METADATA);
if (ret == -ENOSPC) {
error("no space to allocate metadata chunk");
goto err;
}
if (ret)
return ret;
ret = btrfs_make_block_group(trans, fs_info, 0,
BTRFS_BLOCK_GROUP_METADATA,
chunk_start, chunk_size);
allocation->metadata += chunk_size;
if (ret)
return ret;
}
root->fs_info->system_allocs = 0;
ret = btrfs_commit_transaction(trans, root);
if (ret) {
errno = -ret;
error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
}
err:
return ret;
}
/*
 * Create the first data-only block group inside the caller's transaction.
 *
 * Nothing is done when @mixed is set. Otherwise a data space info is set up,
 * a data chunk is allocated and its block group is created; the chunk size
 * is added to @allocation->data.
 *
 * Returns 0 on success or the error from the failing step.
 */
static int create_data_block_groups(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root, bool mixed,
				    struct mkfs_allocation *allocation)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_space_info *space_info;
	u64 start = 0;
	u64 size = 0;
	int ret;

	if (mixed)
		return 0;

	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA, 0, 0,
				&space_info);
	if (ret < 0)
		return ret;

	ret = btrfs_alloc_chunk(trans, fs_info, &start, &size,
				BTRFS_BLOCK_GROUP_DATA);
	if (ret) {
		if (ret == -ENOSPC)
			error("no space to allocate data chunk");
		return ret;
	}

	ret = btrfs_make_block_group(trans, fs_info, 0,
				     BTRFS_BLOCK_GROUP_DATA, start, size);
	/* The chunk is accounted whether or not the block group was created */
	allocation->data += size;
	return ret;
}
static int make_root_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_key location;
int ret;
ret = btrfs_make_root_dir(trans, root->fs_info->tree_root,
BTRFS_ROOT_TREE_DIR_OBJECTID);
2007-03-21 15:13:29 +00:00
if (ret)
2007-04-06 19:39:12 +00:00
goto err;
ret = btrfs_make_root_dir(trans, root, BTRFS_FIRST_FREE_OBJECTID);
2007-04-06 19:39:12 +00:00
if (ret)
goto err;
memcpy(&location, &root->fs_info->fs_root->root_key, sizeof(location));
location.offset = (u64)-1;
ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
"default", 7,
btrfs_super_root_dir(root->fs_info->super_copy),
&location, BTRFS_FT_DIR, 0);
2007-04-06 19:39:12 +00:00
if (ret)
goto err;
ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
"default", 7, location.objectid,
2008-07-24 16:13:32 +00:00
BTRFS_ROOT_TREE_DIR_OBJECTID, 0);
if (ret)
goto err;
2007-04-06 19:39:12 +00:00
err:
2007-03-21 15:13:29 +00:00
return ret;
}
2007-03-21 00:35:03 +00:00
btrfs-progs: mkfs: recow all tree blocks properly [BUG] Since btrfs-progs v5.14, mkfs.btrfs no longer cleans up the temporary SINGLE metadata chunks if "-R free-space-tree" is specified: $ mkfs.btrfs -f -R free-space-tree -m dup -d dup /dev/test/test $ btrfs ins dump-tree -t chunk /dev/test/test | grep "type METADATA" length 8388608 owner 2 stripe_len 65536 type METADATA length 268435456 owner 2 stripe_len 65536 type METADATA|DUP [CAUSE] Since commit 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time"), free space tree is created when the temporary btrfs image is created. This behavior itself has no problem at all. The problem happens when "-m DUP -d DUP" (or other profiles) is specified. This makes btrfs to create extra chunks, enlarging free space tree so that it can be as high as level 1. During mkfs, we rely on recow_roots() to re-COW all tree blocks to the newly allocated chunks. But __recow_root() can only handle tree root at level 0, as it forces root node to be COWed, not bothering the children leaves/nodes. This makes part of the free space cache tree still live on the old temporary chunks, leaving later cleanup_temp_chunks() unable to delete temporary SINGLE chunks. [FIX] Rework __recow_root() to do a proper COW of the whole tree. But above rework is not enough, as if a free space tree block is allocated during current transaction, but before new chunks added. Then the reworked __recow_root() can't COW it, as btrfs_search_slot() won't COW a tree block allocated in current transaction. So this patch will also commit current transaction before calling recow_roots(), to force us to re-cow all tree blocks. This shouldn't be a problem, as at the time of calling, we should have less than a dozen tree blocks, thus there won't be a performance impact. 
Reported-by: FireFish5000 <firefish5000@gmail.com> Fixes: 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-11 12:06:49 +00:00
static int __recow_root(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
struct btrfs_path path = { 0 };
btrfs-progs: mkfs: recow all tree blocks properly [BUG] Since btrfs-progs v5.14, mkfs.btrfs no longer cleans up the temporary SINGLE metadata chunks if "-R free-space-tree" is specified: $ mkfs.btrfs -f -R free-space-tree -m dup -d dup /dev/test/test $ btrfs ins dump-tree -t chunk /dev/test/test | grep "type METADATA" length 8388608 owner 2 stripe_len 65536 type METADATA length 268435456 owner 2 stripe_len 65536 type METADATA|DUP [CAUSE] Since commit 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time"), free space tree is created when the temporary btrfs image is created. This behavior itself has no problem at all. The problem happens when "-m DUP -d DUP" (or other profiles) is specified. This makes btrfs to create extra chunks, enlarging free space tree so that it can be as high as level 1. During mkfs, we rely on recow_roots() to re-COW all tree blocks to the newly allocated chunks. But __recow_root() can only handle tree root at level 0, as it forces root node to be COWed, not bothering the children leaves/nodes. This makes part of the free space cache tree still live on the old temporary chunks, leaving later cleanup_temp_chunks() unable to delete temporary SINGLE chunks. [FIX] Rework __recow_root() to do a proper COW of the whole tree. But above rework is not enough, as if a free space tree block is allocated during current transaction, but before new chunks added. Then the reworked __recow_root() can't COW it, as btrfs_search_slot() won't COW a tree block allocated in current transaction. So this patch will also commit current transaction before calling recow_roots(), to force us to re-cow all tree blocks. This shouldn't be a problem, as at the time of calling, we should have less than a dozen tree blocks, thus there won't be a performance impact. 
Reported-by: FireFish5000 <firefish5000@gmail.com> Fixes: 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-11 12:06:49 +00:00
struct btrfs_key key;
int ret;
btrfs-progs: mkfs: recow all tree blocks properly [BUG] Since btrfs-progs v5.14, mkfs.btrfs no longer cleans up the temporary SINGLE metadata chunks if "-R free-space-tree" is specified: $ mkfs.btrfs -f -R free-space-tree -m dup -d dup /dev/test/test $ btrfs ins dump-tree -t chunk /dev/test/test | grep "type METADATA" length 8388608 owner 2 stripe_len 65536 type METADATA length 268435456 owner 2 stripe_len 65536 type METADATA|DUP [CAUSE] Since commit 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time"), free space tree is created when the temporary btrfs image is created. This behavior itself has no problem at all. The problem happens when "-m DUP -d DUP" (or other profiles) is specified. This makes btrfs to create extra chunks, enlarging free space tree so that it can be as high as level 1. During mkfs, we rely on recow_roots() to re-COW all tree blocks to the newly allocated chunks. But __recow_root() can only handle tree root at level 0, as it forces root node to be COWed, not bothering the children leaves/nodes. This makes part of the free space cache tree still live on the old temporary chunks, leaving later cleanup_temp_chunks() unable to delete temporary SINGLE chunks. [FIX] Rework __recow_root() to do a proper COW of the whole tree. But above rework is not enough, as if a free space tree block is allocated during current transaction, but before new chunks added. Then the reworked __recow_root() can't COW it, as btrfs_search_slot() won't COW a tree block allocated in current transaction. So this patch will also commit current transaction before calling recow_roots(), to force us to re-cow all tree blocks. This shouldn't be a problem, as at the time of calling, we should have less than a dozen tree blocks, thus there won't be a performance impact. 
Reported-by: FireFish5000 <firefish5000@gmail.com> Fixes: 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-11 12:06:49 +00:00
key.objectid = 0;
key.type = 0;
key.offset = 0;
btrfs-progs: mkfs: recow all tree blocks properly [BUG] Since btrfs-progs v5.14, mkfs.btrfs no longer cleans up the temporary SINGLE metadata chunks if "-R free-space-tree" is specified: $ mkfs.btrfs -f -R free-space-tree -m dup -d dup /dev/test/test $ btrfs ins dump-tree -t chunk /dev/test/test | grep "type METADATA" length 8388608 owner 2 stripe_len 65536 type METADATA length 268435456 owner 2 stripe_len 65536 type METADATA|DUP [CAUSE] Since commit 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time"), free space tree is created when the temporary btrfs image is created. This behavior itself has no problem at all. The problem happens when "-m DUP -d DUP" (or other profiles) is specified. This makes btrfs to create extra chunks, enlarging free space tree so that it can be as high as level 1. During mkfs, we rely on recow_roots() to re-COW all tree blocks to the newly allocated chunks. But __recow_root() can only handle tree root at level 0, as it forces root node to be COWed, not bothering the children leaves/nodes. This makes part of the free space cache tree still live on the old temporary chunks, leaving later cleanup_temp_chunks() unable to delete temporary SINGLE chunks. [FIX] Rework __recow_root() to do a proper COW of the whole tree. But above rework is not enough, as if a free space tree block is allocated during current transaction, but before new chunks added. Then the reworked __recow_root() can't COW it, as btrfs_search_slot() won't COW a tree block allocated in current transaction. So this patch will also commit current transaction before calling recow_roots(), to force us to re-cow all tree blocks. This shouldn't be a problem, as at the time of calling, we should have less than a dozen tree blocks, thus there won't be a performance impact. 
Reported-by: FireFish5000 <firefish5000@gmail.com> Fixes: 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-11 12:06:49 +00:00
/* Get a path to the left-most leaves */
ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
if (ret < 0)
return ret;
while (true) {
struct btrfs_key found_key;
/*
* Our parent nodes must not be newer than the leaf, thus if
* the leaf is as new as the transaction, no need to re-COW.
*/
if (btrfs_header_generation(path.nodes[0]) == trans->transid)
goto next;
/*
* Grab the key of current tree block and do a COW search to
* the current tree block.
*/
btrfs_item_key_to_cpu(path.nodes[0], &key, 0);
btrfs_release_path(&path);
/* This will ensure this leaf and all its parent get COWed */
ret = btrfs_search_slot(trans, root, &key, &path, 0, 1);
if (ret < 0)
goto out;
ret = 0;
btrfs_item_key_to_cpu(path.nodes[0], &found_key, 0);
UASSERT(btrfs_comp_cpu_keys(&key, &found_key) == 0);
btrfs-progs: mkfs: recow all tree blocks properly [BUG] Since btrfs-progs v5.14, mkfs.btrfs no longer cleans up the temporary SINGLE metadata chunks if "-R free-space-tree" is specified: $ mkfs.btrfs -f -R free-space-tree -m dup -d dup /dev/test/test $ btrfs ins dump-tree -t chunk /dev/test/test | grep "type METADATA" length 8388608 owner 2 stripe_len 65536 type METADATA length 268435456 owner 2 stripe_len 65536 type METADATA|DUP [CAUSE] Since commit 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time"), free space tree is created when the temporary btrfs image is created. This behavior itself has no problem at all. The problem happens when "-m DUP -d DUP" (or other profiles) is specified. This makes btrfs to create extra chunks, enlarging free space tree so that it can be as high as level 1. During mkfs, we rely on recow_roots() to re-COW all tree blocks to the newly allocated chunks. But __recow_root() can only handle tree root at level 0, as it forces root node to be COWed, not bothering the children leaves/nodes. This makes part of the free space cache tree still live on the old temporary chunks, leaving later cleanup_temp_chunks() unable to delete temporary SINGLE chunks. [FIX] Rework __recow_root() to do a proper COW of the whole tree. But above rework is not enough, as if a free space tree block is allocated during current transaction, but before new chunks added. Then the reworked __recow_root() can't COW it, as btrfs_search_slot() won't COW a tree block allocated in current transaction. So this patch will also commit current transaction before calling recow_roots(), to force us to re-cow all tree blocks. This shouldn't be a problem, as at the time of calling, we should have less than a dozen tree blocks, thus there won't be a performance impact. 
Reported-by: FireFish5000 <firefish5000@gmail.com> Fixes: 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-11 12:06:49 +00:00
next:
ret = btrfs_next_leaf(root, &path);
if (ret < 0)
goto out;
if (ret > 0) {
ret = 0;
goto out;
}
}
out:
btrfs_release_path(&path);
return ret;
}
/*
 * Re-COW every root linked into the global roots tree of the filesystem
 * that @trans belongs to.
 *
 * Stops at, and returns, the first non-zero result; returns 0 otherwise.
 */
static int recow_global_roots(struct btrfs_trans_handle *trans)
{
	struct rb_node *node = rb_first(&trans->fs_info->global_roots_tree);

	while (node) {
		struct btrfs_root *global_root;
		int ret;

		global_root = rb_entry(node, struct btrfs_root, rb_node);
		ret = __recow_root(trans, global_root);
		if (ret)
			return ret;

		node = rb_next(node);
	}

	return 0;
}
/*
 * Re-COW all trees of the filesystem @root belongs to.
 *
 * Order: fs root, tree root, chunk root, device root, the block group root
 * (only when the BLOCK_GROUP_TREE compat_ro feature is set) and finally all
 * global roots. The first failure ends the sequence and is returned.
 */
static int recow_roots(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = __recow_root(trans, fs_info->fs_root);

	if (!ret)
		ret = __recow_root(trans, fs_info->tree_root);
	if (!ret)
		ret = __recow_root(trans, fs_info->chunk_root);
	if (!ret)
		ret = __recow_root(trans, fs_info->dev_root);
	if (!ret && btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
		ret = __recow_root(trans, fs_info->block_group_root);
	if (!ret)
		ret = recow_global_roots(trans);

	return ret;
}
/*
 * Allocate one chunk of the given @type, create its block group and add the
 * chunk size to the matching counter in @allocation.
 *
 * @type is a block group type (data, metadata, system or data+metadata)
 * optionally combined with profile bits; only the type part selects the
 * counter.
 *
 * Running out of space terminates the program with exit status 1. Any other
 * failure is returned as a negative errno, and -EINVAL is returned when the
 * type part of @type is not one of the recognised combinations.
 */
static int create_one_raid_group(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, u64 type,
				 struct mkfs_allocation *allocation)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 chunk_start = 0;
	u64 chunk_size = 0;
	u64 group_type;
	int ret;

	ret = btrfs_alloc_chunk(trans, fs_info,
				&chunk_start, &chunk_size, type);
	if (ret == -ENOSPC) {
		error("not enough free space to allocate chunk");
		exit(1);
	}
	if (ret)
		return ret;

	ret = btrfs_make_block_group(trans, fs_info, 0,
				     type, chunk_start, chunk_size);
	/*
	 * Report the failure as is: do not account a block group that was
	 * not created and do not let the type check below replace the error.
	 */
	if (ret)
		return ret;

	group_type = type & BTRFS_BLOCK_GROUP_TYPE_MASK;
	switch (group_type) {
	case BTRFS_BLOCK_GROUP_DATA:
		allocation->data += chunk_size;
		break;
	case BTRFS_BLOCK_GROUP_METADATA:
		allocation->metadata += chunk_size;
		break;
	case BTRFS_BLOCK_GROUP_SYSTEM:
		allocation->system += chunk_size;
		break;
	case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
		allocation->mixed += chunk_size;
		break;
	default:
		error("unrecognized profile type: 0x%llx",
		      (unsigned long long)group_type);
		ret = -EINVAL;
		break;
	}

	return ret;
}
static int create_raid_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 data_profile,
u64 metadata_profile, bool mixed,
struct mkfs_allocation *allocation)
{
btrfs-progs: mkfs: recow all tree blocks properly [BUG] Since btrfs-progs v5.14, mkfs.btrfs no longer cleans up the temporary SINGLE metadata chunks if "-R free-space-tree" is specified: $ mkfs.btrfs -f -R free-space-tree -m dup -d dup /dev/test/test $ btrfs ins dump-tree -t chunk /dev/test/test | grep "type METADATA" length 8388608 owner 2 stripe_len 65536 type METADATA length 268435456 owner 2 stripe_len 65536 type METADATA|DUP [CAUSE] Since commit 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time"), free space tree is created when the temporary btrfs image is created. This behavior itself has no problem at all. The problem happens when "-m DUP -d DUP" (or other profiles) is specified. This makes btrfs to create extra chunks, enlarging free space tree so that it can be as high as level 1. During mkfs, we rely on recow_roots() to re-COW all tree blocks to the newly allocated chunks. But __recow_root() can only handle tree root at level 0, as it forces root node to be COWed, not bothering the children leaves/nodes. This makes part of the free space cache tree still live on the old temporary chunks, leaving later cleanup_temp_chunks() unable to delete temporary SINGLE chunks. [FIX] Rework __recow_root() to do a proper COW of the whole tree. But above rework is not enough, as if a free space tree block is allocated during current transaction, but before new chunks added. Then the reworked __recow_root() can't COW it, as btrfs_search_slot() won't COW a tree block allocated in current transaction. So this patch will also commit current transaction before calling recow_roots(), to force us to re-cow all tree blocks. This shouldn't be a problem, as at the time of calling, we should have less than a dozen tree blocks, thus there won't be a performance impact. 
Reported-by: FireFish5000 <firefish5000@gmail.com> Fixes: 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-11 12:06:49 +00:00
int ret = 0;
if (metadata_profile) {
u64 meta_flags = BTRFS_BLOCK_GROUP_METADATA;
ret = create_one_raid_group(trans, root,
BTRFS_BLOCK_GROUP_SYSTEM |
metadata_profile, allocation);
if (ret)
return ret;
if (mixed)
meta_flags |= BTRFS_BLOCK_GROUP_DATA;
ret = create_one_raid_group(trans, root, meta_flags |
metadata_profile, allocation);
if (ret)
return ret;
}
btrfs-progs: mkfs: allow --data DUP for single device Current code don't support DUP profile on single device, except it is in mixed mode, because of following reasons: 1: Some SSD do deduplication internally, so the duplication on the filesystem side has no effect. 2: On a physical device, if the entire disk broken, --data DUP does not help. 3: Half performance compared to single profile. 4: We have a workaround: create multi-partition on a single device, and btffs will treat them as multi device. Instead of refusing --data DUP, we give the user a choice and print a wrning. Test: 1: Tested by xfstests Run with modified xfstests, I add test items of -d dup in single device into btrfs/* and common/rc, run tests of btrfs/*, with all mount option, no regression diffed with v4.3. 2: Tested by btrfs-progs Checked following commands in "-m dup -d dup" fs with memleck checking, all passed: mkfs.btrfs -f --data dup --metadata dup /dev/sda6 btrfs filesystem show /dev/sda6 btrfs filesystem label /dev/sda6 btrfs_label_test btrfs filesystem label /dev/sda6 btrfs device scan --all-devices btrfs device scan /dev/sda6 btrfs device scan /dev/sda6 btrfs device ready /dev/sda6 btrfs check /dev/sda6 btrfs check -s 1 /dev/sda6 btrfs check --repair /dev/sda6 btrfs check --init-csum-tree /dev/sda6 btrfs check --init-extent-tree /dev/sda6 btrfs check --check-data-csum /dev/sda6 btrfs check --qgroup-report /dev/sda6 btrfs rescue super-recover -y /dev/sda6 btrfs rescue zero-log /dev/sda6 btrfs restore -l /dev/sda6 btrfs restore /dev/sda6 / btrfs restore -s /dev/sda6 / btrfs restore -x /dev/sda6 / btrfs restore -m /dev/sda6 / btrfs restore -S /dev/sda6 / btrfs restore -v /dev/sda6 / btrfs restore -i /dev/sda6 / btrfs restore -o /dev/sda6 / btrfs restore -u0 /dev/sda6 / btrfs restore -u1 /dev/sda6 / btrfs restore -D /dev/sda6 / btrfs property list /dev/sda6 btrfs property get /dev/sda6 label btrfs property set /dev/sda6 label test btrfs property set /dev/sda6 label btrfs_label_test btrfs 
help btrfs help --full btrfs version btrfsck /dev/sda6 btrfs-find-root /dev/sda6 btrfs-find-root -a /dev/sda6 btrfs-map-logical -l1 /dev/sda6 btrfs-map-logical -l1 -c1 /dev/sda6 btrfs-map-logical -l1 -o /tmp/btrfs-map-logic-out /dev/sda6 btrfs-map-logical -l1 -b1 /dev/sda6 btrfs-select-super -s 0 /dev/sda6 btrfs-select-super -s 1 /dev/sda6 btrfstune -S 1 /dev/sda6 btrfstune -f -S 0 /dev/sda6 btrfstune -r /dev/sda6 btrfstune -x /dev/sda6 btrfstune -n /dev/sda6 btrfstune -f -U 00000000-0000-0000-0000-000000000000 /dev/sda6 btrfstune -f -u /dev/sda6 btrfs-calc-size /dev/sda6 btrfs-calc-size -v /dev/sda6 btrfs-calc-size -b /dev/sda6 btrfs-debug-tree /dev/sda6 btrfs-debug-tree -e /dev/sda6 btrfs-debug-tree -d /dev/sda6 btrfs-debug-tree -r /dev/sda6 btrfs-debug-tree -R /dev/sda6 btrfs-debug-tree -u /dev/sda6 btrfs-debug-tree -b 0 /dev/sda6 btrfs-debug-tree -t 0 /dev/sda6 btrfs-debug-tree -t 2 /dev/sda6 btrfs-show-super /dev/sda6 btrfs-show-super -i 0 /dev/sda6 btrfs-show-super -i 1 /dev/sda6 btrfs-show-super -i 2 /dev/sda6 btrfs-show-super -a /dev/sda6 btrfs-show-super -f /dev/sda6 btrfs-show-super -F /dev/sda6 btrfs subvolume list /mnt/btrfs-progs-tests btrfs subvolume create /mnt/btrfs-progs-tests/mysubvol btrfs subvolume list /mnt/btrfs-progs-tests btrfs subvolume get-default /mnt/btrfs-progs-tests btrfs subvolume set-default 258 /mnt/btrfs-progs-tests btrfs subvolume get-default /mnt/btrfs-progs-tests btrfs subvolume set-default /mnt/btrfs-progs-tests btrfs subvolume snapshot /mnt/btrfs-progs-tests/mysubvol /mnt/btrfs-progs-tests/mysubvol_snap btrfs subvolume list /mnt/btrfs-progs-tests btrfs subvolume find-new /mnt/btrfs-progs-tests 0 btrfs subvolume find-new /mnt/btrfs-progs-tests 0 btrfs subvolume find-new /mnt/btrfs-progs-tests/mysubvol 0 btrfs subvolume find-new /mnt/btrfs-progs-tests/mysubvol 0 btrfs subvolume show /mnt/btrfs-progs-tests btrfs subvolume show /mnt/btrfs-progs-tests/mysubvol btrfs subvolume show /mnt/btrfs-progs-tests/mysubvol_snap btrfs 
subvolume sync /mnt/btrfs-progs-tests btrfs subvolume delete /mnt/btrfs-progs-tests/mysubvol_snap btrfs subvolume delete /mnt/btrfs-progs-tests/mysubvol btrfs subvolume sync /mnt/btrfs-progs-tests btrfs filesystem df /mnt/btrfs-progs-tests btrfs filesystem show /mnt/btrfs-progs-tests btrfs filesystem sync /mnt/btrfs-progs-tests btrfs filesystem label /mnt/btrfs-progs-tests btrfs_label_test btrfs filesystem label /mnt/btrfs-progs-tests btrfs filesystem usage /mnt/btrfs-progs-tests btrfs filesystem defragment -s 1024 -l 2048 /mnt/btrfs-progs-tests/filesystem_test_dir/test_dir_0/test_file_0 btrfs filesystem defragment /mnt/btrfs-progs-tests/filesystem_test_dir/test_dir_0/test_file_1 btrfs filesystem defragment -f /mnt/btrfs-progs-tests/filesystem_test_dir/test_dir_0/test_file_2 btrfs filesystem defragment -czlib /mnt/btrfs-progs-tests/filesystem_test_dir/test_dir_0/test_file_3 btrfs filesystem defragment -clzo /mnt/btrfs-progs-tests/filesystem_test_dir/test_dir_0/test_file_4 btrfs filesystem defragment /mnt/btrfs-progs-tests/filesystem_test_dir btrfs filesystem defragment -r /mnt/btrfs-progs-tests/filesystem_test_dir btrfs filesystem defragment /mnt/btrfs-progs-tests btrfs filesystem resize 1:-10M /mnt/btrfs-progs-tests btrfs filesystem resize 1:max /mnt/btrfs-progs-tests btrfs balance start /mnt/btrfs-progs-tests btrfs balance start -v /mnt/btrfs-progs-tests btrfs balance start -f /mnt/btrfs-progs-tests btrfs balance status -v /mnt/btrfs-progs-tests btrfs balance pause /mnt/btrfs-progs-tests btrfs balance status /mnt/btrfs-progs-tests btrfs balance resume /mnt/btrfs-progs-tests btrfs balance status -v /mnt/btrfs-progs-tests btrfs balance cancel /mnt/btrfs-progs-tests btrfs balance start -dprofiles=single /mnt/btrfs-progs-tests btrfs balance start -dconvert=single /mnt/btrfs-progs-tests btrfs balance start -ddevid=1 /mnt/btrfs-progs-tests btrfs balance start -f -mprofiles=single /mnt/btrfs-progs-tests btrfs balance start -f -mconvert=single /mnt/btrfs-progs-tests 
btrfs balance start -f -mdevid=1 /mnt/btrfs-progs-tests btrfs balance start -f -sprofiles=single /mnt/btrfs-progs-tests btrfs balance start -f -sconvert=single /mnt/btrfs-progs-tests btrfs balance start -f -sdevid=1 /mnt/btrfs-progs-tests btrfs device add -f /dev/sda10 /mnt/btrfs-progs-tests btrfs device del /dev/sda10 /mnt/btrfs-progs-tests btrfs device stats /dev/sda6 btrfs device stats -z /dev/sda6 btrfs device stats /mnt/btrfs-progs-tests btrfs device stats -z /mnt/btrfs-progs-tests btrfs device usage /mnt/btrfs-progs-tests btrfs scrub status /mnt/btrfs-progs-tests btrfs scrub start -B /mnt/btrfs-progs-tests btrfs scrub start -B -d /mnt/btrfs-progs-tests btrfs scrub start -B -r /mnt/btrfs-progs-tests btrfs scrub status /mnt/btrfs-progs-tests btrfs scrub start /mnt/btrfs-progs-tests btrfs scrub status /mnt/btrfs-progs-tests btrfs scrub status /mnt/btrfs-progs-tests btrfs scrub status -d /mnt/btrfs-progs-tests btrfs scrub status -R /mnt/btrfs-progs-tests btrfs scrub status /mnt/btrfs-progs-tests btrfs scrub start /dev/sda6 btrfs scrub status /dev/sda6 btrfs scrub status /dev/sda6 btrfs scrub status -d /dev/sda6 btrfs scrub status -R /dev/sda6 btrfs scrub status /dev/sda6 btrfs subvolume snapshot -r /mnt/btrfs-progs-tests /mnt/btrfs-progs-tests/snap1 btrfs send -f /tmp/btrfs_snapshot_test /mnt/btrfs-progs-tests/snap1 btrfs send -e -f /tmp/btrfs_snapshot_test /mnt/btrfs-progs-tests/snap1 btrfs send --no-data -f /tmp/btrfs_snapshot_test /mnt/btrfs-progs-tests/snap1 btrfs quota enable /mnt/btrfs-progs-tests btrfs quota rescan /mnt/btrfs-progs-tests btrfs quota rescan -s /mnt/btrfs-progs-tests btrfs quota rescan -w /mnt/btrfs-progs-tests btrfs quota disable /mnt/btrfs-progs-tests btrfs quota enable /mnt/btrfs-progs-tests btrfs qgroup create 1/5 /mnt/btrfs-progs-tests btrfs qgroup create 2/5 /mnt/btrfs-progs-tests btrfs qgroup assign 1/5 2/5 /mnt/btrfs-progs-tests btrfs qgroup limit 1G 1/5 /mnt/btrfs-progs-tests btrfs qgroup show /mnt/btrfs-progs-tests btrfs qgroup 
show -p -c -r -e -F -f /mnt/btrfs-progs-tests btrfs qgroup remove 1/5 2/5 /mnt/btrfs-progs-tests btrfs qgroup destroy 2/5 /mnt/btrfs-progs-tests btrfs qgroup destroy 1/5 /mnt/btrfs-progs-tests btrfs quota disable /mnt/btrfs-progs-tests btrfs replace start -f -B /dev/sda6 /dev/sda10 /mnt/btrfs-progs-tests btrfs replace status /mnt/btrfs-progs-tests btrfs replace start -f -B /dev/sda10 /dev/sda6 /mnt/btrfs-progs-tests btrfs-convert /dev/sda6 btrfs-convert -r /dev/sda6 btrfs-convert -d /dev/sda6 btrfs-convert -i /dev/sda6 btrfs-convert -n /dev/sda6 btrfs-convert -N 4096 /dev/sda6 btrfs-convert -l test /dev/sda6 btrfs-convert -L /dev/sda6 btrfs-convert --no-progress /dev/sda6 mkfs.btrfs -f /dev/sda6 btrfs-image /dev/sda6 /tmp/btrfs_image.img btrfs-image -r /tmp/btrfs_image.img /dev/sda6 mkfs.btrfs -f /dev/sda6 btrfs-image -c 0 /dev/sda6 /tmp/btrfs_image.img btrfs-image -r /tmp/btrfs_image.img /dev/sda6 mkfs.btrfs -f /dev/sda6 btrfs-image -c 9 /dev/sda6 /tmp/btrfs_image.img btrfs-image -r /tmp/btrfs_image.img /dev/sda6 mkfs.btrfs -f /dev/sda6 btrfs-image -t 0 /dev/sda6 /tmp/btrfs_image.img btrfs-image -r /tmp/btrfs_image.img /dev/sda6 mkfs.btrfs -f /dev/sda6 btrfs-image -t 1 /dev/sda6 /tmp/btrfs_image.img btrfs-image -r /tmp/btrfs_image.img /dev/sda6 mkfs.btrfs -f /dev/sda6 btrfs-image -t 32 /dev/sda6 /tmp/btrfs_image.img btrfs-image -r /tmp/btrfs_image.img /dev/sda6 mkfs.btrfs -f /dev/sda6 btrfs-image -w /dev/sda6 /tmp/btrfs_image.img btrfs-image -r /tmp/btrfs_image.img /dev/sda6 mkfs.btrfs -f /dev/sda6 btrfs-image -w /dev/sda6 /tmp/btrfs_image.img btrfs-image -r /tmp/btrfs_image.img /dev/sda6 btrfs-image -r -t 0 /tmp/btrfs_image.img /dev/sda6 btrfs-image -r -t 1 /tmp/btrfs_image.img /dev/sda6 btrfs-image -r -t 32 /tmp/btrfs_image.img /dev/sda6 btrfs-image -r -o /tmp/btrfs_image.img /dev/sda6 3: Manual check relation source by: grep DUP *.c Confirmed that all source are modified. 
4: Use this raid type manually, do some operations in fs, no error found in command and dmesg. 5: Combination of dup conversion with fsck Confirmed OK with relative kernel patch titled: [PATCH] btrfs: Support convert to -d dup for btrfs-convert export TEST_DEV='/dev/vdc' export TEST_DIR='/var/ltf/tester/mnt' do_dup_test() { local m_from="$1" local d_from="$2" local m_to="$3" local d_to="$4" echo "Convert from -m $m_from -d $d_from to -m $m_to -d $d_to" umount "$TEST_DIR" &>/dev/null ./mkfs.btrfs -f -m "$m_from" -d "$d_from" "$TEST_DEV" >/dev/null || return 1 mount "$TEST_DEV" "$TEST_DIR" || return 1 cp -a /sbin/* "$TEST_DIR" [[ "$m_from" != "$m_to" ]] && { ./btrfs balance start -f -mconvert="$m_to" "$TEST_DIR" || return 1 } [[ "$d_from" != "$d_to" ]] && { local opt=() [[ "$d_to" == single ]] && opt+=("-f") ./btrfs balance start "${opt[@]}" -dconvert="$d_to" "$TEST_DIR" || return 1 } umount "$TEST_DIR" || return 1 ./btrfsck "$TEST_DEV" || return 1 echo return 0 } test_all() { for m_from in single dup; do for d_from in single dup; do for m_to in single dup; do for d_to in single dup; do do_dup_test "$m_from" "$d_from" "$m_to" "$d_to" || return 1 done done done done } test_all Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> Tested-by: Austin S. Hemmelgarn <ahferroin7@gmail.com> [ minor updates in the changelog ] Signed-off-by: David Sterba <dsterba@suse.com>
2015-11-19 09:36:24 +00:00
if (!mixed && data_profile) {
ret = create_one_raid_group(trans, root,
BTRFS_BLOCK_GROUP_DATA |
data_profile, allocation);
if (ret)
return ret;
}
return ret;
}
/*
 * Usage text of mkfs.btrfs as a NULL-terminated array of strings.
 *
 * The first entry is the synopsis and the second a one-line description.
 * Plain strings such as "Features:" are section headings, every OPTLINE()
 * entry pairs an option with its description.
 */
static const char * const mkfs_usage[] = {
"mkfs.btrfs [options] <dev> [<dev...>]",
"Create a BTRFS filesystem on a device or multiple devices",
"",
"Allocation profiles:",
OPTLINE("-d|--data PROFILE", "data profile, raid0, raid1, raid1c3, raid1c4, raid5, raid6, raid10, dup or single"),
OPTLINE("-m|--metadata PROFILE", "metadata profile, values like for data profile"),
OPTLINE("-M|--mixed","mix metadata and data together"),
"Features:",
/* Two spellings of the same option; only the second carries the description. */
OPTLINE("--csum TYPE", ""),
OPTLINE("--checksum TYPE", "checksum algorithm to use, crc32c (default), xxhash, sha256, blake2"),
OPTLINE("-n|--nodesize SIZE", "size of btree nodes"),
OPTLINE("-s|--sectorsize SIZE", "data block size (may not be mountable by current kernel)"),
OPTLINE("-O|--features LIST", "comma separated list of filesystem features (use '-O list-all' to list features)"),
OPTLINE("-L|--label LABEL", "set the filesystem label"),
OPTLINE("-U|--uuid UUID", "specify the filesystem UUID (must be unique for a filesystem with multiple devices)"),
OPTLINE("--device-uuid UUID", "Specify the filesystem device UUID (a.k.a sub-uuid) (for single device filesystem only)"),
"Creation:",
OPTLINE("-b|--byte-count SIZE", "set size of each device to SIZE (filesystem size is sum of all device sizes)"),
OPTLINE("-r|--rootdir DIR", "copy files from DIR to the image root directory"),
OPTLINE("--shrink", "(with --rootdir) shrink the filled filesystem to minimal size"),
OPTLINE("-K|--nodiscard", "do not perform whole device TRIM"),
OPTLINE("-f|--force", "force overwrite of existing filesystem"),
"General:",
OPTLINE("-q|--quiet", "no messages except errors"),
OPTLINE("-v|--verbose", "increase verbosity level, default is 1"),
OPTLINE("-V|--version", "print the mkfs.btrfs version and exit"),
OPTLINE("--help", "print this help and exit"),
/* Options kept in the help only to point users at their replacements. */
"Deprecated:",
OPTLINE("-l|--leafsize SIZE", "removed in 6.0, use --nodesize"),
OPTLINE("-R|--runtime-features LIST", "removed in 6.3, use -O|--features"),
/* Terminator of the list. */
NULL
};
/*
 * Command descriptor for mkfs.btrfs.  Only the usage text is filled in, all
 * other members are left zero-initialized.
 * NOTE(review): presumably consumed by the common help/usage helpers - confirm
 * against the callers.
 */
static const struct cmd_struct mkfs_cmd = {
.usagestr = mkfs_usage
};
/*
 * Prepare a file that is going to hold a filesystem image.
 *
 * The first 1MiB of @out_fd is overwritten with zeros in 4KiB steps, then a
 * single zero byte is written at offset @size - 1 so that the file is at
 * least @size bytes long.
 *
 * Returns 0 on success and -EIO as soon as one write fails or is short; no
 * further writes are issued after the first failure.
 */
static int zero_output_file(int out_fd, u64 size)
{
	char buf[SZ_4K];
	u64 location;
	ssize_t written;

	memset(buf, 0, sizeof(buf));

	/* Only zero out the first 1M */
	for (location = 0; location < SZ_1M; location += SZ_4K) {
		written = pwrite(out_fd, buf, sizeof(buf), location);
		if (written != (ssize_t)sizeof(buf))
			return -EIO;
	}

	/* Then enlarge the file to size */
	written = pwrite(out_fd, buf, 1, size - 1);
	if (written < 1)
		return -EIO;

	return 0;
}
static int _cmp_device_by_id(void *priv, struct list_head *a,
struct list_head *b)
{
return list_entry(a, struct btrfs_device, dev_list)->devid -
list_entry(b, struct btrfs_device, dev_list)->devid;
}
static void list_all_devices(struct btrfs_root *root, bool is_zoned)
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fileds printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switched is passed, the output is more verbose; if the '-q' switched is passed, only the errors are printed. Below an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
{
struct btrfs_fs_devices *fs_devices;
struct btrfs_device *device;
int number_of_devices = 0;
struct string_table *tab;
int row, col;
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fileds printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switched is passed, the output is more verbose; if the '-q' switched is passed, only the errors are printed. Below an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
fs_devices = root->fs_info->fs_devices;
list_for_each_entry(device, &fs_devices->devices, dev_list)
number_of_devices++;
list_sort(NULL, &fs_devices->devices, _cmp_device_by_id);
printf("Number of devices: %d\n", number_of_devices);
printf("Devices:\n");
if (is_zoned)
tab = table_create(4, number_of_devices + 1);
else
tab = table_create(3, number_of_devices + 1);
tab->spacing = STRING_TABLE_SPACING_2;
col = 0;
table_printf(tab, col++, 0, "> ID");
table_printf(tab, col++, 0, "> SIZE");
if (is_zoned)
table_printf(tab, col++, 0, ">ZONES");
table_printf(tab, col++, 0, "<PATH");
row = 1;
list_for_each_entry(device, &fs_devices->devices, dev_list) {
col = 0;
table_printf(tab, col++, row, ">%llu", device->devid);
table_printf(tab, col++, row, ">%s", pretty_size(device->total_bytes));
if (is_zoned)
table_printf(tab, col++, row, ">%u", device->zone_info->nr_zones);
table_printf(tab, col++, row, "<%s", device->name);
row++;
}
table_dump(tab);
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fileds printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switched is passed, the output is more verbose; if the '-q' switched is passed, only the errors are printed. Below an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
printf("\n");
table_free(tab);
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fileds printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switched is passed, the output is more verbose; if the '-q' switched is passed, only the errors are printed. Below an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
}
/*
 * Tell whether the block group item @bgi stored in @node is a temporary one.
 *
 * A block group is considered temporary when both conditions hold:
 * 1) It is empty, i.e. its used bytes are 0.
 * 2) Its profile differs from the profile requested for its type
 *    (@data_profile for data and mixed data+metadata block groups,
 *    @meta_profile for metadata, @sys_profile for system).
 *
 * The size of the block group is not taken into account.  Block groups of
 * any other type combination are never reported as temporary.
 */
static bool is_temp_block_group(struct extent_buffer *node,
				struct btrfs_block_group_item *bgi,
				u64 data_profile, u64 meta_profile,
				u64 sys_profile)
{
	const u64 bg_flags = btrfs_block_group_flags(node, bgi);
	u64 wanted_profile;

	/* A block group that holds anything is never temporary. */
	if (btrfs_block_group_used(node, bgi) != 0)
		return false;

	/* Pick the requested profile matching the type of this block group. */
	switch (bg_flags & BTRFS_BLOCK_GROUP_TYPE_MASK) {
	case BTRFS_BLOCK_GROUP_DATA:
	case BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA:
		wanted_profile = data_profile;
		break;
	case BTRFS_BLOCK_GROUP_METADATA:
		wanted_profile = meta_profile;
		break;
	case BTRFS_BLOCK_GROUP_SYSTEM:
		wanted_profile = sys_profile;
		break;
	default:
		return false;
	}

	return (bg_flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) !=
	       (wanted_profile & BTRFS_BLOCK_GROUP_PROFILE_MASK);
}
/* Note: if current is a block group, it will skip it anyway */
static int next_block_group(struct btrfs_root *root,
struct btrfs_path *path)
{
struct btrfs_key key;
int ret = 0;
while (1) {
ret = btrfs_next_item(root, path);
if (ret)
goto out;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
goto out;
}
out:
return ret;
}
/*
 * Remove the temporary block groups.
 *
 * Walk the block group items of the block group root, remove every block
 * group is_temp_block_group() reports as temporary for the given profiles and
 * subtract its length from the matching counter of @alloc.  All of it is done
 * in a single transaction that is committed before returning, also when the
 * walk stopped on an error.
 *
 * Returns 0 on success.  Otherwise the first error is returned: the one that
 * stopped the walk, or the commit error if the walk itself succeeded.
 */
static int cleanup_temp_chunks(struct btrfs_fs_info *fs_info,
			       struct mkfs_allocation *alloc,
			       u64 data_profile, u64 meta_profile,
			       u64 sys_profile)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_block_group_item *bgi;
	struct btrfs_root *root = btrfs_block_group_root(fs_info);
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path path = { 0 };
	int commit_ret;
	int ret = 0;

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		errno = -ret;
		error_msg(ERROR_MSG_START_TRANS, "%m");
		return ret;
	}

	key.objectid = 0;
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	key.offset = 0;

	while (1) {
		/*
		 * as the rest of the loop may modify the tree, we need to
		 * start a new search each time.
		 */
		ret = btrfs_search_slot(trans, root, &key, &path, 0, 0);
		if (ret < 0)
			goto out;
		/* Don't pollute ret for >0 case */
		if (ret > 0)
			ret = 0;

		btrfs_item_key_to_cpu(path.nodes[0], &found_key,
				      path.slots[0]);
		if (found_key.objectid < key.objectid)
			goto out;
		if (found_key.type != BTRFS_BLOCK_GROUP_ITEM_KEY) {
			ret = next_block_group(root, &path);
			if (ret < 0)
				goto out;
			/* No block group item left, the walk is complete. */
			if (ret > 0) {
				ret = 0;
				goto out;
			}
			btrfs_item_key_to_cpu(path.nodes[0], &found_key,
					      path.slots[0]);
		}

		bgi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				     struct btrfs_block_group_item);
		if (is_temp_block_group(path.nodes[0], bgi,
					data_profile, meta_profile,
					sys_profile)) {
			u64 flags = btrfs_block_group_flags(path.nodes[0], bgi);

			ret = btrfs_remove_block_group(trans,
					found_key.objectid, found_key.offset);
			if (ret < 0)
				goto out;

			/* The key offset of a block group item is its length. */
			switch (flags & BTRFS_BLOCK_GROUP_TYPE_MASK) {
			case BTRFS_BLOCK_GROUP_DATA:
				alloc->data -= found_key.offset;
				break;
			case BTRFS_BLOCK_GROUP_METADATA:
				alloc->metadata -= found_key.offset;
				break;
			case BTRFS_BLOCK_GROUP_SYSTEM:
				alloc->system -= found_key.offset;
				break;
			case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
				alloc->mixed -= found_key.offset;
				break;
			default:
				break;
			}
		}
		btrfs_release_path(&path);
		/* Continue right behind the block group just handled. */
		key.objectid = found_key.objectid + found_key.offset;
	}
out:
	btrfs_release_path(&path);
	/*
	 * Keep the commit result separate so that a successful commit cannot
	 * hide the error that made us stop the walk.
	 */
	commit_ret = btrfs_commit_transaction(trans, root);
	if (commit_ret) {
		errno = -commit_ret;
		error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
		if (!ret)
			ret = commit_ret;
	}
	return ret;
}
/*
 * Rebuild @allocation from the block groups of @fs_info.
 *
 * --rootdir may allocate new chunks which are not accounted in @allocation,
 * so all counters are reset and the length of every block group is added to
 * the counter matching its type: mixed when both the data and the metadata
 * bit are set, then data, then metadata, everything else counts as system.
 */
static void update_chunk_allocation(struct btrfs_fs_info *fs_info,
				    struct mkfs_allocation *allocation)
{
	const u64 mixed_flag = BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA;
	struct btrfs_block_group *bg;
	u64 next_start;

	allocation->mixed = 0;
	allocation->data = 0;
	allocation->metadata = 0;
	allocation->system = 0;

	/* Each lookup starts right behind the end of the previous block group. */
	for (next_start = 0;
	     (bg = btrfs_lookup_first_block_group(fs_info, next_start)) != NULL;
	     next_start = bg->start + bg->length) {
		if ((bg->flags & mixed_flag) == mixed_flag) {
			allocation->mixed += bg->length;
			continue;
		}
		if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
			allocation->data += bg->length;
			continue;
		}
		if (bg->flags & BTRFS_BLOCK_GROUP_METADATA) {
			allocation->metadata += bg->length;
			continue;
		}
		allocation->system += bg->length;
	}
}
/*
 * Create the data reloc tree together with its top level directory inode.
 *
 * Steps: create the tree, store the directory objectid in its root item,
 * register the root in the fs root cache, insert the directory INODE_ITEM
 * and its ".." INODE_REF and finally set the link count of the inode to 1.
 *
 * Returns 0 on success.  On any failure the transaction is aborted and a
 * negative errno is returned.
 */
static int create_data_reloc_tree(struct btrfs_trans_handle *trans)
{
	const u64 dir_ino = BTRFS_FIRST_FREE_OBJECTID;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path path = { 0 };
	struct btrfs_key key = {
		.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID,
		.type = BTRFS_ROOT_ITEM_KEY,
	};
	struct btrfs_inode_item *inode_item;
	struct btrfs_root *reloc_root;
	int ret;

	reloc_root = btrfs_create_tree(trans, fs_info, &key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}

	/* A freshly created tree has dirid 0, point it to our directory. */
	btrfs_set_root_dirid(&reloc_root->root_item, dir_ino);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, &reloc_root->root_item);
	if (ret < 0)
		goto abort;

	/* Cache the root so that close_ctree() can clean it up. */
	ret = rb_insert(&fs_info->fs_root_tree, &reloc_root->rb_node,
			btrfs_fs_roots_compare_roots);
	if (ret < 0)
		goto abort;

	/* The directory inode: INODE_ITEM first ... */
	ret = btrfs_new_inode(trans, reloc_root, dir_ino, 0755 | S_IFDIR);
	if (ret < 0)
		goto abort;

	/* ... then the INODE_REF pointing to itself. */
	ret = btrfs_insert_inode_ref(trans, reloc_root, "..", strlen(".."),
				     dir_ino, dir_ino, 0);
	if (ret < 0)
		goto abort;

	/* Look the inode item up again to update its nlink. */
	key.objectid = dir_ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(trans, reloc_root, &key, &path, 0, 1);
	if (ret) {
		/* The item inserted above must exist, not found is an error. */
		if (ret > 0)
			ret = -ENOENT;
		btrfs_release_path(&path);
		goto abort;
	}

	inode_item = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_inode_item);
	btrfs_set_inode_nlink(path.nodes[0], inode_item, 1);
	btrfs_mark_buffer_dirty(path.nodes[0]);
	btrfs_release_path(&path);
	return 0;

abort:
	btrfs_abort_transaction(trans, ret);
	return ret;
}
/*
 * Record @subvol_id_cpu under the uuid tree key derived from @uuid and @type.
 *
 * If no item exists for the key, a new one holding a single little-endian
 * subvolume id is inserted.  If one already exists (-EEXIST from the
 * insertion), it is extended and the new id is written at its end.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the error
 * reported by btrfs_insert_empty_item().
 */
static int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid,
			       u8 type, u64 subvol_id_cpu)
{
	struct btrfs_root *uuid_root = trans->fs_info->uuid_root;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	unsigned long write_offset;
	__le64 subvol_id_le;
	bool append;
	int slot;
	int ret;

	btrfs_uuid_to_key(uuid, type, &key);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
				      sizeof(subvol_id_le));
	append = (ret == -EEXIST);
	if (ret < 0 && !append) {
		warning(
		"inserting uuid item failed (0x%016llx, 0x%016llx) type %u: %d",
			key.objectid, key.offset, type, ret);
		goto out;
	}

	/* An item of this type already exists: make room for one more id */
	if (append)
		btrfs_extend_item(path, sizeof(subvol_id_le));

	leaf = path->nodes[0];
	slot = path->slots[0];
	write_offset = btrfs_item_ptr_offset(leaf, slot);
	/* When appending, the new id occupies the last slot of the item */
	if (append)
		write_offset += btrfs_item_size(leaf, slot) -
				sizeof(subvol_id_le);

	subvol_id_le = cpu_to_le64(subvol_id_cpu);
	write_extent_buffer(leaf, &subvol_id_le, write_offset,
			    sizeof(subvol_id_le));
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
btrfs-progs: Create uuid tree with proper contents Commit 2a496a5b8b74 ("btrfs-progs: mkfs: precreate the uuid tree") creates uuid tree at mkfs time. However it doesn't populate uuid tree correctly nor creates an empty root. It uses create_tree(), which just copies the content of fs root, containing a meaningless INODE_ITEM: v4.15 mkfs (no uuid tree creation) + kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30572544 items 1 free space 16250 generation 7 owner UUID_TREE leaf 30572544 flags 0x1(WRITTEN) backref revision 1 fs uuid 33ecddef-fc86-481a-93ce-846b01c11376 chunk uuid 9e58f646-b0da-43ca-9c7d-8bbe3e120246 item 0 key (0x92457c59d31491be UUID_KEY_SUBVOL 0xef908b5e79aa76a1) itemoff 16275 itemsize 8 subvol_id 5 v4.19.1 mkfs (incorrect one), no kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30507008 items 2 free space 16061 generation 4 owner UUID_TREE leaf 30507008 flags 0x1(WRITTEN) backref revision 1 fs uuid 162f5333-9b5d-4217-877c-ddaeaa79398e chunk uuid 7bc2c5c6-a6d2-4eec-a513-142b549c6541 item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160 generation 3 transid 0 size 0 nbytes 16384 block group 0 mode 40755 links 1 uid 0 gid 0 rdev 0 sequence 0 flags 0x0(none) item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12 index 0 namelen 2 name: .. 
This patchset will fix it by populating the uuid tree properly: (NOTE: due to tree-checker, kernel doesn't accept empty uuid tree, so we can only fix it by populating uuid tree correctly) With this patchset, no kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30507008 items 1 free space 16250 generation 4 owner UUID_TREE leaf 30507008 flags 0x1(WRITTEN) backref revision 1 fs uuid ae53079e-dbbc-409b-a565-5326c7b27731 chunk uuid b5fb1bea-f20d-4af1-80f8-6ca3f0038d67 item 0 key (0x334ba6b032d89c07 UUID_KEY_SUBVOL 0x86cde09cb78bcca0) itemoff 16275 itemsize 8 subvol_id 5 For the kernel, except that tree-checker needs a non-empty uuid tree, both of the above behaviors won't cause problems, but it's always better to keep a good standardized behavior. Fixes: 2a496a5b8b74 ("btrfs-progs: mkfs: precreate the uuid tree") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-03 07:32:21 +00:00
static int create_uuid_tree(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root;
struct btrfs_key key = {
.objectid = BTRFS_UUID_TREE_OBJECTID,
.type = BTRFS_ROOT_ITEM_KEY,
};
btrfs-progs: Create uuid tree with proper contents Commit 2a496a5b8b74 ("btrfs-progs: mkfs: precreate the uuid tree") creates uuid tree at mkfs time. However it doesn't populate uuid tree correctly nor creates an empty root. It uses create_tree(), which just copies the content of fs root, containing a meaningless INODE_ITEM: v4.15 mkfs (no uuid tree creation) + kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30572544 items 1 free space 16250 generation 7 owner UUID_TREE leaf 30572544 flags 0x1(WRITTEN) backref revision 1 fs uuid 33ecddef-fc86-481a-93ce-846b01c11376 chunk uuid 9e58f646-b0da-43ca-9c7d-8bbe3e120246 item 0 key (0x92457c59d31491be UUID_KEY_SUBVOL 0xef908b5e79aa76a1) itemoff 16275 itemsize 8 subvol_id 5 v4.19.1 mkfs (incorrect one), no kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30507008 items 2 free space 16061 generation 4 owner UUID_TREE leaf 30507008 flags 0x1(WRITTEN) backref revision 1 fs uuid 162f5333-9b5d-4217-877c-ddaeaa79398e chunk uuid 7bc2c5c6-a6d2-4eec-a513-142b549c6541 item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160 generation 3 transid 0 size 0 nbytes 16384 block group 0 mode 40755 links 1 uid 0 gid 0 rdev 0 sequence 0 flags 0x0(none) item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12 index 0 namelen 2 name: .. 
This patchset will fix it by populuating uuid tree properly: (NOTE: due to tree-checker, kernel doesn't accept empty uuid tree, so we can only fix it by populating uuid tree correctly) With this patchset, no kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30507008 items 1 free space 16250 generation 4 owner UUID_TREE leaf 30507008 flags 0x1(WRITTEN) backref revision 1 fs uuid ae53079e-dbbc-409b-a565-5326c7b27731 chunk uuid b5fb1bea-f20d-4af1-80f8-6ca3f0038d67 item 0 key (0x334ba6b032d89c07 UUID_KEY_SUBVOL 0x86cde09cb78bcca0) itemoff 16275 itemsize 8 subvol_id 5 For kernel, except tree-checker needs an non-empty uuid tree, both of the above behavior won't cause problem, but it's always better to keep a good standardized behavior. Fixes: 2a496a5b8b74 ("btrfs-progs: mkfs: precreate the uuid tree") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-03 07:32:21 +00:00
int ret = 0;
UASSERT(fs_info->uuid_root == NULL);
root = btrfs_create_tree(trans, fs_info, &key);
btrfs-progs: Create uuid tree with proper contents Commit 2a496a5b8b74 ("btrfs-progs: mkfs: precreate the uuid tree") creates uuid tree at mkfs time. However it doesn't populate uuid tree correctly nor creates an empty root. It uses create_tree(), which just copies the content of fs root, containing a meaningless INODE_ITEM: v4.15 mkfs (no uuid tree creation) + kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30572544 items 1 free space 16250 generation 7 owner UUID_TREE leaf 30572544 flags 0x1(WRITTEN) backref revision 1 fs uuid 33ecddef-fc86-481a-93ce-846b01c11376 chunk uuid 9e58f646-b0da-43ca-9c7d-8bbe3e120246 item 0 key (0x92457c59d31491be UUID_KEY_SUBVOL 0xef908b5e79aa76a1) itemoff 16275 itemsize 8 subvol_id 5 v4.19.1 mkfs (incorrect one), no kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30507008 items 2 free space 16061 generation 4 owner UUID_TREE leaf 30507008 flags 0x1(WRITTEN) backref revision 1 fs uuid 162f5333-9b5d-4217-877c-ddaeaa79398e chunk uuid 7bc2c5c6-a6d2-4eec-a513-142b549c6541 item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160 generation 3 transid 0 size 0 nbytes 16384 block group 0 mode 40755 links 1 uid 0 gid 0 rdev 0 sequence 0 flags 0x0(none) item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12 index 0 namelen 2 name: .. 
This patchset will fix it by populuating uuid tree properly: (NOTE: due to tree-checker, kernel doesn't accept empty uuid tree, so we can only fix it by populating uuid tree correctly) With this patchset, no kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30507008 items 1 free space 16250 generation 4 owner UUID_TREE leaf 30507008 flags 0x1(WRITTEN) backref revision 1 fs uuid ae53079e-dbbc-409b-a565-5326c7b27731 chunk uuid b5fb1bea-f20d-4af1-80f8-6ca3f0038d67 item 0 key (0x334ba6b032d89c07 UUID_KEY_SUBVOL 0x86cde09cb78bcca0) itemoff 16275 itemsize 8 subvol_id 5 For kernel, except tree-checker needs an non-empty uuid tree, both of the above behavior won't cause problem, but it's always better to keep a good standardized behavior. Fixes: 2a496a5b8b74 ("btrfs-progs: mkfs: precreate the uuid tree") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-03 07:32:21 +00:00
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out;
}
add_root_to_dirty_list(root);
fs_info->uuid_root = root;
ret = btrfs_uuid_tree_add(trans, fs_info->fs_root->root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
fs_info->fs_root->root_key.objectid);
if (ret < 0)
btrfs_abort_transaction(trans, ret);
out:
return ret;
}
/*
 * Create one additional global root of kind @objectid with index @root_id
 * (stored in the key offset) and register it among the global roots.
 *
 * Returns 0 on success.  On failure the transaction is aborted and the error
 * is returned.
 */
static int create_global_root(struct btrfs_trans_handle *trans, u64 objectid,
			      int root_id)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *new_root;
	struct btrfs_key root_key = {
		.objectid = objectid,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = root_id,
	};
	int ret;

	new_root = btrfs_create_tree(trans, fs_info, &root_key);
	if (IS_ERR(new_root))
		ret = PTR_ERR(new_root);
	else
		ret = btrfs_global_root_insert(fs_info, new_root);

	if (ret)
		btrfs_abort_transaction(trans, ret);
	return ret;
}
static int create_global_roots(struct btrfs_trans_handle *trans,
int nr_global_roots)
{
int ret, i;
for (i = 1; i < nr_global_roots; i++) {
ret = create_global_root(trans, BTRFS_EXTENT_TREE_OBJECTID, i);
if (ret)
return ret;
ret = create_global_root(trans, BTRFS_CSUM_TREE_OBJECTID, i);
if (ret)
return ret;
ret = create_global_root(trans, BTRFS_FREE_SPACE_TREE_OBJECTID, i);
if (ret)
return ret;
}
btrfs_set_super_nr_global_roots(trans->fs_info->super_copy,
nr_global_roots);
return 0;
}
/*
 * Insert empty QGROUP_INFO and QGROUP_LIMIT items for @qgroupid into the
 * quota tree.
 *
 * Only level 0 qgroups are accepted; anything else is rejected with -ENOTTY.
 *
 * Returns the result of the last btrfs_insert_empty_item() call, i.e. a
 * negative value as soon as one insertion fails.
 */
static int insert_qgroup_items(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info,
			       u64 qgroupid)
{
	/* Item types to create, in insertion order, with their payload size */
	const struct {
		u8 type;
		u32 size;
	} items[] = {
		{ BTRFS_QGROUP_INFO_KEY, sizeof(struct btrfs_qgroup_info_item) },
		{ BTRFS_QGROUP_LIMIT_KEY, sizeof(struct btrfs_qgroup_limit_item) },
	};
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path path = { 0 };
	struct btrfs_key key;
	int ret = 0;
	int i;

	if (qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT) {
		error("qgroup level other than 0 is not supported yet");
		return -ENOTTY;
	}

	for (i = 0; i < (int)(sizeof(items) / sizeof(items[0])); i++) {
		key.objectid = 0;
		key.type = items[i].type;
		key.offset = qgroupid;

		ret = btrfs_insert_empty_item(trans, quota_root, &path, &key,
					      items[i].size);
		btrfs_release_path(&path);
		if (ret < 0)
			return ret;
	}
	return ret;
}
/*
 * Workaround for squota so the enable_gen can be properly used.
 *
 * Starts a transaction on the fs root, looks up the INODE_ITEM of
 * BTRFS_FIRST_FREE_OBJECTID with COW enabled, marks the leaf holding it
 * dirty and commits.
 *
 * Returns 0 on success or a negative errno.  A missing inode item is reported
 * as -ENOENT.  On lookup failure the transaction is aborted.
 */
static int touch_root_subvol(struct btrfs_fs_info *fs_info)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_key key = {
		.objectid = BTRFS_FIRST_FREE_OBJECTID,
		.type = BTRFS_INODE_ITEM_KEY,
		.offset = 0,
	};
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_path path = { 0 };
	int ret;

	trans = btrfs_start_transaction(fs_info->fs_root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		errno = -ret;
		error_msg(ERROR_MSG_START_TRANS, "%m");
		return ret;
	}

	ret = btrfs_search_slot(trans, fs_info->fs_root, &key, &path, 0, 1);
	/*
	 * A positive result means the key was not found.  Turn it into a real
	 * error code so callers never see (and the transaction is never
	 * aborted with) a positive value.
	 */
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto fail;

	leaf = path.nodes[0];
	slot = path.slots[0];
	btrfs_item_key_to_cpu(leaf, &key, slot);
	btrfs_mark_buffer_dirty(leaf);

	ret = btrfs_commit_transaction(trans, fs_info->fs_root);
	/* Drop the path on both the success and the failure path */
	btrfs_release_path(&path);
	if (ret < 0) {
		errno = -ret;
		error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
		return ret;
	}
	return 0;

fail:
	btrfs_abort_transaction(trans, ret);
	btrfs_release_path(&path);
	return ret;
}
/*
 * Create the quota tree and fill in its initial contents.
 *
 * In one transaction this creates the quota root, inserts the qgroup status
 * item (ON, plus SIMPLE_MODE with enable_gen for simple quotas, or
 * INCONSISTENT otherwise) and the info/limit items for the default fs tree.
 * After the commit, simple quota additionally dirties the root subvolume, and
 * finally the qgroup numbers are computed and written via the qgroup-verify
 * infrastructure.
 *
 * Returns 0 on success or an error code.  Failures inside the first
 * transaction abort it.
 */
static int setup_quota_root(struct btrfs_fs_info *fs_info)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_qgroup_status_item *qsi;
	struct btrfs_root *quota_root;
	struct btrfs_path path = { 0 };
	struct btrfs_key key;
	int qgroup_repaired = 0;
	bool simple = btrfs_fs_incompat(fs_info, SIMPLE_QUOTA);
	int flags;
	int ret;

	/* One to modify tree root, one for quota root */
	trans = btrfs_start_transaction(fs_info->tree_root, 2);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		errno = -ret;
		error_msg(ERROR_MSG_START_TRANS, "%m");
		return ret;
	}

	ret = btrfs_create_root(trans, fs_info, BTRFS_QUOTA_TREE_OBJECTID);
	if (ret < 0) {
		error("failed to create quota root: %d (%m)", ret);
		goto fail;
	}
	quota_root = fs_info->quota_root;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;
	ret = btrfs_insert_empty_item(trans, quota_root, &path, &key,
				      sizeof(*qsi));
	if (ret < 0) {
		error("failed to insert qgroup status item: %d (%m)", ret);
		/* Do not leave the path populated on the error path */
		btrfs_release_path(&path);
		goto fail;
	}

	qsi = btrfs_item_ptr(path.nodes[0], path.slots[0],
			     struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_generation(path.nodes[0], qsi, trans->transid);
	btrfs_set_qgroup_status_rescan(path.nodes[0], qsi, 0);
	flags = BTRFS_QGROUP_STATUS_FLAG_ON;
	if (simple) {
		btrfs_set_qgroup_status_enable_gen(path.nodes[0], qsi,
						   trans->transid);
		flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
	} else {
		flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	}
	btrfs_set_qgroup_status_version(path.nodes[0], qsi, 1);
	btrfs_set_qgroup_status_flags(path.nodes[0], qsi, flags);
	btrfs_release_path(&path);

	/* Currently mkfs will only create one subvolume */
	ret = insert_qgroup_items(trans, fs_info, BTRFS_FS_TREE_OBJECTID);
	if (ret < 0) {
		error("failed to insert qgroup items: %d (%m)", ret);
		goto fail;
	}

	ret = btrfs_commit_transaction(trans, fs_info->tree_root);
	if (ret < 0) {
		errno = -ret;
		error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
		return ret;
	}

	/*
	 * From here on the transaction handle has been consumed by the commit
	 * above, so errors must be returned directly and must not go through
	 * the "fail" label, which would touch the stale handle.
	 */

	/* Hack to count the default subvol metadata by dirtying it */
	if (simple) {
		ret = touch_root_subvol(fs_info);
		if (ret) {
			error("failed to touch root dir for simple quota accounting %d (%m)", ret);
			return ret;
		}
	}

	/*
	 * Qgroup is setup but with wrong info, use qgroup-verify
	 * infrastructure to repair them.  (Just acts as offline rescan)
	 */
	ret = qgroup_verify_all(fs_info);
	if (ret < 0) {
		error("qgroup rescan failed: %d (%m)", ret);
		return ret;
	}
	ret = repair_qgroups(fs_info, &qgroup_repaired, true);
	if (ret < 0)
		error("failed to fill qgroup info: %d (%m)", ret);
	return ret;

fail:
	btrfs_abort_transaction(trans, ret);
	return ret;
}
static int setup_raid_stripe_tree_root(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *stripe_root;
struct btrfs_key key = {
.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID,
.type = BTRFS_ROOT_ITEM_KEY,
};
int ret;
trans = btrfs_start_transaction(fs_info->tree_root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
errno = -ret;
error_msg(ERROR_MSG_START_TRANS, "%m");
return ret;
}
stripe_root = btrfs_create_tree(trans, fs_info, &key);
if (IS_ERR(stripe_root)) {
ret = PTR_ERR(stripe_root);
btrfs_abort_transaction(trans, ret);
return ret;
}
fs_info->stripe_root = stripe_root;
add_root_to_dirty_list(stripe_root);
ret = btrfs_commit_transaction(trans, fs_info->tree_root);
if (ret) {
errno = -ret;
error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
return ret;
}
return 0;
}
/* Thread callback for device preparation */
static void *prepare_one_device(void *ctx)
{
struct prepare_device_progress *prepare_ctx = ctx;
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs there is a chance that no /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If at the close() of phase A udev event is triggered, while doing udev scan we go into phase C (but before the new valid super blocks written), udev would only see no superblock or invalid superblock. Then phase C finished, udev resumes its inotify listening, but at this time mkfs is finished, while udev only sees the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and close all the fds until close_ctree() is called. By this, although we may still have race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks. 
To compensate the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd, if we go O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
prepare_ctx->fd = open(prepare_ctx->file, opt_oflags);
if (prepare_ctx->fd < 0) {
error("unable to open %s: %m", prepare_ctx->file);
prepare_ctx->ret = -errno;
return NULL;
}
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs there is a chance that no /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If at the close() of phase A udev event is triggered, while doing udev scan we go into phase C (but before the new valid super blocks written), udev would only see no superblock or invalid superblock. Then phase C finished, udev resumes its inotify listening, but at this time mkfs is finished, while udev only sees the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and close all the fds until close_ctree() is called. By this, although we may still have race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks. 
To compensate the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd, if we go O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
prepare_ctx->ret = btrfs_prepare_device(prepare_ctx->fd,
prepare_ctx->file,
&prepare_ctx->dev_byte_count,
prepare_ctx->byte_count,
(bconf.verbose ? PREP_DEVICE_VERBOSE : 0) |
(opt_zero_end ? PREP_DEVICE_ZERO_END : 0) |
(opt_discard ? PREP_DEVICE_DISCARD : 0) |
(opt_zoned ? PREP_DEVICE_ZONED : 0));
return NULL;
}
int BOX_MAIN(mkfs)(int argc, char **argv)
2007-03-21 00:35:03 +00:00
{
char *file;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
struct btrfs_trans_handle *trans;
struct open_ctree_args oca = { 0 };
int ret = 0;
int close_ret;
int i;
bool ssd = false;
bool shrink_rootdir = false;
u64 source_dir_size = 0;
u64 min_dev_size;
u64 shrink_size;
int device_count = 0;
int saved_optind;
pthread_t *t_prepare = NULL;
struct prepare_device_progress *prepare_ctx = NULL;
struct mkfs_allocation allocation = { 0 };
struct btrfs_mkfs_config mkfs_cfg;
/* Options */
bool force_overwrite = false;
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
struct btrfs_mkfs_features features = btrfs_mkfs_default_features;
enum btrfs_csum_type csum_type = BTRFS_CSUM_TYPE_CRC32;
char fs_uuid[BTRFS_UUID_UNPARSED_SIZE] = { 0 };
char dev_uuid[BTRFS_UUID_UNPARSED_SIZE] = { 0 };
u32 nodesize = 0;
bool nodesize_forced = false;
u32 sectorsize = 0;
u32 stripesize = 4096;
u64 metadata_profile = 0;
bool metadata_profile_set = false;
u64 data_profile = 0;
bool data_profile_set = false;
u64 byte_count = 0;
u64 dev_byte_count = 0;
bool mixed = false;
char *label = NULL;
int nr_global_roots = sysconf(_SC_NPROCESSORS_ONLN);
char *source_dir = NULL;
cpu_detect_flags();
hash_init_accel();
btrfs_config_init();
btrfs-progs: mkfs: fix a stack over-flow when features string are too long [BUG] Even with chunk_objectid bug fixed, mkfs.btrfs can still caused stack overflow when enabling extent-tree-v2 feature (need experimental features enabled): # ./mkfs.btrfs -f -O extent-tree-v2 ~/test.img btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match NOTE: several default settings have changed in version 5.15, please make sure this does not affect your deployments: - DUP for metadata (-m dup) - enabled no-holes (-O no-holes) - enabled free-space-tree (-R free-space-tree) Label: (null) UUID: 205c61e7-f58e-4e8f-9dc2-38724f5c554b Node size: 16384 Sector size: 4096 Filesystem size: 512.00MiB Block group profiles: Data: single 8.00MiB Metadata: DUP 32.00MiB System: DUP 8.00MiB SSD detected: no Zoned device: no ================================================================= [... Skip full ASAN output ...] ==65655==ABORTING [CAUSE] For experimental build, we have unified feature output, but the old buffer size is only 64 bytes, which is too small to cover the new full feature string: extref, skinny-metadata, no-holes, free-space-tree, block-group-tree, extent-tree-v2 Above feature string is already 84 bytes, over the 64 on-stack memory size. 
This can also be proved by the ASAN output: ==65655==ERROR: AddressSanitizer: stack-buffer-overflow on address 0x7ffc4e03b1d0 at pc 0x7ff0fc05fafe bp 0x7ffc4e03ac60 sp 0x7ffc4e03a408 WRITE of size 17 at 0x7ffc4e03b1d0 thread T0 #0 0x7ff0fc05fafd in __interceptor_strcat /usr/src/debug/gcc/libsanitizer/asan/asan_interceptors.cpp:377 #1 0x55cdb7b06ca5 in parse_features_to_string common/fsfeatures.c:316 #2 0x55cdb7b06ce1 in btrfs_parse_fs_features_to_string common/fsfeatures.c:324 #3 0x55cdb7a37226 in main mkfs/main.c:1783 #4 0x7ff0fbe3c28f (/usr/lib/libc.so.6+0x2328f) #5 0x7ff0fbe3c349 in __libc_start_main (/usr/lib/libc.so.6+0x23349) #6 0x55cdb7a2cb34 in _start ../sysdeps/x86_64/start.S:115 [FIX] Introduce a new macro, BTRFS_FEATURE_STRING_BUF_SIZE, along with a new sanity check helper, btrfs_assert_feature_buf_size(). The problem is I can not find a build time method to verify BTRFS_FEATURE_STRING_BUF_SIZE is large enough to contain all feature names, thus have to go the runtime function to do the BUG_ON() to verify the macro size. Now the minimal buffer size for experimental build is 138 bytes, just bump it to 160 for future expansion. And if further features go beyond that number, mkfs.btrfs/btrfs-convert will immediately crash at that BUG_ON(), so we can definitely detect it. Reviewed-by: Anand Jain <anand.jain@oracle.com> Tested-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-07 12:03:01 +00:00
btrfs_assert_feature_buf_size();
2007-10-15 20:25:14 +00:00
while(1) {
int c;
enum {
GETOPT_VAL_SHRINK = GETOPT_VAL_FIRST,
GETOPT_VAL_CHECKSUM,
GETOPT_VAL_GLOBAL_ROOTS,
GETOPT_VAL_DEVICE_UUID,
};
static const struct option long_options[] = {
{ "byte-count", required_argument, NULL, 'b' },
{ "csum", required_argument, NULL,
GETOPT_VAL_CHECKSUM },
{ "checksum", required_argument, NULL,
GETOPT_VAL_CHECKSUM },
{ "force", no_argument, NULL, 'f' },
{ "leafsize", required_argument, NULL, 'l' },
{ "label", required_argument, NULL, 'L'},
{ "metadata", required_argument, NULL, 'm' },
{ "mixed", no_argument, NULL, 'M' },
{ "nodesize", required_argument, NULL, 'n' },
{ "sectorsize", required_argument, NULL, 's' },
{ "data", required_argument, NULL, 'd' },
{ "version", no_argument, NULL, 'V' },
{ "rootdir", required_argument, NULL, 'r' },
{ "nodiscard", no_argument, NULL, 'K' },
{ "features", required_argument, NULL, 'O' },
{ "runtime-features", required_argument, NULL, 'R' },
{ "uuid", required_argument, NULL, 'U' },
{ "device-uuid", required_argument, NULL,
GETOPT_VAL_DEVICE_UUID },
{ "quiet", 0, NULL, 'q' },
{ "verbose", 0, NULL, 'v' },
{ "shrink", no_argument, NULL, GETOPT_VAL_SHRINK },
#if EXPERIMENTAL
{ "param", required_argument, NULL, GETOPT_VAL_PARAM },
{ "num-global-roots", required_argument, NULL, GETOPT_VAL_GLOBAL_ROOTS },
#endif
{ "help", no_argument, NULL, GETOPT_VAL_HELP },
{ NULL, 0, NULL, 0}
};
c = getopt_long(argc, argv, "A:b:fl:n:s:m:d:L:R:O:r:U:VvMKq",
long_options, NULL);
2007-10-15 20:25:14 +00:00
if (c < 0)
break;
switch(c) {
case 'f':
force_overwrite = true;
break;
case 'd':
ret = parse_bg_profile(optarg, &data_profile);
if (ret) {
error("unknown data profile %s", optarg);
exit(1);
}
data_profile_set = true;
break;
2007-10-15 20:25:14 +00:00
case 'l':
/* Deprecated in 4.0 */
error("--leafsize has been removed in 6.0, use --nodesize");
ret = 1;
goto error;
case 'n':
nodesize = arg_strtou64_with_suffix(optarg);
nodesize_forced = true;
2007-10-15 20:25:14 +00:00
break;
case 'L':
free(label);
ret = strlen(optarg);
if (ret >= BTRFS_LABEL_SIZE) {
error("label %s is too long (max %d)",
optarg, BTRFS_LABEL_SIZE - 1);
exit(1);
}
label = strdup(optarg);
break;
case 'm':
ret = parse_bg_profile(optarg, &metadata_profile);
if (ret) {
error("unknown metadata profile %s", optarg);
exit(1);
}
metadata_profile_set = true;
break;
case 'M':
mixed = true;
break;
case 'O': {
char *orig = strdup(optarg);
char *tmp = orig;
tmp = btrfs_parse_fs_features(tmp, &features);
if (tmp) {
error("unrecognized filesystem feature '%s'",
tmp);
free(orig);
goto error;
}
free(orig);
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
if (features.runtime_flags &
BTRFS_FEATURE_RUNTIME_LIST_ALL) {
btrfs_list_all_fs_features(NULL);
goto success;
}
break;
}
case 'R': {
char *orig = strdup(optarg);
char *tmp = orig;
warning("runtime features are deprecated, use -O|--features instead");
tmp = btrfs_parse_runtime_features(tmp,
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
&features);
if (tmp) {
error("unrecognized runtime feature '%s'",
tmp);
free(orig);
goto error;
}
free(orig);
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
if (features.runtime_flags &
BTRFS_FEATURE_RUNTIME_LIST_ALL) {
btrfs_list_all_runtime_features(NULL);
goto success;
}
break;
}
2007-11-30 16:30:24 +00:00
case 's':
sectorsize = arg_strtou64_with_suffix(optarg);
2007-11-30 16:30:24 +00:00
break;
2008-03-24 19:04:49 +00:00
case 'b':
byte_count = arg_strtou64_with_suffix(optarg);
opt_zero_end = false;
2008-03-24 19:04:49 +00:00
break;
case 'v':
bconf_be_verbose();
break;
case 'V':
printf("mkfs.btrfs, part of %s\n",
PACKAGE_STRING);
goto success;
case 'r':
free(source_dir);
source_dir = strdup(optarg);
break;
case 'U':
strncpy_null(fs_uuid, optarg, BTRFS_UUID_UNPARSED_SIZE);
break;
case 'K':
opt_discard = false;
break;
case 'q':
bconf_be_quiet();
break;
case GETOPT_VAL_DEVICE_UUID:
strncpy_null(dev_uuid, optarg, BTRFS_UUID_UNPARSED_SIZE);
break;
case GETOPT_VAL_SHRINK:
shrink_rootdir = true;
break;
case GETOPT_VAL_CHECKSUM:
csum_type = parse_csum_type(optarg);
break;
case GETOPT_VAL_GLOBAL_ROOTS:
btrfs_warn_experimental("Feature: num-global-roots is part of exten-tree-v2");
nr_global_roots = (int)arg_strtou64(optarg);
break;
case GETOPT_VAL_PARAM:
bconf_save_param(optarg);
break;
case GETOPT_VAL_HELP:
2007-10-15 20:25:14 +00:00
default:
usage(&mkfs_cmd, c != GETOPT_VAL_HELP);
2007-10-15 20:25:14 +00:00
}
}
if (bconf.verbose) {
printf("%s\n", PACKAGE_STRING);
printf("See %s for more information.\n\n", PACKAGE_URL);
}
if (!sectorsize)
sectorsize = (u32)SZ_4K;
if (btrfs_check_sectorsize(sectorsize))
goto error;
if (!nodesize)
nodesize = max_t(u32, sectorsize, BTRFS_MKFS_DEFAULT_NODE_SIZE);
stripesize = sectorsize;
saved_optind = optind;
device_count = argc - optind;
if (device_count == 0)
usage(&mkfs_cmd, 1);
2008-03-24 19:04:49 +00:00
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
opt_zoned = !!(features.incompat_flags & BTRFS_FEATURE_INCOMPAT_ZONED);
if (source_dir && device_count > 1) {
error("the option -r is limited to a single device");
goto error;
}
if (shrink_rootdir && source_dir == NULL) {
error("the option --shrink must be used with --rootdir");
goto error;
}
if (*fs_uuid) {
uuid_t dummy_uuid;
if (uuid_parse(fs_uuid, dummy_uuid) != 0) {
error("could not parse UUID: %s", fs_uuid);
goto error;
}
/* We allow non-unique fsid for single device btrfs filesystem. */
if (device_count != 1 && !test_uuid_unique(fs_uuid)) {
error("non-unique UUID: %s", fs_uuid);
goto error;
}
}
Btrfs-progs: Do not force mixed block group creation unless '-M' option is specified When creating small Btrfs filesystem instances (i.e. filesystem size <= 1GiB), mkfs.btrfs fails if both sectorsize and nodesize are specified on the command line and sectorsize != nodesize, since mixed block groups involve both data and metadata blocks sharing the same block group. This is incorrect behavior when the '-M' option isn't specified on the command line. This commit makes the creation of mixed block groups optional, i.e. mixed block groups are created only when the -M option is specified on the command line. Since we now allow small filesystem instances with sectorsize != nodesize to be created, we can end up in the following situation, [root@localhost ~]# mkfs.btrfs -f -n 65536 /dev/loop0 btrfs-progs v3.19-rc2-405-g976307c See http://btrfs.wiki.kernel.org for more information. Performing full device TRIM (512.00MiB) ... Label: (null) UUID: 49fab72e-0c8b-466b-a3ca-d1bfe56475f0 Node size: 65536 Sector size: 4096 Filesystem size: 512.00MiB Block group profiles: Data: single 8.00MiB Metadata: DUP 40.00MiB System: DUP 12.00MiB SSD detected: no Incompat features: extref, skinny-metadata Number of devices: 1 Devices: ID SIZE PATH 1 512.00MiB /dev/loop0 [root@localhost ~]# mount /dev/loop0 /mnt/ mount: mount /dev/loop0 on /mnt failed: No space left on device The ENOSPC occurs during the creation of the UUID tree. This is because of things like large metadata block size, DUP mode used for metadata and global reservation consuming space. Also, large nodesize does not make sense on small filesystems, hence this should not be an issue. Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> Signed-off-by: David Sterba <dsterba@suse.com>
2015-10-14 17:39:37 +00:00
if (*dev_uuid) {
uuid_t dummy_uuid;
if (uuid_parse(dev_uuid, dummy_uuid) != 0) {
error("could not parse device UUID: %s", dev_uuid);
goto error;
}
/* We allow non-unique device uuid for single device filesystem. */
if (device_count != 1 && !test_uuid_unique(dev_uuid)) {
error("the option --device-uuid %s can be used only for a single device filesystem",
dev_uuid);
goto error;
}
}
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs, there is a chance that the /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such an IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, it re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If at the close() of phase A a udev event is triggered, and while doing the udev scan we go into phase C (but before the new valid super blocks are written), udev would see either no superblock or an invalid superblock. Then phase C finishes and udev resumes its inotify listening, but at this time mkfs is finished, while udev has only seen the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and do not close any of the fds until close_ctree() is called. This way, although we may still have a race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks. 
To compensate for the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd; if we go with O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
for (i = 0; i < device_count; i++) {
file = argv[optind++];
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs there is a chance that no /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If at the close() of phase A udev event is triggered, while doing udev scan we go into phase C (but before the new valid super blocks written), udev would only see no superblock or invalid superblock. Then phase C finished, udev resumes its inotify listening, but at this time mkfs is finished, while udev only sees the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and close all the fds until close_ctree() is called. By this, although we may still have race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks. 
To compensate the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd, if we go O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
if (source_dir && path_exists(file) == 0)
ret = 0;
else if (path_is_block_device(file) == 1)
ret = test_dev_for_mkfs(file, force_overwrite);
else
ret = test_status_for_mkfs(file, force_overwrite);
if (ret)
goto error;
}
optind = saved_optind;
device_count = argc - optind;
file = argv[optind++];
ssd = device_get_rotational(file);
if (opt_zoned) {
if (!zone_size(file)) {
error("zoned: %s: zone size undefined", file);
exit(1);
}
} else if (zoned_model(file) == ZONED_HOST_MANAGED) {
if (bconf.verbose)
printf(
"Zoned: %s: host-managed device detected, setting zoned feature\n",
file);
opt_zoned = true;
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
features.incompat_flags |= BTRFS_FEATURE_INCOMPAT_ZONED;
}
/*
* Set default profiles according to number of added devices.
* For mixed groups defaults are single/single.
*/
if (!mixed) {
u64 tmp;
if (!metadata_profile_set) {
if (device_count > 1)
tmp = BTRFS_MKFS_DEFAULT_META_MULTI_DEVICE;
else
tmp = BTRFS_MKFS_DEFAULT_META_ONE_DEVICE;
metadata_profile = tmp;
}
if (!data_profile_set) {
if (device_count > 1)
tmp = BTRFS_MKFS_DEFAULT_DATA_MULTI_DEVICE;
else
tmp = BTRFS_MKFS_DEFAULT_DATA_ONE_DEVICE;
data_profile = tmp;
}
} else {
if (metadata_profile_set || data_profile_set) {
if (metadata_profile != data_profile) {
error(
"with mixed block groups data and metadata profiles must be the same");
goto error;
}
}
if (!nodesize_forced)
nodesize = sectorsize;
}
/*
* FS features that can be set by other means than -O
* just set the bit here
*/
if (mixed)
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
features.incompat_flags |= BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS;
if ((data_profile | metadata_profile) & BTRFS_BLOCK_GROUP_RAID56_MASK) {
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
features.incompat_flags |= BTRFS_FEATURE_INCOMPAT_RAID56;
warning("RAID5/6 support has known problems is strongly discouraged\n"
"\t to be used besides testing or evaluation.\n");
}
if ((data_profile | metadata_profile) &
(BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)) {
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
features.incompat_flags |= BTRFS_FEATURE_INCOMPAT_RAID1C34;
}
/* Extent tree v2 comes with a set of mandatory features. */
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
if (features.incompat_flags & BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) {
features.incompat_flags |= BTRFS_FEATURE_INCOMPAT_NO_HOLES;
features.compat_ro_flags |=
BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |
BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID |
BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE;
if (!nr_global_roots) {
error("you must set a non-zero num-global-roots value");
exit(1);
}
}
/* Block group tree feature requires no-holes and free-space-tree. */
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merged "-O" and "-R" options, as they don't correctly represents btrfs features. But that commit caused the following bug during mkfs for experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for experimental branch. This can easily leads to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for above case, mkfs.btrfs believe it has set MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate features which doesn't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain above 3 flags. This also mean, things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's not longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL). 
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
if (features.compat_ro_flags & BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE &&
(!(features.incompat_flags & BTRFS_FEATURE_INCOMPAT_NO_HOLES) ||
!(features.compat_ro_flags & BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE))) {
error("block group tree requires no-holes and free-space-tree features");
exit(1);
}
if (opt_zoned) {
const int blkid_version = blkid_get_library_version(NULL, NULL);
if (source_dir) {
error("the option -r and zoned mode are incompatible");
exit(1);
}
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merge the "-O" and "-R" options, as they don't correctly represent btrfs features. But that commit caused the following bug during mkfs for experimental builds: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() returns a u64, and reuses the same u64 for both incompat and compat RO flags in the experimental branch. This can easily lead to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for the above case, mkfs.btrfs believes it has set the MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand: they are the corresponding flag of each feature. The last one, runtime_flag, is to cover features which don't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain the above 3 flags. This also means things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to static const structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's no longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to use an @allowed structure pointer (which can be NULL).
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
if (features.incompat_flags & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) {
error("cannot enable mixed-bg in zoned mode");
exit(1);
}
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merge the "-O" and "-R" options, as they don't correctly represent btrfs features. But that commit caused the following bug during mkfs for experimental builds: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() returns a u64, and reuses the same u64 for both incompat and compat RO flags in the experimental branch. This can easily lead to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for the above case, mkfs.btrfs believes it has set the MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand: they are the corresponding flag of each feature. The last one, runtime_flag, is to cover features which don't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain the above 3 flags. This also means things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to static const structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's no longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to use an @allowed structure pointer (which can be NULL).
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
if (features.incompat_flags & BTRFS_FEATURE_INCOMPAT_RAID56) {
error("cannot enable RAID5/6 in zoned mode");
exit(1);
}
if (blkid_version < 2380)
warning("libblkid < 2.38 does not support zoned mode's superblock location, update recommended");
}
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merge the "-O" and "-R" options, as they don't correctly represent btrfs features. But that commit caused the following bug during mkfs for experimental builds: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() returns a u64, and reuses the same u64 for both incompat and compat RO flags in the experimental branch. This can easily lead to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for the above case, mkfs.btrfs believes it has set the MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand: they are the corresponding flag of each feature. The last one, runtime_flag, is to cover features which don't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain the above 3 flags. This also means things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to static const structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's no longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to use an @allowed structure pointer (which can be NULL).
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
if (btrfs_check_nodesize(nodesize, sectorsize, &features))
goto error;
if (sectorsize < sizeof(struct btrfs_super_block)) {
error("sectorsize smaller than superblock: %u < %zu",
sectorsize, sizeof(struct btrfs_super_block));
goto error;
}
min_dev_size = btrfs_min_dev_size(nodesize, mixed,
opt_zoned ? zone_size(file) : 0,
metadata_profile, data_profile);
if (byte_count) {
byte_count = round_down(byte_count, sectorsize);
if (opt_zoned)
byte_count = round_down(byte_count, zone_size(file));
}
/*
* Enlarge the destination file or create a new one, using the size
* calculated from source dir.
*
* This must be done before minimal device size checks.
*/
if (source_dir) {
int oflags = O_RDWR;
struct stat statbuf;
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs, there is a chance that the /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such an IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, it re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, that's not the case; we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If a udev event is triggered at the close() of phase A, and we go into phase C while udev is doing its scan (but before the new valid super blocks are written), udev would see either no superblock or an invalid superblock. Then phase C finishes and udev resumes its inotify listening, but by this time mkfs has finished, so udev has only seen the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and keep all the fds open until close_ctree() is called. With this, although we may still have a race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks.
To compensate for the change, some extra cleanups are made: - Do not touch @device_count, which makes later prepare_ctx iteration much easier. - Remove the top-level @fd variable; instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs(), as test_dev_for_mkfs() would close the fd; if we go with O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
int fd;
if (path_exists(file) == 0)
oflags |= O_CREAT;
fd = open(file, oflags, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP |
S_IROTH);
if (fd < 0) {
error("unable to open %s: %m", file);
goto error;
}
ret = fstat(fd, &statbuf);
if (ret < 0) {
error("unable to stat %s: %m", file);
ret = -errno;
goto error;
}
/*
* Block_count not specified, use file/device size first.
* Or we will always use source_dir_size calculated for mkfs.
*/
if (!byte_count)
byte_count = round_down(device_get_partition_size_fd_stat(fd, &statbuf),
sectorsize);
source_dir_size = btrfs_mkfs_size_dir(source_dir, sectorsize,
min_dev_size, metadata_profile, data_profile);
UASSERT(IS_ALIGNED(source_dir_size, sectorsize));
if (byte_count < source_dir_size) {
btrfs-progs: mkfs: do not enlarge the target block device [BUG] When running mkfs.btrfs with --rootdir on a block device, and the source directory contains a sparse file whose size is larger than the block device size, mkfs.btrfs would fail: # lsblk /dev/test/test NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS test-test 253:0 0 10G 0 lvm # mkdir -p /tmp/output # truncate -s 20G /tmp/output/file # mkfs.btrfs -f --rootdir /tmp/output /dev/test/test # sudo mkfs.btrfs -f /dev/test/scratch1 --rootdir /tmp/output/ btrfs-progs v6.3.3 See https://btrfs.readthedocs.io for more information. ERROR: unable to zero the output file [CAUSE] mkfs.btrfs would try to zero out the target file according to the total size of the directory. However, the directory size is calculated using the file size, not the real bytes taken by the file, thus such a sparse file consisting only of holes would still count as 20G. Then we would use that 20G size to zero out the target file, but if the target file is a block device, we would fail as we cannot enlarge a block device. [FIX] When zeroing the file, we only enlarge it if the target is a regular file. Otherwise we warn about the size and continue. Please note that, since "mkfs.btrfs --rootdir" doesn't handle sparse files any differently from regular files, the above case would still fail due to ENOSPC, as we will write zeros into the target file inside the fs. Proper handling of sparse files would need a new series of patches. Issue: #653 Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-10-12 08:31:04 +00:00
if (S_ISREG(statbuf.st_mode)) {
byte_count = source_dir_size;
btrfs-progs: mkfs: do not enlarge the target block device [BUG] When running mkfs.btrfs with --rootdir on a block device, and the source directory contains a sparse file whose size is larger than the block device size, mkfs.btrfs would fail: # lsblk /dev/test/test NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS test-test 253:0 0 10G 0 lvm # mkdir -p /tmp/output # truncate -s 20G /tmp/output/file # mkfs.btrfs -f --rootdir /tmp/output /dev/test/test # sudo mkfs.btrfs -f /dev/test/scratch1 --rootdir /tmp/output/ btrfs-progs v6.3.3 See https://btrfs.readthedocs.io for more information. ERROR: unable to zero the output file [CAUSE] mkfs.btrfs would try to zero out the target file according to the total size of the directory. However, the directory size is calculated using the file size, not the real bytes taken by the file, thus such a sparse file consisting only of holes would still count as 20G. Then we would use that 20G size to zero out the target file, but if the target file is a block device, we would fail as we cannot enlarge a block device. [FIX] When zeroing the file, we only enlarge it if the target is a regular file. Otherwise we warn about the size and continue. Please note that, since "mkfs.btrfs --rootdir" doesn't handle sparse files any differently from regular files, the above case would still fail due to ENOSPC, as we will write zeros into the target file inside the fs. Proper handling of sparse files would need a new series of patches. Issue: #653 Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-10-12 08:31:04 +00:00
} else {
warning(
"the target device %llu (%s) is smaller than the calculated source directory size %llu (%s), mkfs may fail",
byte_count, pretty_size(byte_count),
btrfs-progs: mkfs: do not enlarge the target block device [BUG] When running mkfs.btrfs with --rootdir on a block device, and the source directory contains a sparse file whose size is larger than the block device size, mkfs.btrfs would fail: # lsblk /dev/test/test NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS test-test 253:0 0 10G 0 lvm # mkdir -p /tmp/output # truncate -s 20G /tmp/output/file # mkfs.btrfs -f --rootdir /tmp/output /dev/test/test # sudo mkfs.btrfs -f /dev/test/scratch1 --rootdir /tmp/output/ btrfs-progs v6.3.3 See https://btrfs.readthedocs.io for more information. ERROR: unable to zero the output file [CAUSE] mkfs.btrfs would try to zero out the target file according to the total size of the directory. However, the directory size is calculated using the file size, not the real bytes taken by the file, thus such a sparse file consisting only of holes would still count as 20G. Then we would use that 20G size to zero out the target file, but if the target file is a block device, we would fail as we cannot enlarge a block device. [FIX] When zeroing the file, we only enlarge it if the target is a regular file. Otherwise we warn about the size and continue. Please note that, since "mkfs.btrfs --rootdir" doesn't handle sparse files any differently from regular files, the above case would still fail due to ENOSPC, as we will write zeros into the target file inside the fs. Proper handling of sparse files would need a new series of patches. Issue: #653 Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-10-12 08:31:04 +00:00
source_dir_size, pretty_size(source_dir_size));
}
}
ret = zero_output_file(fd, byte_count);
if (ret) {
error("unable to zero the output file");
close(fd);
goto error;
}
/* our "device" is the new image file */
dev_byte_count = byte_count;
close(fd);
}
/* Check device/byte_count after the nodesize is determined */
if (byte_count && byte_count < min_dev_size) {
error("size %llu is too small to make a usable filesystem", byte_count);
error("minimum size for a %sbtrfs filesystem is %llu",
opt_zoned ? "zoned mode " : "", min_dev_size);
goto error;
}
for (i = saved_optind; i < saved_optind + device_count; i++) {
char *path;
path = argv[i];
ret = test_minimum_size(path, min_dev_size);
if (ret < 0) {
error("failed to check size for %s: %m", path);
goto error;
}
if (ret > 0) {
error("'%s' is too small to make a usable filesystem",
path);
error("minimum size for each btrfs device is %llu",
min_dev_size);
goto error;
}
}
ret = test_num_disk_vs_raid(metadata_profile, data_profile,
device_count, mixed, ssd);
if (ret)
goto error;
if (opt_zoned && device_count) {
switch (data_profile & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case BTRFS_BLOCK_GROUP_DUP:
case BTRFS_BLOCK_GROUP_RAID1:
case BTRFS_BLOCK_GROUP_RAID1C3:
case BTRFS_BLOCK_GROUP_RAID1C4:
case BTRFS_BLOCK_GROUP_RAID0:
case BTRFS_BLOCK_GROUP_RAID10:
#if EXPERIMENTAL
features.incompat_flags |= BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE;
#endif
break;
default:
break;
}
}
if (opt_zoned) {
u64 metadata = BTRFS_BLOCK_GROUP_METADATA | metadata_profile;
u64 data = BTRFS_BLOCK_GROUP_DATA | data_profile;
bool rst = false;
if (features.incompat_flags & BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE)
rst = true;
if (!zoned_profile_supported(metadata, rst) ||
!zoned_profile_supported(data, rst)) {
error("zoned mode does not yet support the selected RAID profiles");
goto error;
}
}
t_prepare = calloc(device_count, sizeof(*t_prepare));
prepare_ctx = calloc(device_count, sizeof(*prepare_ctx));
if (!t_prepare || !prepare_ctx) {
error_msg(ERROR_MSG_MEMORY, "thread for preparing devices");
goto error;
}
opt_oflags = O_RDWR;
for (i = 0; i < device_count; i++) {
if (opt_zoned &&
zoned_model(argv[optind + i - 1]) == ZONED_HOST_MANAGED) {
opt_oflags |= O_DIRECT;
break;
}
}
/* Start threads */
for (i = 0; i < device_count; i++) {
prepare_ctx[i].file = argv[optind + i - 1];
prepare_ctx[i].byte_count = byte_count;
prepare_ctx[i].dev_byte_count = byte_count;
ret = pthread_create(&t_prepare[i], NULL, prepare_one_device,
&prepare_ctx[i]);
if (ret) {
errno = -ret;
error("failed to create thread for prepare device %s: %m",
prepare_ctx[i].file);
goto error;
}
}
/* Wait for threads */
for (i = 0; i < device_count; i++)
pthread_join(t_prepare[i], NULL);
ret = prepare_ctx[0].ret;
if (ret) {
error("unable prepare device: %s", prepare_ctx[0].file);
goto error;
}
dev_byte_count = prepare_ctx[0].dev_byte_count;
if (byte_count && byte_count > dev_byte_count) {
error("%s is smaller than requested size, expected %llu, found %llu",
file, byte_count, dev_byte_count);
goto error;
2007-03-21 00:35:03 +00:00
}
if (btrfs_bg_type_to_tolerated_failures(metadata_profile) <
btrfs_bg_type_to_tolerated_failures(data_profile))
warning("metadata has lower redundancy than data!\n");
if (bconf.verbose) {
printf("NOTE: several default settings have changed in version 5.15, please make sure\n");
printf(" this does not affect your deployments:\n");
printf(" - DUP for metadata (-m dup)\n");
printf(" - enabled no-holes (-O no-holes)\n");
printf(" - enabled free-space-tree (-R free-space-tree)\n");
printf("\n");
}
mkfs_cfg.label = label;
memcpy(mkfs_cfg.fs_uuid, fs_uuid, sizeof(mkfs_cfg.fs_uuid));
memcpy(mkfs_cfg.dev_uuid, dev_uuid, sizeof(mkfs_cfg.dev_uuid));
mkfs_cfg.num_bytes = dev_byte_count;
mkfs_cfg.nodesize = nodesize;
mkfs_cfg.sectorsize = sectorsize;
mkfs_cfg.stripesize = stripesize;
mkfs_cfg.features = features;
mkfs_cfg.csum_type = csum_type;
mkfs_cfg.leaf_data_size = __BTRFS_LEAF_DATA_SIZE(nodesize);
if (opt_zoned)
mkfs_cfg.zone_size = zone_size(file);
else
mkfs_cfg.zone_size = 0;
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs, there is a chance that the /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such an IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, it re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, that's not the case; we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If a udev event is triggered at the close() of phase A, and we go into phase C while udev is doing its scan (but before the new valid super blocks are written), udev would see either no superblock or an invalid superblock. Then phase C finishes and udev resumes its inotify listening, but by this time mkfs has finished, so udev has only seen the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and keep all the fds open until close_ctree() is called. With this, although we may still have a race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks.
To compensate for the change, some extra cleanups are made: - Do not touch @device_count, which makes later prepare_ctx iteration much easier. - Remove the top-level @fd variable; instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs(), as test_dev_for_mkfs() would close the fd; if we go with O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
ret = make_btrfs(prepare_ctx[0].fd, &mkfs_cfg);
2007-03-21 00:35:03 +00:00
if (ret) {
errno = -ret;
error("error during mkfs: %m");
goto error;
2007-03-21 00:35:03 +00:00
}
oca.filename = file;
oca.flags = OPEN_CTREE_WRITES | OPEN_CTREE_TEMPORARY_SUPER;
fs_info = open_ctree_fs_info(&oca);
if (!fs_info) {
error("open ctree failed");
goto error;
}
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs, there is a chance that the /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such an IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, it re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, that's not the case; we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If a udev event is triggered at the close() of phase A, and we go into phase C while udev is doing its scan (but before the new valid super blocks are written), udev would see either no superblock or an invalid superblock. Then phase C finishes and udev resumes its inotify listening, but by this time mkfs has finished, so udev has only seen the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and keep all the fds open until close_ctree() is called. With this, although we may still have a race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks.
To compensate for the change, some extra cleanups are made: - Do not touch @device_count, which makes later prepare_ctx iteration much easier. - Remove the top-level @fd variable; instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs(), as test_dev_for_mkfs() would close the fd; if we go with O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
root = fs_info->fs_root;
Revert "btrfs-progs: mkfs: create only desired block groups for single device" This reverts commit 5f8232e5c8f0b0de0ef426274911385b0e877392. The reverted commit causes a regression: $ mkfs.btrfs -f /dev/sda6 $ btrfsck /dev/sda6 Checking filesystem on /dev/sda6 UUID: 2ebb483c-1986-4610-802a-c6f3e6ab4b76 checking extents Chunk[256, 228, 0]: length(4194304), offset(0), type(2) mismatch with block group[0, 192, 4194304]: offset(4194304), objectid(0), flags(34) Chunk[256, 228, 4194304]: length(8388608), offset(4194304), type(4) mismatch with block group[4194304, 192, 8388608]: offset(8388608), objectid(4194304), flags(36) Block group[0, 4194304] (flags = 34) didn't find the relative chunk. Block group[4194304, 8388608] (flags = 36) didn't find the relative chunk. ...... The commit has the following bugs causing the problem. 1) A typo omits meta/data_profile for alloc_chunk: meta/data_profile is only added when allocating a block group, but not the chunk. 2) The type of the first system chunk is impossible to modify yet. The type of the first chunk and its stripe is hard coded into the make_btrfs() function. So even if we try to modify the type of the block group, we are unable to change the type of the first chunk, causing the chunk type mismatch problem. The 1st bug can be fixed quite easily but the second cannot. The good news is, the last patch "btrfs-progs: mkfs: Cleanup temporary chunk to avoid strange balance behavior." from my patchset can handle it quite well alone. So just revert the patch. A new bug fix for btrfsck (err is 0 even if the chunk/extent tree is corrupted) and new test cases for mkfs will follow soon. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: David Sterba <dsterba@suse.com>
2015-07-14 02:13:01 +00:00
ret = create_metadata_block_groups(root, mixed, &allocation);
if (ret) {
error("failed to create default block groups: %d", ret);
goto error;
}
if (features.incompat_flags & BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE) {
ret = setup_raid_stripe_tree_root(fs_info);
if (ret < 0) {
error("failed to initialize raid-stripe-tree: %d (%m)", ret);
goto out;
}
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
errno = -PTR_ERR(trans);
error_msg(ERROR_MSG_START_TRANS, "%m");
goto error;
}
Revert "btrfs-progs: mkfs: create only desired block groups for single device" This reverts commit 5f8232e5c8f0b0de0ef426274911385b0e877392. The reverted commit causes a regression: $ mkfs.btrfs -f /dev/sda6 $ btrfsck /dev/sda6 Checking filesystem on /dev/sda6 UUID: 2ebb483c-1986-4610-802a-c6f3e6ab4b76 checking extents Chunk[256, 228, 0]: length(4194304), offset(0), type(2) mismatch with block group[0, 192, 4194304]: offset(4194304), objectid(0), flags(34) Chunk[256, 228, 4194304]: length(8388608), offset(4194304), type(4) mismatch with block group[4194304, 192, 8388608]: offset(8388608), objectid(4194304), flags(36) Block group[0, 4194304] (flags = 34) didn't find the relative chunk. Block group[4194304, 8388608] (flags = 36) didn't find the relative chunk. ...... The commit has the following bugs causing the problem. 1) A typo omits meta/data_profile for alloc_chunk: meta/data_profile is only added when allocating a block group, but not the chunk. 2) The type of the first system chunk is impossible to modify yet. The type of the first chunk and its stripe is hard coded into the make_btrfs() function. So even if we try to modify the type of the block group, we are unable to change the type of the first chunk, causing the chunk type mismatch problem. The 1st bug can be fixed quite easily but the second cannot. The good news is, the last patch "btrfs-progs: mkfs: Cleanup temporary chunk to avoid strange balance behavior." from my patchset can handle it quite well alone. So just revert the patch. A new bug fix for btrfsck (err is 0 even if the chunk/extent tree is corrupted) and new test cases for mkfs will follow soon. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: David Sterba <dsterba@suse.com>
2015-07-14 02:13:01 +00:00
ret = create_data_block_groups(trans, root, mixed, &allocation);
if (ret) {
error("failed to create default data block groups: %d", ret);
goto error;
}
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merge the "-O" and "-R" options, as they don't correctly represent btrfs features. But that commit caused the following bug during mkfs for experimental builds: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() returns a u64, and reuses the same u64 for both incompat and compat RO flags in the experimental branch. This can easily lead to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for the above case, mkfs.btrfs believes it has set the MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand: they are the corresponding flag of each feature. The last one, runtime_flag, is to cover features which don't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain the above 3 flags. This also means things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to static const structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's no longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to use an @allowed structure pointer (which can be NULL).
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
if (features.incompat_flags & BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) {
ret = create_global_roots(trans, nr_global_roots);
if (ret) {
error("failed to create global roots: %d", ret);
goto error;
}
}
ret = make_root_dir(trans, root);
2007-03-21 15:13:29 +00:00
if (ret) {
error("failed to setup the root directory: %d", ret);
goto error;
2007-03-21 15:13:29 +00:00
}
ret = btrfs_commit_transaction(trans, root);
if (ret) {
errno = -ret;
error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
goto out;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
errno = -PTR_ERR(trans);
error_msg(ERROR_MSG_START_TRANS, "%m");
goto error;
}
if (device_count == 0)
Revert "btrfs-progs: mkfs: create only desired block groups for single device" This reverts commit 5f8232e5c8f0b0de0ef426274911385b0e877392. This commit causes a regression: $ mkfs.btrfs -f /dev/sda6 $ btrfsck /dev/sda6 Checking filesystem on /dev/sda6 UUID: 2ebb483c-1986-4610-802a-c6f3e6ab4b76 checking extents Chunk[256, 228, 0]: length(4194304), offset(0), type(2) mismatch with block group[0, 192, 4194304]: offset(4194304), objectid(0), flags(34) Chunk[256, 228, 4194304]: length(8388608), offset(4194304), type(4) mismatch with block group[4194304, 192, 8388608]: offset(8388608), objectid(4194304), flags(36) Block group[0, 4194304] (flags = 34) didn't find the relative chunk. Block group[4194304, 8388608] (flags = 36) didn't find the relative chunk. ...... The commit has the following bugs causing the problem. 1) Typo forgets to add meta/data_profile for alloc_chunk. Only meta/data_profile is added to allocate a block group, but not chunk. 2) Type for the first system chunk is impossible to modify yet. The type for the first chunk and its stripe is hard coded into make_btrfs() function. So even if we try to modify the type of the block group, we are unable to change the type of the first chunk. Causing the chunk type mismatch problem. The 1st bug can be fixed quite easily but the second is not. The good news is, the last patch "btrfs-progs: mkfs: Cleanup temporary chunk to avoid strange balance behavior." from my patchset can handle it quite well alone. So just revert the patch. New bug fix for btrfsck(err is 0 even if chunk/extent tree is corrupted) and new test cases for mkfs will follow soon. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: David Sterba <dsterba@suse.com>
2015-07-14 02:13:01 +00:00
goto raid_groups;
2008-03-24 19:04:49 +00:00
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs there is a chance that the /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, it re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If at the close() of phase A udev event is triggered, while doing udev scan we go into phase C (but before the new valid super blocks are written), udev would only see no superblock or invalid superblock. Then phase C finishes, udev resumes its inotify listening, but at this time mkfs is finished, while udev only sees the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and keep all the fds open until close_ctree() is called. By this, although we may still have race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks. 
To compensate the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd, if we go O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
for (i = 1; i < device_count; i++) {
ret = btrfs_device_already_in_root(root, prepare_ctx[i].fd,
BTRFS_SUPER_INFO_OFFSET);
if (ret) {
error("skipping duplicate device %s in the filesystem",
file);
continue;
}
dev_byte_count = prepare_ctx[i].dev_byte_count;
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs there is a chance that the /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, it re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If at the close() of phase A udev event is triggered, while doing udev scan we go into phase C (but before the new valid super blocks are written), udev would only see no superblock or invalid superblock. Then phase C finishes, udev resumes its inotify listening, but at this time mkfs is finished, while udev only sees the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and keep all the fds open until close_ctree() is called. By this, although we may still have race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks. 
To compensate the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd, if we go O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
if (prepare_ctx[i].ret) {
errno = -prepare_ctx[i].ret;
error("unable to prepare device %s: %m", prepare_ctx[i].file);
goto error;
}
2008-03-24 19:04:49 +00:00
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs there is a chance that the /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, it re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If at the close() of phase A udev event is triggered, while doing udev scan we go into phase C (but before the new valid super blocks are written), udev would only see no superblock or invalid superblock. Then phase C finishes, udev resumes its inotify listening, but at this time mkfs is finished, while udev only sees the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and keep all the fds open until close_ctree() is called. By this, although we may still have race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks. 
To compensate the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd, if we go O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
ret = btrfs_add_to_fsid(trans, root, prepare_ctx[i].fd,
prepare_ctx[i].file, dev_byte_count,
2008-03-24 19:04:49 +00:00
sectorsize, sectorsize, sectorsize);
if (ret) {
btrfs-progs: mkfs: keep file descriptors open during whole time [BUG] There is an internal bug report that, after mkfs.btrfs there is a chance that the /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, it re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If at the close() of phase A udev event is triggered, while doing udev scan we go into phase C (but before the new valid super blocks are written), udev would only see no superblock or invalid superblock. Then phase C finishes, udev resumes its inotify listening, but at this time mkfs is finished, while udev only sees the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and keep all the fds open until close_ctree() is called. By this, although we may still have race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks. 
To compensate the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd, if we go O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
error("unable to add %s to filesystem: %d",
prepare_ctx[i].file, ret);
goto error;
}
if (bconf.verbose >= 2) {
struct btrfs_device *device;
device = container_of(fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
printf("adding device %s id %llu\n", file, device->devid);
}
2008-03-24 19:04:49 +00:00
}
if (opt_zoned)
btrfs_get_dev_zone_info_all_devices(fs_info);
Revert "btrfs-progs: mkfs: create only desired block groups for single device" This reverts commit 5f8232e5c8f0b0de0ef426274911385b0e877392. This commit causes a regression: $ mkfs.btrfs -f /dev/sda6 $ btrfsck /dev/sda6 Checking filesystem on /dev/sda6 UUID: 2ebb483c-1986-4610-802a-c6f3e6ab4b76 checking extents Chunk[256, 228, 0]: length(4194304), offset(0), type(2) mismatch with block group[0, 192, 4194304]: offset(4194304), objectid(0), flags(34) Chunk[256, 228, 4194304]: length(8388608), offset(4194304), type(4) mismatch with block group[4194304, 192, 8388608]: offset(8388608), objectid(4194304), flags(36) Block group[0, 4194304] (flags = 34) didn't find the relative chunk. Block group[4194304, 8388608] (flags = 36) didn't find the relative chunk. ...... The commit has the following bugs causing the problem. 1) Typo forgets to add meta/data_profile for alloc_chunk. Only meta/data_profile is added to allocate a block group, but not chunk. 2) Type for the first system chunk is impossible to modify yet. The type for the first chunk and its stripe is hard coded into make_btrfs() function. So even if we try to modify the type of the block group, we are unable to change the type of the first chunk. Causing the chunk type mismatch problem. The 1st bug can be fixed quite easily but the second is not. The good news is, the last patch "btrfs-progs: mkfs: Cleanup temporary chunk to avoid strange balance behavior." from my patchset can handle it quite well alone. So just revert the patch. New bug fix for btrfsck(err is 0 even if chunk/extent tree is corrupted) and new test cases for mkfs will follow soon. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: David Sterba <dsterba@suse.com>
2015-07-14 02:13:01 +00:00
raid_groups:
ret = create_raid_groups(trans, root, data_profile,
metadata_profile, mixed, &allocation);
if (ret) {
error("unable to create raid groups: %d", ret);
goto out;
}
btrfs-progs: mkfs: recow all tree blocks properly [BUG] Since btrfs-progs v5.14, mkfs.btrfs no longer cleans up the temporary SINGLE metadata chunks if "-R free-space-tree" is specified: $ mkfs.btrfs -f -R free-space-tree -m dup -d dup /dev/test/test $ btrfs ins dump-tree -t chunk /dev/test/test | grep "type METADATA" length 8388608 owner 2 stripe_len 65536 type METADATA length 268435456 owner 2 stripe_len 65536 type METADATA|DUP [CAUSE] Since commit 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time"), free space tree is created when the temporary btrfs image is created. This behavior itself has no problem at all. The problem happens when "-m DUP -d DUP" (or other profiles) is specified. This makes btrfs to create extra chunks, enlarging free space tree so that it can be as high as level 1. During mkfs, we rely on recow_roots() to re-COW all tree blocks to the newly allocated chunks. But __recow_root() can only handle tree root at level 0, as it forces root node to be COWed, not bothering the children leaves/nodes. This makes part of the free space cache tree still live on the old temporary chunks, leaving later cleanup_temp_chunks() unable to delete temporary SINGLE chunks. [FIX] Rework __recow_root() to do a proper COW of the whole tree. But above rework is not enough, as if a free space tree block is allocated during current transaction, but before new chunks added. Then the reworked __recow_root() can't COW it, as btrfs_search_slot() won't COW a tree block allocated in current transaction. So this patch will also commit current transaction before calling recow_roots(), to force us to re-cow all tree blocks. This shouldn't be a problem, as at the time of calling, we should have less than a dozen tree blocks, thus there won't be a performance impact. 
Reported-by: FireFish5000 <firefish5000@gmail.com> Fixes: 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-11 12:06:49 +00:00
/*
* Commit current transaction so we can COW all existing tree blocks
* to newly created raid groups.
* As currently we use btrfs_search_slot() to COW tree blocks in
* recow_roots(), if a tree block is already modified in current trans,
* it won't be re-COWed, thus it will stay in temporary chunks.
*/
ret = btrfs_commit_transaction(trans, root);
if (ret) {
errno = -ret;
error_msg(ERROR_MSG_COMMIT_TRANS, "before recowing trees: %m");
btrfs-progs: mkfs: recow all tree blocks properly [BUG] Since btrfs-progs v5.14, mkfs.btrfs no longer cleans up the temporary SINGLE metadata chunks if "-R free-space-tree" is specified: $ mkfs.btrfs -f -R free-space-tree -m dup -d dup /dev/test/test $ btrfs ins dump-tree -t chunk /dev/test/test | grep "type METADATA" length 8388608 owner 2 stripe_len 65536 type METADATA length 268435456 owner 2 stripe_len 65536 type METADATA|DUP [CAUSE] Since commit 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time"), free space tree is created when the temporary btrfs image is created. This behavior itself has no problem at all. The problem happens when "-m DUP -d DUP" (or other profiles) is specified. This makes btrfs to create extra chunks, enlarging free space tree so that it can be as high as level 1. During mkfs, we rely on recow_roots() to re-COW all tree blocks to the newly allocated chunks. But __recow_root() can only handle tree root at level 0, as it forces root node to be COWed, not bothering the children leaves/nodes. This makes part of the free space cache tree still live on the old temporary chunks, leaving later cleanup_temp_chunks() unable to delete temporary SINGLE chunks. [FIX] Rework __recow_root() to do a proper COW of the whole tree. But above rework is not enough, as if a free space tree block is allocated during current transaction, but before new chunks added. Then the reworked __recow_root() can't COW it, as btrfs_search_slot() won't COW a tree block allocated in current transaction. So this patch will also commit current transaction before calling recow_roots(), to force us to re-cow all tree blocks. This shouldn't be a problem, as at the time of calling, we should have less than a dozen tree blocks, thus there won't be a performance impact. 
Reported-by: FireFish5000 <firefish5000@gmail.com> Fixes: 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-11 12:06:49 +00:00
goto out;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
errno = -PTR_ERR(trans);
error_msg(ERROR_MSG_START_TRANS, "%m");
btrfs-progs: mkfs: recow all tree blocks properly [BUG] Since btrfs-progs v5.14, mkfs.btrfs no longer cleans up the temporary SINGLE metadata chunks if "-R free-space-tree" is specified: $ mkfs.btrfs -f -R free-space-tree -m dup -d dup /dev/test/test $ btrfs ins dump-tree -t chunk /dev/test/test | grep "type METADATA" length 8388608 owner 2 stripe_len 65536 type METADATA length 268435456 owner 2 stripe_len 65536 type METADATA|DUP [CAUSE] Since commit 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time"), free space tree is created when the temporary btrfs image is created. This behavior itself has no problem at all. The problem happens when "-m DUP -d DUP" (or other profiles) is specified. This makes btrfs to create extra chunks, enlarging free space tree so that it can be as high as level 1. During mkfs, we rely on recow_roots() to re-COW all tree blocks to the newly allocated chunks. But __recow_root() can only handle tree root at level 0, as it forces root node to be COWed, not bothering the children leaves/nodes. This makes part of the free space cache tree still live on the old temporary chunks, leaving later cleanup_temp_chunks() unable to delete temporary SINGLE chunks. [FIX] Rework __recow_root() to do a proper COW of the whole tree. But above rework is not enough, as if a free space tree block is allocated during current transaction, but before new chunks added. Then the reworked __recow_root() can't COW it, as btrfs_search_slot() won't COW a tree block allocated in current transaction. So this patch will also commit current transaction before calling recow_roots(), to force us to re-cow all tree blocks. This shouldn't be a problem, as at the time of calling, we should have less than a dozen tree blocks, thus there won't be a performance impact. 
Reported-by: FireFish5000 <firefish5000@gmail.com> Fixes: 4b6cf2a3eb78 ("btrfs-progs: mkfs: generate free space tree at make_btrfs() time") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-11 12:06:49 +00:00
goto error;
}
/* COW all tree blocks to newly created chunks */
ret = recow_roots(trans, root);
if (ret) {
errno = -ret;
error("unable to COW tree blocks to new profiles: %m");
goto out;
}
ret = create_data_reloc_tree(trans);
if (ret) {
error("unable to create data reloc tree: %d", ret);
goto out;
}
btrfs-progs: Create uuid tree with proper contents Commit 2a496a5b8b74 ("btrfs-progs: mkfs: precreate the uuid tree") creates uuid tree at mkfs time. However it doesn't populate uuid tree correctly nor creates an empty root. It uses create_tree(), which just copies the content of fs root, containing a meaningless INODE_ITEM: v4.15 mkfs (no uuid tree creation) + kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30572544 items 1 free space 16250 generation 7 owner UUID_TREE leaf 30572544 flags 0x1(WRITTEN) backref revision 1 fs uuid 33ecddef-fc86-481a-93ce-846b01c11376 chunk uuid 9e58f646-b0da-43ca-9c7d-8bbe3e120246 item 0 key (0x92457c59d31491be UUID_KEY_SUBVOL 0xef908b5e79aa76a1) itemoff 16275 itemsize 8 subvol_id 5 v4.19.1 mkfs (incorrect one), no kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30507008 items 2 free space 16061 generation 4 owner UUID_TREE leaf 30507008 flags 0x1(WRITTEN) backref revision 1 fs uuid 162f5333-9b5d-4217-877c-ddaeaa79398e chunk uuid 7bc2c5c6-a6d2-4eec-a513-142b549c6541 item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160 generation 3 transid 0 size 0 nbytes 16384 block group 0 mode 40755 links 1 uid 0 gid 0 rdev 0 sequence 0 flags 0x0(none) item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12 index 0 namelen 2 name: .. 
This patchset will fix it by populating uuid tree properly: (NOTE: due to tree-checker, kernel doesn't accept empty uuid tree, so we can only fix it by populating uuid tree correctly) With this patchset, no kernel mount: uuid tree key (UUID_TREE ROOT_ITEM 0) leaf 30507008 items 1 free space 16250 generation 4 owner UUID_TREE leaf 30507008 flags 0x1(WRITTEN) backref revision 1 fs uuid ae53079e-dbbc-409b-a565-5326c7b27731 chunk uuid b5fb1bea-f20d-4af1-80f8-6ca3f0038d67 item 0 key (0x334ba6b032d89c07 UUID_KEY_SUBVOL 0x86cde09cb78bcca0) itemoff 16275 itemsize 8 subvol_id 5 For kernel, except tree-checker needs a non-empty uuid tree, both of the above behaviors won't cause problems, but it's always better to keep a good standardized behavior. Fixes: 2a496a5b8b74 ("btrfs-progs: mkfs: precreate the uuid tree") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-03 07:32:21 +00:00
ret = create_uuid_tree(trans);
if (ret)
warning(
"unable to create uuid tree, will be created after mount: %d", ret);
ret = btrfs_commit_transaction(trans, root);
if (ret) {
errno = -ret;
error_msg(ERROR_MSG_START_TRANS, "%m");
goto out;
}
ret = cleanup_temp_chunks(fs_info, &allocation, data_profile,
metadata_profile, metadata_profile);
if (ret < 0) {
error("failed to cleanup temporary chunks: %d", ret);
goto out;
}
if (source_dir) {
pr_verbose(LOG_DEFAULT, "Rootdir from: %s\n", source_dir);
ret = btrfs_mkfs_fill_dir(source_dir, root);
if (ret) {
error("error while filling filesystem: %d", ret);
goto out;
}
if (shrink_rootdir) {
pr_verbose(LOG_DEFAULT, " Shrink: yes\n");
ret = btrfs_mkfs_shrink_fs(fs_info, &shrink_size,
shrink_rootdir);
if (ret < 0) {
error("error while shrinking filesystem: %d",
ret);
goto out;
}
} else {
pr_verbose(LOG_DEFAULT, " Shrink: no\n");
}
}
if (features.runtime_flags & BTRFS_FEATURE_RUNTIME_QUOTA ||
features.incompat_flags & BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) {
ret = setup_quota_root(fs_info);
if (ret < 0) {
error("failed to initialize quota: %d (%m)", ret);
goto out;
}
}
if (bconf.verbose) {
btrfs-progs: mkfs: fix a stack over-flow when features string are too long [BUG] Even with chunk_objectid bug fixed, mkfs.btrfs can still cause a stack overflow when enabling extent-tree-v2 feature (need experimental features enabled): # ./mkfs.btrfs -f -O extent-tree-v2 ~/test.img btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match NOTE: several default settings have changed in version 5.15, please make sure this does not affect your deployments: - DUP for metadata (-m dup) - enabled no-holes (-O no-holes) - enabled free-space-tree (-R free-space-tree) Label: (null) UUID: 205c61e7-f58e-4e8f-9dc2-38724f5c554b Node size: 16384 Sector size: 4096 Filesystem size: 512.00MiB Block group profiles: Data: single 8.00MiB Metadata: DUP 32.00MiB System: DUP 8.00MiB SSD detected: no Zoned device: no ================================================================= [... Skip full ASAN output ...] ==65655==ABORTING [CAUSE] For experimental build, we have unified feature output, but the old buffer size is only 64 bytes, which is too small to cover the new full feature string: extref, skinny-metadata, no-holes, free-space-tree, block-group-tree, extent-tree-v2 Above feature string is already 84 bytes, over the 64 on-stack memory size. 
This can also be proved by the ASAN output: ==65655==ERROR: AddressSanitizer: stack-buffer-overflow on address 0x7ffc4e03b1d0 at pc 0x7ff0fc05fafe bp 0x7ffc4e03ac60 sp 0x7ffc4e03a408 WRITE of size 17 at 0x7ffc4e03b1d0 thread T0 #0 0x7ff0fc05fafd in __interceptor_strcat /usr/src/debug/gcc/libsanitizer/asan/asan_interceptors.cpp:377 #1 0x55cdb7b06ca5 in parse_features_to_string common/fsfeatures.c:316 #2 0x55cdb7b06ce1 in btrfs_parse_fs_features_to_string common/fsfeatures.c:324 #3 0x55cdb7a37226 in main mkfs/main.c:1783 #4 0x7ff0fbe3c28f (/usr/lib/libc.so.6+0x2328f) #5 0x7ff0fbe3c349 in __libc_start_main (/usr/lib/libc.so.6+0x23349) #6 0x55cdb7a2cb34 in _start ../sysdeps/x86_64/start.S:115 [FIX] Introduce a new macro, BTRFS_FEATURE_STRING_BUF_SIZE, along with a new sanity check helper, btrfs_assert_feature_buf_size(). The problem is I can not find a build time method to verify BTRFS_FEATURE_STRING_BUF_SIZE is large enough to contain all feature names, thus have to go the runtime function to do the BUG_ON() to verify the macro size. Now the minimal buffer size for experimental build is 138 bytes, just bump it to 160 for future expansion. And if further features go beyond that number, mkfs.btrfs/btrfs-convert will immediately crash at that BUG_ON(), so we can definitely detect it. Reviewed-by: Anand Jain <anand.jain@oracle.com> Tested-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-07 12:03:01 +00:00
char features_buf[BTRFS_FEATURE_STRING_BUF_SIZE];
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fields printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switch is passed, the output is more verbose; if the '-q' switch is passed, only the errors are printed. Below is an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
update_chunk_allocation(fs_info, &allocation);
printf("Label: %s\n", label);
printf("UUID: %s\n", mkfs_cfg.fs_uuid);
if (dev_uuid[0] != 0)
printf("Device UUID: %s\n", mkfs_cfg.dev_uuid);
printf("Node size: %u\n", nodesize);
printf("Sector size: %u\t(CPU page size: %lu)\n",
sectorsize, sysconf(_SC_PAGESIZE));
printf("Filesystem size: %s\n",
pretty_size(btrfs_super_total_bytes(fs_info->super_copy)));
printf("Block group profiles:\n");
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fields printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switch is passed, the output is more verbose; if the '-q' switch is passed, only the errors are printed. Below is an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
if (allocation.data)
printf(" Data: %-8s %16s\n",
btrfs_group_profile_str(data_profile),
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fields printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switch is passed, the output is more verbose; if the '-q' switch is passed, only the errors are printed. Below is an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
pretty_size(allocation.data));
if (allocation.metadata)
printf(" Metadata: %-8s %16s\n",
btrfs_group_profile_str(metadata_profile),
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fields printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switch is passed, the output is more verbose; if the '-q' switch is passed, only the errors are printed. Below is an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
pretty_size(allocation.metadata));
if (allocation.mixed)
printf(" Data+Metadata: %-8s %16s\n",
btrfs_group_profile_str(data_profile),
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fields printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switch is passed, the output is more verbose; if the '-q' switch is passed, only the errors are printed. Below is an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
pretty_size(allocation.mixed));
printf(" System: %-8s %16s\n",
btrfs_group_profile_str(metadata_profile),
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fields printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switch is passed, the output is more verbose; if the '-q' switch is passed, only the errors are printed. Below is an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
pretty_size(allocation.system));
printf("SSD detected: %s\n", ssd ? "yes" : "no");
printf("Zoned device: %s\n", opt_zoned ? "yes" : "no");
if (opt_zoned)
printf(" Zone size: %s\n",
pretty_size(fs_info->zone_size));
btrfs-progs: fsfeatures: properly merge -O and -R options [BUG] Commit "btrfs-progs: prepare merging compat feature lists" tries to merge "-O" and "-R" options, as they don't correctly represent btrfs features. But that commit caused the following bug during mkfs for an experimental build: $ mkfs.btrfs -f -O block-group-tree /dev/nvme0n1 btrfs-progs v5.19.1 See http://btrfs.wiki.kernel.org for more information. ERROR: superblock magic doesn't match ERROR: illegal nodesize 16384 (not equal to 4096 for mixed block group) [CAUSE] Currently btrfs_parse_fs_features() will return a u64, and reuse the same u64 for both incompat and compat RO flags for the experimental branch. This can easily lead to conflicts, as BTRFS_FEATURE_INCOMPAT_MIXED_BLOCK_GROUP and BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE both share the same bit (1 << 2). Thus for the above case, mkfs.btrfs believes it has set the MIXED_BLOCK_GROUP feature, but what we really want is BLOCK_GROUP_TREE. [FIX] Instead of incorrectly re-using the same bits in btrfs_feature, split the old flags into 3 flags: - incompat_flag - compat_ro_flag - runtime_flag The first two flags are easy to understand, the corresponding flag of each feature. The last runtime_flag is to compensate for features which don't have any on-disk flag set, like QUOTA and LIST_ALL. And since we're no longer using a single u64 as features, we have to introduce a new structure, btrfs_mkfs_features, to contain the above 3 flags. This also means that things like default mkfs features must be converted to use the new structure, thus those old macros are all converted to const static structures: - BTRFS_MKFS_DEFAULT_FEATURES + BTRFS_MKFS_DEFAULT_RUNTIME_FEATURES -> btrfs_mkfs_default_features - BTRFS_CONVERT_ALLOWED_FEATURES -> btrfs_convert_allowed_features And since we're using a structure, it's no longer as easy to implement a disallowed mask. Thus functions with @mask_disallowed are all changed to using an @allowed structure pointer (which can be NULL).
Finally if we have experimental features enabled, all features can be specified by -O options, and we can output a unified feature list, instead of the old split ones. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-05 01:48:07 +00:00
btrfs_parse_fs_features_to_string(features_buf, &features);
printf("Features: %s\n", features_buf);
printf("Checksum: %s\n",
btrfs_super_csum_name(mkfs_cfg.csum_type));
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fields printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switch is passed, the output is more verbose; if the '-q' switch is passed, only the errors are printed. Below is an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
list_all_devices(root, opt_zoned);
if (mkfs_cfg.csum_type == BTRFS_CSUM_TYPE_SHA256) {
printf(
"NOTE: you may need to manually load kernel module implementing accelerated SHA256 in case\n"
" the generic implementation is built-in, before mount. Check lsmod or /proc/crypto\n\n"
);
}
btrfs-progs: mkfs: print the summary This patch prints the summary of the filesystem after the creation. The main fields printed are: - devices list with their uuid, devid, path and size - raid profile (dup,single,raid0...) - leafsize/nodesize/sectorsize - filesystem features (raid56, extref, mixed-bg) - chunk size and type If the '-v' switch is passed, the output is more verbose; if the '-q' switch is passed, only the errors are printed. Below is an example: BTRFS filesystem summary: Label: btrfs-test UUID: 14ae8a88-98ac-4f22-8441-79f76ec622f7 Node size: 4096 Leaf size: 4096 Sector size: 4096 Initial chunks: Data+Metadata: 9.01GiB System: 18.06MiB Metadata profile: RAID5 Data profile: RAID5 Mixed mode: YES SSD detected: NO Incompat features: mixed-bg, extref, raid56 Number of devices: 10 UUID ID SIZE PATH ------------------------------------ -- --------- ----------- df1c7f50-1980-4da2-8bc9-7ee6ffb0b554 1 50.00GiB /dev/vdb 32c808a0-cd7b-4497-a2c0-1d77a9854af9 2 50.00GiB /dev/vdc 3159782e-d108-40bc-9e15-090ecac160b4 3 50.00GiB /dev/vdd db7eaf0c-beb8-4093-a9d0-b9c25c146305 4 50.00GiB /dev/vde c367ca04-1f71-49c0-a331-11fc0b87e9fc 5 50.00GiB /dev/vdf e9b73c86-4058-4b3a-90ac-18741a276e70 6 50.00GiB /dev/vdg c4298b7a-ad41-4690-bf10-bf748b319413 7 50.00GiB /dev/vdh 1cf048c8-af8a-4225-b09a-5d12e9b217fa 8 2.00GiB /dev/vdi 7e157869-768a-4725-bad5-82e6bd05fd17 9 2.00GiB /dev/vdj 2c9431ac-c7f0-45a5-8529-cef8cf6e4033 10 2.00GiB /dev/vdk Total devices size: 356.01GiB Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> Signed-off-by: David Sterba <dsterba@suse.cz>
2015-06-08 11:00:50 +00:00
}
/*
* The filesystem is now fully set up, commit the remaining changes and
* fix the signature as the last step before closing the devices.
*/
fs_info->finalize_on_close = 1;
out:
close_ret = close_ctree(root);
if (!close_ret) {
optind = saved_optind;
device_count = argc - optind;
while (device_count-- > 0) {
file = argv[optind++];
if (path_is_block_device(file) == 1)
btrfs_register_one_device(file);
}
}
if (!ret && close_ret) {
ret = close_ret;
error("failed to close ctree, the filesystem may be inconsistent: %d",
ret);
}
btrfs_close_all_devices();
btrfs-progs: mkfs: keep file descriptors open during the whole time [BUG] There is an internal bug report that, after mkfs.btrfs, there is a chance that the /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such an IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, it re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If a udev event is triggered at the close() of phase A, and while udev is doing its scan we go into phase C (but before the new valid super blocks are written), udev would only see no superblock or an invalid superblock. Then phase C finishes, udev resumes its inotify listening, but at this time mkfs is finished, while udev only sees the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and do not close the fds until close_ctree() has been called. This way, although we may still have a race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks.
To compensate the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd, if we go O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
if (prepare_ctx) {
for (i = 0; i < device_count; i++)
close(prepare_ctx[i].fd);
}
free(t_prepare);
free(prepare_ctx);
free(label);
free(source_dir);
return !!ret;
error:
btrfs-progs: mkfs: keep file descriptors open during the whole time [BUG] There is an internal bug report that, after mkfs.btrfs, there is a chance that the /dev/disk/by-uuid/<uuid> symlink is not created at all. [CAUSE] That uuid symlink is created by udev, which listens to inotify IN_CLOSE_WRITE events from all block devices. After such an IN_CLOSE_WRITE event is triggered, udev would *disable* inotify for that block device, and do a blkid scan on it. After the blkid scan is done, it re-enables the inotify listening. This means normally mkfs tools should open the fd, do all the writes, and close the fd after everything is done. But unfortunately for mkfs.btrfs, it's not the case, we have a lot of phases separated by different close() calls: open_ctree() would open fds of each involved device and close them at close_ctree() Only after close_ctree() we have a valid superblock -\ | |<------- A -------->|<--------- B --------->|<------- C ------->| | | | `- open a new fd for make_btrfs() | and close it before open_ctree() | The device contains invalid sb. | `- open a new fd for each device, then call btrfs_prepare_device(), then close the fd. The device would contain no valid superblock. If a udev event is triggered at the close() of phase A, and while udev is doing its scan we go into phase C (but before the new valid super blocks are written), udev would only see no superblock or an invalid superblock. Then phase C finishes, udev resumes its inotify listening, but at this time mkfs is finished, while udev only sees the premature data from phase A, and misses the IN_CLOSE_WRITE events from phase C. [FIX] Instead of opening and closing a new fd for each device, re-use the fd opened during prepare_one_device(), and do not close the fds until close_ctree() has been called. This way, although we may still have a race between close_ctree() and explicit close() calls, at least udev can always see the properly written super blocks.
To compensate the change, some extra cleanups are made: - Do not touch @device_count Which makes later prepare_ctx iteration much easier. - Remove top-level @fd variable Instead go with prepare_ctx[i].fd. - Do not open with O_RDWR in test_dev_for_mkfs() as test_dev_for_mkfs() would close the fd, if we go O_RDWR, it can cause the udev race. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-03-15 06:06:54 +00:00
if (prepare_ctx) {
for (i = 0; i < device_count; i++)
close(prepare_ctx[i].fd);
}
free(t_prepare);
free(prepare_ctx);
free(label);
free(source_dir);
exit(1);
success:
exit(0);
2007-03-21 00:35:03 +00:00
}