626 lines
14 KiB
C
626 lines
14 KiB
C
/*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public
|
|
* License v2 as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public
|
|
* License along with this program; if not, write to the
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
* Boston, MA 02111-1307, USA.
|
|
*/
|
|
|
|
#include <sys/ioctl.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/types.h>
|
|
#include <linux/limits.h>
|
|
#ifdef BTRFS_ZONED
|
|
#include <linux/blkzoned.h>
|
|
#endif
|
|
#include <linux/fs.h>
|
|
#include <limits.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
#include <fcntl.h>
|
|
#include <dirent.h>
|
|
#include <errno.h>
|
|
#include <blkid/blkid.h>
|
|
#include "kernel-lib/sizes.h"
|
|
#include "kernel-shared/disk-io.h"
|
|
#include "kernel-shared/ctree.h"
|
|
#include "kernel-shared/zoned.h"
|
|
#include "common/device-utils.h"
|
|
#include "common/path-utils.h"
|
|
#include "common/internal.h"
|
|
#include "common/messages.h"
|
|
#include "common/utils.h"
|
|
#include "common/units.h"
|
|
|
|
#ifndef BLKDISCARD
|
|
#define BLKDISCARD _IO(0x12,119)
|
|
#endif
|
|
|
|
/*
|
|
* Discard the given range in one go
|
|
*/
|
|
static int discard_range(int fd, u64 start, u64 len)
|
|
{
|
|
u64 range[2] = { start, len };
|
|
|
|
if (ioctl(fd, BLKDISCARD, &range) < 0)
|
|
return errno;
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Check the "discard_granularity" queue parameter of @device.
 *
 * Returns 1 if the parameter could be read and parses to a non-zero number,
 * 0 if it cannot be read or parses to zero.
 */
static int discard_supported(const char *device)
{
	int ret;
	char buf[128] = {};

	/*
	 * Keep the last byte of the zeroed buffer untouched so the contents
	 * are always NUL terminated for atoi(), however many bytes were read.
	 */
	ret = device_get_queue_param(device, "discard_granularity", buf,
				     sizeof(buf) - 1);
	/* Zero and negative both mean that nothing usable was read */
	if (ret <= 0) {
		pr_verbose(3, "cannot read discard_granularity for %s\n", device);
		return 0;
	}

	if (atoi(buf) == 0) {
		pr_verbose(3, "%s: discard_granularity %s", device, buf);
		return 0;
	}

	return 1;
}
|
|
|
|
/*
|
|
* Discard blocks in the given range in 1G chunks, the process is interruptible
|
|
*/
|
|
int device_discard_blocks(int fd, u64 start, u64 len)
|
|
{
|
|
while (len > 0) {
|
|
/* 1G granularity */
|
|
u64 chunk_size = min_t(u64, len, SZ_1G);
|
|
int ret;
|
|
|
|
ret = discard_range(fd, start, chunk_size);
|
|
if (ret)
|
|
return ret;
|
|
len -= chunk_size;
|
|
start += chunk_size;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Write zeros to the given range [start, start + len)
 *
 * @direct is handed through to btrfs_pwrite() unchanged.
 *
 * Returns 0 on success (an empty range is a successful no-op), -ENOMEM if
 * the zero buffer cannot be allocated and -EIO if the write fails or is
 * short.
 */
int device_zero_blocks(int fd, off_t start, size_t len, bool direct)
{
	char *buf;
	ssize_t written;
	int ret = 0;

	/*
	 * Nothing to write.  Returning early also avoids mistaking the NULL
	 * that a zero-sized allocation is allowed to return for an
	 * out-of-memory condition.
	 */
	if (len == 0)
		return 0;

	buf = calloc(1, len);
	if (!buf)
		return -ENOMEM;

	written = btrfs_pwrite(fd, buf, len, start, direct);
	/* Compare in the unsigned domain only after ruling out an error */
	if (written < 0 || (size_t)written != len)
		ret = -EIO;

	free(buf);
	return ret;
}
|
|
|
|
#define ZERO_DEV_BYTES SZ_2M
|
|
|
|
/*
|
|
* Zero blocks in the range from start but not after the given device size.
|
|
* (On SPARC the disk labels are preserved too.)
|
|
*/
|
|
static int zero_dev_clamped(int fd, struct btrfs_zoned_device_info *zinfo,
|
|
off_t start, ssize_t len, u64 dev_size)
|
|
{
|
|
off_t end = max(start, start + len);
|
|
|
|
#ifdef __sparc__
|
|
/* and don't overwrite the disk labels on sparc */
|
|
start = max(start, 1024);
|
|
end = max(end, 1024);
|
|
#endif
|
|
|
|
start = min_t(u64, start, dev_size);
|
|
end = min_t(u64, end, dev_size);
|
|
|
|
if (zinfo && zinfo->model == ZONED_HOST_MANAGED)
|
|
return zero_zone_blocks(fd, zinfo, start, end - start);
|
|
|
|
return device_zero_blocks(fd, start, end - start, false);
|
|
}
|
|
|
|
/*
|
|
* Find all magic signatures known to blkid and remove them
|
|
*/
|
|
static int btrfs_wipe_existing_sb(int fd, struct btrfs_zoned_device_info *zinfo)
|
|
{
|
|
const char *off = NULL;
|
|
size_t len = 0;
|
|
loff_t offset;
|
|
char buf[BUFSIZ];
|
|
int ret = 0;
|
|
blkid_probe pr = NULL;
|
|
|
|
pr = blkid_new_probe();
|
|
if (!pr)
|
|
return -1;
|
|
|
|
if (blkid_probe_set_device(pr, fd, 0, 0)) {
|
|
ret = -1;
|
|
goto out;
|
|
}
|
|
|
|
ret = blkid_probe_lookup_value(pr, "SBMAGIC_OFFSET", &off, NULL);
|
|
if (!ret)
|
|
ret = blkid_probe_lookup_value(pr, "SBMAGIC", NULL, &len);
|
|
|
|
if (ret || len == 0 || off == NULL) {
|
|
/*
|
|
* If lookup fails, the probe did not find any values, eg. for
|
|
* a file image or a loop device. Soft error.
|
|
*/
|
|
ret = 1;
|
|
goto out;
|
|
}
|
|
|
|
offset = strtoll(off, NULL, 10);
|
|
if (len > sizeof(buf))
|
|
len = sizeof(buf);
|
|
|
|
if (!zone_is_sequential(zinfo, offset)) {
|
|
const bool direct = zinfo && zinfo->model == ZONED_HOST_MANAGED;
|
|
|
|
memset(buf, 0, len);
|
|
ret = btrfs_pwrite(fd, buf, len, offset, direct);
|
|
if (ret < 0) {
|
|
error("cannot wipe existing superblock: %m");
|
|
ret = -1;
|
|
} else if (ret != len) {
|
|
error("cannot wipe existing superblock: wrote %d of %zd",
|
|
ret, len);
|
|
ret = -1;
|
|
}
|
|
} else {
|
|
struct blk_zone *zone = &zinfo->zones[offset / zinfo->zone_size];
|
|
|
|
ret = btrfs_reset_dev_zone(fd, zone);
|
|
if (ret < 0) {
|
|
error(
|
|
"zoned: failed to wipe zones containing superblock: %m");
|
|
ret = -1;
|
|
}
|
|
}
|
|
fsync(fd);
|
|
|
|
out:
|
|
blkid_free_probe(pr);
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Prepare a device before it's added to the filesystem. Optionally:
|
|
* - remove old superblocks
|
|
* - discard
|
|
* - reset zones
|
|
* - delete end of the device
|
|
*/
|
|
int btrfs_prepare_device(int fd, const char *file, u64 *block_count_ret,
|
|
u64 max_block_count, unsigned opflags)
|
|
{
|
|
struct btrfs_zoned_device_info *zinfo = NULL;
|
|
u64 block_count;
|
|
struct stat st;
|
|
int i, ret;
|
|
|
|
ret = fstat(fd, &st);
|
|
if (ret < 0) {
|
|
error("unable to stat %s: %m", file);
|
|
return 1;
|
|
}
|
|
|
|
block_count = device_get_partition_size_fd_stat(fd, &st);
|
|
if (block_count == 0) {
|
|
error("unable to determine size of %s", file);
|
|
return 1;
|
|
}
|
|
if (max_block_count)
|
|
block_count = min(block_count, max_block_count);
|
|
|
|
if (opflags & PREP_DEVICE_ZONED) {
|
|
ret = btrfs_get_zone_info(fd, file, &zinfo);
|
|
if (ret < 0 || !zinfo) {
|
|
error("zoned: unable to load zone information of %s",
|
|
file);
|
|
return 1;
|
|
}
|
|
|
|
if (!zinfo->emulated) {
|
|
if (opflags & PREP_DEVICE_VERBOSE)
|
|
printf("Resetting device zones %s (%u zones) ...\n",
|
|
file, zinfo->nr_zones);
|
|
/*
|
|
* We cannot ignore zone reset errors for a zoned block
|
|
* device as this could result in the inability to write
|
|
* to non-empty sequential zones of the device.
|
|
*/
|
|
if (btrfs_reset_all_zones(fd, zinfo)) {
|
|
error("zoned: failed to reset device '%s' zones: %m",
|
|
file);
|
|
goto err;
|
|
}
|
|
}
|
|
} else if (opflags & PREP_DEVICE_DISCARD) {
|
|
/*
|
|
* We intentionally ignore errors from the discard ioctl. It
|
|
* is not necessary for the mkfs functionality but just an
|
|
* optimization.
|
|
*/
|
|
if (discard_supported(file)) {
|
|
if (opflags & PREP_DEVICE_VERBOSE)
|
|
printf("Performing full device TRIM %s (%s) ...\n",
|
|
file, pretty_size(block_count));
|
|
device_discard_blocks(fd, 0, block_count);
|
|
}
|
|
}
|
|
|
|
ret = zero_dev_clamped(fd, zinfo, 0, ZERO_DEV_BYTES, block_count);
|
|
for (i = 0 ; !ret && i < BTRFS_SUPER_MIRROR_MAX; i++)
|
|
ret = zero_dev_clamped(fd, zinfo, btrfs_sb_offset(i),
|
|
BTRFS_SUPER_INFO_SIZE, block_count);
|
|
if (!ret && (opflags & PREP_DEVICE_ZERO_END))
|
|
ret = zero_dev_clamped(fd, zinfo, block_count - ZERO_DEV_BYTES,
|
|
ZERO_DEV_BYTES, block_count);
|
|
|
|
if (ret < 0) {
|
|
errno = -ret;
|
|
error("failed to zero device '%s': %m", file);
|
|
goto err;
|
|
}
|
|
|
|
ret = btrfs_wipe_existing_sb(fd, zinfo);
|
|
if (ret < 0) {
|
|
error("cannot wipe superblocks on %s", file);
|
|
goto err;
|
|
}
|
|
|
|
free(zinfo);
|
|
*block_count_ret = block_count;
|
|
return 0;
|
|
|
|
err:
|
|
free(zinfo);
|
|
return 1;
|
|
}
|
|
|
|
/*
 * Size of the object behind @fd, using the already collected stat data @st.
 *
 * Regular files report st_size, block devices are asked with BLKGETSIZE64.
 * Any other file type and a failing ioctl yield 0.
 */
u64 device_get_partition_size_fd_stat(int fd, const struct stat *st)
{
	u64 bytes;

	if (S_ISREG(st->st_mode))
		return st->st_size;

	if (S_ISBLK(st->st_mode) && ioctl(fd, BLKGETSIZE64, &bytes) >= 0)
		return bytes;

	return 0;
}
|
|
|
|
/*
|
|
* Read partition size using the low-level ioctl
|
|
*/
|
|
u64 device_get_partition_size_fd(int fd)
|
|
{
|
|
u64 result;
|
|
|
|
if (ioctl(fd, BLKGETSIZE64, &result) < 0)
|
|
return 0;
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
 * Read the size of block device @dev from /sys/class/block/NAME/size, where
 * NAME is the last component of the resolved path of @dev.
 *
 * The sysfs attribute counts 512-byte sectors, the result is converted to
 * bytes so it matches the BLKGETSIZE64 based helpers.
 *
 * Returns the size in bytes, or 0 if the path cannot be resolved, the
 * attribute cannot be read or the value does not fit.
 */
static u64 device_get_partition_size_sysfs(const char *dev)
{
	/* The sysfs "size" attribute is expressed in 512-byte sectors */
	const unsigned int sector_shift = 9;
	int ret;
	char path[PATH_MAX] = {};
	char sysfs[PATH_MAX] = {};
	char sizebuf[128] = {};
	char *name = NULL;
	int sysfd;
	unsigned long long sectors = 0;

	name = realpath(dev, path);
	if (!name)
		return 0;
	name = basename(path);

	ret = path_cat3_out(sysfs, "/sys/class/block", name, "size");
	if (ret < 0)
		return 0;
	sysfd = open(sysfs, O_RDONLY);
	if (sysfd < 0)
		return 0;
	/* Keep the final byte zero so strtoull() always sees a terminator */
	ret = sysfs_read_file(sysfd, sizebuf, sizeof(sizebuf) - 1);
	close(sysfd);
	if (ret < 0)
		return 0;

	errno = 0;
	sectors = strtoull(sizebuf, NULL, 10);
	if (sectors == ULLONG_MAX && errno == ERANGE)
		return 0;
	/* The conversion to bytes must not overflow either */
	if (sectors > (ULLONG_MAX >> sector_shift))
		return 0;

	return sectors << sector_shift;
}
|
|
|
|
/*
 * Size of the block device at path @dev.
 *
 * If the device cannot be opened the size is taken from sysfs instead.
 * Otherwise BLKGETSIZE64 is used and a failing ioctl results in 0.
 */
u64 device_get_partition_size(const char *dev)
{
	u64 bytes;
	int fd;

	fd = open(dev, O_RDONLY);
	if (fd < 0)
		return device_get_partition_size_sysfs(dev);

	if (ioctl(fd, BLKGETSIZE64, &bytes) < 0)
		bytes = 0;
	close(fd);

	return bytes;
}
|
|
|
|
/*
|
|
* Get a device request queue parameter from sysfs.
|
|
*/
|
|
int device_get_queue_param(const char *file, const char *param, char *buf, size_t len)
|
|
{
|
|
blkid_probe probe;
|
|
char wholedisk[PATH_MAX];
|
|
char sysfs_path[PATH_MAX];
|
|
dev_t devno;
|
|
int fd;
|
|
int ret;
|
|
|
|
probe = blkid_new_probe_from_filename(file);
|
|
if (!probe)
|
|
return 0;
|
|
|
|
/* Device number of this disk (possibly a partition) */
|
|
devno = blkid_probe_get_devno(probe);
|
|
if (!devno) {
|
|
blkid_free_probe(probe);
|
|
return 0;
|
|
}
|
|
|
|
/* Get whole disk name (not full path) for this devno */
|
|
ret = blkid_devno_to_wholedisk(devno, wholedisk, sizeof(wholedisk), NULL);
|
|
if (ret) {
|
|
blkid_free_probe(probe);
|
|
return 0;
|
|
}
|
|
|
|
snprintf(sysfs_path, PATH_MAX, "/sys/block/%s/queue/%s",
|
|
wholedisk, param);
|
|
|
|
blkid_free_probe(probe);
|
|
|
|
fd = open(sysfs_path, O_RDONLY);
|
|
if (fd < 0)
|
|
return 0;
|
|
|
|
len = read(fd, buf, len);
|
|
close(fd);
|
|
|
|
return len;
|
|
}
|
|
|
|
/*
|
|
* Read value of zone_unusable from sysfs for given block group type in flags
|
|
*/
|
|
u64 device_get_zone_unusable(int fd, u64 flags)
|
|
{
|
|
char buf[64];
|
|
int sys_fd;
|
|
u64 unusable = DEVICE_ZONE_UNUSABLE_UNKNOWN;
|
|
|
|
/* Don't report it for a regular fs */
|
|
sys_fd = sysfs_open_fsid_file(fd, "features/zoned");
|
|
if (sys_fd < 0)
|
|
return DEVICE_ZONE_UNUSABLE_UNKNOWN;
|
|
close(sys_fd);
|
|
sys_fd = -1;
|
|
|
|
if ((flags & BTRFS_BLOCK_GROUP_DATA) == BTRFS_BLOCK_GROUP_DATA)
|
|
sys_fd = sysfs_open_fsid_file(fd, "allocation/data/bytes_zone_unusable");
|
|
else if ((flags & BTRFS_BLOCK_GROUP_METADATA) == BTRFS_BLOCK_GROUP_METADATA)
|
|
sys_fd = sysfs_open_fsid_file(fd, "allocation/metadata/bytes_zone_unusable");
|
|
else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) == BTRFS_BLOCK_GROUP_SYSTEM)
|
|
sys_fd = sysfs_open_fsid_file(fd, "allocation/system/bytes_zone_unusable");
|
|
|
|
if (sys_fd < 0)
|
|
return DEVICE_ZONE_UNUSABLE_UNKNOWN;
|
|
sysfs_read_file(sys_fd, buf, sizeof(buf));
|
|
unusable = strtoull(buf, NULL, 10);
|
|
close(sys_fd);
|
|
|
|
return unusable;
|
|
}
|
|
|
|
/*
|
|
* Read information about zone size of the given device (short @name) from a
|
|
* given filesystem fd
|
|
*/
|
|
u64 device_get_zone_size(int fd, const char *name)
|
|
{
|
|
DIR *dir;
|
|
struct dirent *de;
|
|
int sysfs_fd;
|
|
u64 ret = 0;
|
|
|
|
sysfs_fd = sysfs_open_fsid_dir(fd, "devices");
|
|
if (sysfs_fd < 0)
|
|
return 0;
|
|
|
|
dir = fdopendir(sysfs_fd);
|
|
if (!dir) {
|
|
ret = 0;
|
|
goto out;
|
|
}
|
|
while (1) {
|
|
int queue_fd;
|
|
char queue[PATH_MAX];
|
|
char buf[128] = {0};
|
|
|
|
de = readdir(dir);
|
|
if (!de) {
|
|
ret = 0;
|
|
break;
|
|
}
|
|
|
|
if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
|
|
continue;
|
|
if (strcmp(name, de->d_name) != 0)
|
|
continue;
|
|
|
|
path_cat3_out(queue, "devices", de->d_name, "queue/chunk_sectors");
|
|
/* /sys/fs/btrfs/FSID/devices/NAME/queue/chunk_sectors */
|
|
queue_fd = sysfs_open_fsid_file(fd, queue);
|
|
if (queue_fd < 0) {
|
|
ret = 0;
|
|
break;
|
|
}
|
|
sysfs_read_file(queue_fd, buf, sizeof(buf));
|
|
ret = atoll(buf);
|
|
close(queue_fd);
|
|
break;
|
|
}
|
|
closedir(dir);
|
|
|
|
out:
|
|
close(sysfs_fd);
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Evaluate the first byte of the "rotational" queue parameter of @file.
 *
 * Returns 1 if that byte is '0', and 0 if it is anything else or the
 * parameter cannot be read.
 *
 * NOTE(review): despite the name, a non-zero result corresponds to sysfs
 * reporting '0' - callers presumably rely on this, confirm before changing.
 */
int device_get_rotational(const char *file)
{
	char value;

	if (device_get_queue_param(file, "rotational", &value, 1) < 1)
		return 0;

	return value == '0' ? 1 : 0;
}
|
|
|
|
ssize_t btrfs_direct_pread(int fd, void *buf, size_t count, off_t offset)
|
|
{
|
|
int alignment;
|
|
size_t iosize;
|
|
void *bounce_buf = NULL;
|
|
struct stat stat_buf;
|
|
unsigned long req;
|
|
int ret;
|
|
|
|
if (fstat(fd, &stat_buf) == -1) {
|
|
error("fstat failed: %m");
|
|
return -errno;
|
|
}
|
|
|
|
if ((stat_buf.st_mode & S_IFMT) == S_IFBLK)
|
|
req = BLKSSZGET;
|
|
else
|
|
req = FIGETBSZ;
|
|
|
|
if (ioctl(fd, req, &alignment)) {
|
|
error("failed to get block size: %m");
|
|
return -errno;
|
|
}
|
|
|
|
if (IS_ALIGNED((size_t)buf, alignment) && IS_ALIGNED(count, alignment))
|
|
return pread(fd, buf, count, offset);
|
|
|
|
iosize = round_up(count, alignment);
|
|
|
|
ret = posix_memalign(&bounce_buf, alignment, iosize);
|
|
if (ret) {
|
|
error_msg(ERROR_MSG_MEMORY, "bounce buffer");
|
|
errno = ret;
|
|
return -ret;
|
|
}
|
|
|
|
ret = pread(fd, bounce_buf, iosize, offset);
|
|
if (ret >= count)
|
|
ret = count;
|
|
memcpy(buf, bounce_buf, count);
|
|
|
|
free(bounce_buf);
|
|
return ret;
|
|
}
|
|
|
|
ssize_t btrfs_direct_pwrite(int fd, const void *buf, size_t count, off_t offset)
|
|
{
|
|
int alignment;
|
|
size_t iosize;
|
|
void *bounce_buf = NULL;
|
|
struct stat stat_buf;
|
|
unsigned long req;
|
|
int ret;
|
|
|
|
if (fstat(fd, &stat_buf) == -1) {
|
|
error("fstat failed: %m");
|
|
return -errno;
|
|
}
|
|
|
|
if ((stat_buf.st_mode & S_IFMT) == S_IFBLK)
|
|
req = BLKSSZGET;
|
|
else
|
|
req = FIGETBSZ;
|
|
|
|
if (ioctl(fd, req, &alignment)) {
|
|
error("failed to get block size: %m");
|
|
return -errno;
|
|
}
|
|
|
|
if (IS_ALIGNED((size_t)buf, alignment) && IS_ALIGNED(count, alignment))
|
|
return pwrite(fd, buf, count, offset);
|
|
|
|
/* Cannot do anything if the write size is not aligned */
|
|
if (!IS_ALIGNED(count, alignment)) {
|
|
error("%zu is not aligned to %d", count, alignment);
|
|
return -EINVAL;
|
|
}
|
|
|
|
iosize = round_up(count, alignment);
|
|
|
|
ret = posix_memalign(&bounce_buf, alignment, iosize);
|
|
if (ret) {
|
|
error_msg(ERROR_MSG_MEMORY, "bounce buffer");
|
|
errno = ret;
|
|
return -ret;
|
|
}
|
|
|
|
UASSERT(iosize == count);
|
|
memcpy(bounce_buf, buf, count);
|
|
ret = pwrite(fd, bounce_buf, iosize, offset);
|
|
|
|
free(bounce_buf);
|
|
return ret;
|
|
}
|