btrfs-progs/common/device-utils.c
Qu Wenruo 2b51d006cd btrfs-progs: split btrfs_direct_pio() functions into read and write
It's not a common practice to use the same io function for both read and
write (we have pread() and pwrite(), not pio()).

Furthermore the original function has the following problems:

- Not returning proper error number
  If we had ioctl/stat errors we just return 0 with errno set.
  Thus caller would treat it as a short read, not a proper error.

- Unnecessary @ret_rw
  This is not that obvious if we have different handling for read and
  write, but if we split them it's super obvious we can reuse @ret.

- No proper copy back for short read

- Unable to constify the @buf pointer for write operation

All those problems would be addressed in this patch.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-05-26 18:02:31 +02:00

626 lines
14 KiB
C

/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <linux/limits.h>
#ifdef BTRFS_ZONED
#include <linux/blkzoned.h>
#endif
#include <linux/fs.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <dirent.h>
#include <errno.h>
#include <blkid/blkid.h>
#include "kernel-lib/sizes.h"
#include "kernel-shared/disk-io.h"
#include "kernel-shared/ctree.h"
#include "kernel-shared/zoned.h"
#include "common/device-utils.h"
#include "common/path-utils.h"
#include "common/internal.h"
#include "common/messages.h"
#include "common/utils.h"
#include "common/units.h"
#ifndef BLKDISCARD
#define BLKDISCARD _IO(0x12,119)
#endif

/*
 * Issue one BLKDISCARD ioctl on @fd for the range described by @start and
 * @len.
 *
 * Returns 0 on success, otherwise the (positive) errno value left behind by
 * the failed ioctl.
 */
static int discard_range(int fd, u64 start, u64 len)
{
	u64 range[2];

	range[0] = start;
	range[1] = len;

	if (ioctl(fd, BLKDISCARD, &range) >= 0)
		return 0;

	return errno;
}
/*
 * Check whether @device reports a usable "discard_granularity" queue
 * parameter.
 *
 * Returns 1 when the parameter could be read and parses to a non-zero number,
 * 0 otherwise (including when the parameter cannot be read at all).
 */
static int discard_supported(const char *device)
{
	char buf[128] = {};
	int ret;

	/*
	 * Offer one byte less than the buffer holds so that the zero
	 * initialized tail always terminates the string handed to atoi().
	 */
	ret = device_get_queue_param(device, "discard_granularity", buf,
				     sizeof(buf) - 1);
	/* Zero means nothing could be looked up, negative that read() failed */
	if (ret <= 0) {
		pr_verbose(3, "cannot read discard_granularity for %s\n", device);
		return 0;
	}

	if (atoi(buf) == 0) {
		pr_verbose(3, "%s: discard_granularity %s", device, buf);
		return 0;
	}

	return 1;
}
/*
 * Discard the range given by @start and @len piecewise, at most 1G per
 * request, so that the process stays interruptible between requests.
 *
 * Stops at the first failing piece and returns its error value; returns 0
 * when every piece was discarded.
 */
int device_discard_blocks(int fd, u64 start, u64 len)
{
	u64 remaining = len;
	u64 pos = start;

	while (remaining != 0) {
		/* 1G granularity */
		const u64 step = min_t(u64, remaining, SZ_1G);
		const int ret = discard_range(fd, pos, step);

		if (ret)
			return ret;

		pos += step;
		remaining -= step;
	}

	return 0;
}
/*
 * Write zeros to the given range [start, start + len)
 *
 * @fd:     descriptor to write to
 * @start:  offset of the first byte to zero
 * @len:    number of bytes to zero, may be 0
 * @direct: passed through to btrfs_pwrite()
 *
 * Returns 0 on success (or when there is nothing to write), -ENOMEM if the
 * zero buffer cannot be allocated and -EIO if the write failed or was short.
 */
int device_zero_blocks(int fd, off_t start, size_t len, bool direct)
{
	char *buf;
	ssize_t written;
	int ret = 0;

	/*
	 * zero_dev_clamped() can clamp a range down to nothing.  A zero sized
	 * allocation is allowed to return NULL, which must not be mistaken
	 * for running out of memory.
	 */
	if (len == 0)
		return 0;

	buf = calloc(1, len);
	if (!buf)
		return -ENOMEM;

	written = btrfs_pwrite(fd, buf, len, start, direct);
	if (written < 0 || (size_t)written != len)
		ret = -EIO;

	free(buf);
	return ret;
}
#define ZERO_DEV_BYTES SZ_2M

/*
 * Zero @len bytes beginning at @start, with both ends of the range clamped
 * to @dev_size so nothing is written past the device.
 * (On SPARC the disk labels are preserved too.)
 *
 * Host managed zoned devices go through zero_zone_blocks(), everything else
 * through a plain buffered device_zero_blocks().
 */
static int zero_dev_clamped(int fd, struct btrfs_zoned_device_info *zinfo,
			    off_t start, ssize_t len, u64 dev_size)
{
	off_t first = start;
	off_t last = max(start, start + len);
	off_t bytes;

#ifdef __sparc__
	/* and don't overwrite the disk labels on sparc */
	first = max(first, 1024);
	last = max(last, 1024);
#endif

	first = min_t(u64, first, dev_size);
	last = min_t(u64, last, dev_size);
	bytes = last - first;

	if (!zinfo || zinfo->model != ZONED_HOST_MANAGED)
		return device_zero_blocks(fd, first, bytes, false);

	return zero_zone_blocks(fd, zinfo, first, bytes);
}
/*
 * Look up the SBMAGIC_OFFSET/SBMAGIC values on a blkid probe of @fd and
 * remove the signature they describe.
 *
 * If the offset is not in a sequential zone, the magic bytes are overwritten
 * with zeros; otherwise the zone containing the offset is reset.
 *
 * Returns 0 when the signature was wiped, 1 when the lookup produced nothing
 * to wipe (soft error) and -1 on failure.
 */
static int btrfs_wipe_existing_sb(int fd, struct btrfs_zoned_device_info *zinfo)
{
	const char *off = NULL;
	size_t len = 0;
	loff_t offset;
	char buf[BUFSIZ];
	int ret;
	blkid_probe pr;

	pr = blkid_new_probe();
	if (!pr)
		return -1;

	if (blkid_probe_set_device(pr, fd, 0, 0)) {
		ret = -1;
		goto out;
	}

	ret = blkid_probe_lookup_value(pr, "SBMAGIC_OFFSET", &off, NULL);
	if (!ret)
		ret = blkid_probe_lookup_value(pr, "SBMAGIC", NULL, &len);

	if (ret || len == 0 || off == NULL) {
		/*
		 * If lookup fails, the probe did not find any values, eg. for
		 * a file image or a loop device. Soft error.
		 */
		ret = 1;
		goto out;
	}

	offset = strtoll(off, NULL, 10);
	/* Never zero more than the scratch buffer can provide */
	if (len > sizeof(buf))
		len = sizeof(buf);

	if (!zone_is_sequential(zinfo, offset)) {
		const bool direct = zinfo && zinfo->model == ZONED_HOST_MANAGED;
		ssize_t written;

		memset(buf, 0, len);
		written = btrfs_pwrite(fd, buf, len, offset, direct);
		if (written < 0) {
			error("cannot wipe existing superblock: %m");
			ret = -1;
		} else if ((size_t)written != len) {
			error("cannot wipe existing superblock: wrote %zd of %zu",
			      written, len);
			ret = -1;
		} else {
			/*
			 * Report plain success instead of the byte count, a
			 * one byte write must not look like the soft error.
			 */
			ret = 0;
		}
	} else {
		struct blk_zone *zone = &zinfo->zones[offset / zinfo->zone_size];

		ret = btrfs_reset_dev_zone(fd, zone);
		if (ret < 0) {
			error(
		"zoned: failed to wipe zones containing superblock: %m");
			ret = -1;
		}
	}
	fsync(fd);

out:
	blkid_free_probe(pr);
	return ret;
}
/*
 * Get a device ready for being added to a filesystem. Depending on @opflags:
 * - reset zones (PREP_DEVICE_ZONED) or discard everything
 *   (PREP_DEVICE_DISCARD)
 * - zero the start of the device and every superblock mirror location
 * - zero the end of the device (PREP_DEVICE_ZERO_END)
 * - remove an old superblock signature
 *
 * The usable size, limited by @max_block_count when that is non-zero, is
 * stored in @block_count_ret on success.
 *
 * Returns 0 on success and 1 on failure.
 */
int btrfs_prepare_device(int fd, const char *file, u64 *block_count_ret,
			 u64 max_block_count, unsigned opflags)
{
	struct btrfs_zoned_device_info *zinfo = NULL;
	const bool verbose = (opflags & PREP_DEVICE_VERBOSE) != 0;
	struct stat st;
	u64 dev_bytes;
	int mirror;
	int ret;

	if (fstat(fd, &st) < 0) {
		error("unable to stat %s: %m", file);
		return 1;
	}

	dev_bytes = device_get_partition_size_fd_stat(fd, &st);
	if (!dev_bytes) {
		error("unable to determine size of %s", file);
		return 1;
	}
	if (max_block_count && max_block_count < dev_bytes)
		dev_bytes = max_block_count;

	if (opflags & PREP_DEVICE_ZONED) {
		ret = btrfs_get_zone_info(fd, file, &zinfo);
		if (ret < 0 || !zinfo) {
			error("zoned: unable to load zone information of %s",
			      file);
			return 1;
		}

		if (!zinfo->emulated) {
			if (verbose)
				printf("Resetting device zones %s (%u zones) ...\n",
				       file, zinfo->nr_zones);
			/*
			 * We cannot ignore zone reset errors for a zoned block
			 * device as this could result in the inability to write
			 * to non-empty sequential zones of the device.
			 */
			if (btrfs_reset_all_zones(fd, zinfo)) {
				error("zoned: failed to reset device '%s' zones: %m",
				      file);
				goto err;
			}
		}
	} else if ((opflags & PREP_DEVICE_DISCARD) && discard_supported(file)) {
		/*
		 * We intentionally ignore errors from the discard ioctl.  It
		 * is not necessary for the mkfs functionality but just an
		 * optimization.
		 */
		if (verbose)
			printf("Performing full device TRIM %s (%s) ...\n",
			       file, pretty_size(dev_bytes));
		device_discard_blocks(fd, 0, dev_bytes);
	}

	/* Start of the device first, then each superblock mirror in turn */
	ret = zero_dev_clamped(fd, zinfo, 0, ZERO_DEV_BYTES, dev_bytes);
	mirror = 0;
	while (ret == 0 && mirror < BTRFS_SUPER_MIRROR_MAX) {
		ret = zero_dev_clamped(fd, zinfo, btrfs_sb_offset(mirror),
				       BTRFS_SUPER_INFO_SIZE, dev_bytes);
		mirror++;
	}
	if (ret == 0 && (opflags & PREP_DEVICE_ZERO_END))
		ret = zero_dev_clamped(fd, zinfo, dev_bytes - ZERO_DEV_BYTES,
				       ZERO_DEV_BYTES, dev_bytes);

	if (ret < 0) {
		/* Let %m print the error code returned by the zeroing helper */
		errno = -ret;
		error("failed to zero device '%s': %m", file);
		goto err;
	}

	if (btrfs_wipe_existing_sb(fd, zinfo) < 0) {
		error("cannot wipe superblocks on %s", file);
		goto err;
	}

	free(zinfo);
	*block_count_ret = dev_bytes;
	return 0;

err:
	free(zinfo);
	return 1;
}
/*
 * Determine the size behind @fd using the already obtained stat data @st.
 *
 * Regular files report st_size, block devices are queried with BLKGETSIZE64.
 * Returns 0 for any other file type or when the ioctl fails.
 */
u64 device_get_partition_size_fd_stat(int fd, const struct stat *st)
{
	u64 bytes = 0;

	if (S_ISREG(st->st_mode))
		return st->st_size;

	if (S_ISBLK(st->st_mode) && ioctl(fd, BLKGETSIZE64, &bytes) >= 0)
		return bytes;

	return 0;
}
/*
 * Query the size of the block device behind @fd with the BLKGETSIZE64 ioctl.
 *
 * Returns the value filled in by the ioctl, or 0 if the ioctl fails.
 */
u64 device_get_partition_size_fd(int fd)
{
	u64 bytes;

	return ioctl(fd, BLKGETSIZE64, &bytes) < 0 ? 0 : bytes;
}
/*
 * Read the size of @dev from /sys/class/block/<name>/size, where <name> is
 * the last component of the resolved path of @dev.
 *
 * The sysfs attribute counts 512-byte sectors, so the value is converted to
 * bytes to match what the BLKGETSIZE64 based helpers in this file return.
 *
 * Returns the size in bytes, or 0 on any failure.
 */
static u64 device_get_partition_size_sysfs(const char *dev)
{
	/* sysfs "size" is in 512-byte units regardless of the device */
	const unsigned int sysfs_sector_shift = 9;
	int ret;
	char path[PATH_MAX] = {};
	char sysfs[PATH_MAX] = {};
	char sizebuf[128] = {};
	char *name;
	int sysfd;
	unsigned long long size;

	if (!realpath(dev, path))
		return 0;
	name = basename(path);

	ret = path_cat3_out(sysfs, "/sys/class/block", name, "size");
	if (ret < 0)
		return 0;

	sysfd = open(sysfs, O_RDONLY);
	if (sysfd < 0)
		return 0;
	ret = sysfs_read_file(sysfd, sizebuf, sizeof(sizebuf));
	close(sysfd);
	if (ret < 0)
		return 0;

	errno = 0;
	size = strtoull(sizebuf, NULL, 10);
	if (size == ULLONG_MAX && errno == ERANGE)
		return 0;

	/* Refuse values that cannot be expressed in bytes */
	if (size > (ULLONG_MAX >> sysfs_sector_shift))
		return 0;

	return size << sysfs_sector_shift;
}
/*
 * Determine the size of the device at path @dev.
 *
 * If the device can be opened, the size comes from the BLKGETSIZE64 ioctl
 * (0 if that ioctl fails).  If it cannot be opened, the sysfs based lookup is
 * used instead.
 */
u64 device_get_partition_size(const char *dev)
{
	u64 bytes = 0;
	int fd;

	fd = open(dev, O_RDONLY);
	if (fd < 0)
		return device_get_partition_size_sysfs(dev);

	if (ioctl(fd, BLKGETSIZE64, &bytes) < 0)
		bytes = 0;

	close(fd);
	return bytes;
}
/*
 * Get a device request queue parameter from sysfs.
 *
 * @file:  path of the device (possibly a partition)
 * @param: name of the attribute below /sys/block/<wholedisk>/queue/
 * @buf:   destination for the raw attribute contents, not NUL terminated by
 *         this function
 * @len:   maximum number of bytes to read into @buf
 *
 * Returns 0 if the whole disk cannot be determined or the attribute cannot be
 * opened, otherwise the result of read(): the number of bytes stored in @buf,
 * or -1 if reading failed.
 */
int device_get_queue_param(const char *file, const char *param, char *buf, size_t len)
{
	blkid_probe probe;
	char wholedisk[PATH_MAX];
	char sysfs_path[PATH_MAX];
	dev_t devno;
	ssize_t nread;
	int fd;
	int ret;

	probe = blkid_new_probe_from_filename(file);
	if (!probe)
		return 0;

	/* Device number of this disk (possibly a partition) */
	devno = blkid_probe_get_devno(probe);
	/* Only the number is needed from here on */
	blkid_free_probe(probe);
	if (!devno)
		return 0;

	/* Get whole disk name (not full path) for this devno */
	ret = blkid_devno_to_wholedisk(devno, wholedisk, sizeof(wholedisk), NULL);
	if (ret)
		return 0;

	/* A truncated path would name some other (or no) attribute */
	ret = snprintf(sysfs_path, sizeof(sysfs_path), "/sys/block/%s/queue/%s",
		       wholedisk, param);
	if (ret < 0 || (size_t)ret >= sizeof(sysfs_path))
		return 0;

	fd = open(sysfs_path, O_RDONLY);
	if (fd < 0)
		return 0;

	/* Keep the signed result, read() reports errors as -1 */
	nread = read(fd, buf, len);
	close(fd);

	return (int)nread;
}
/*
 * Read value of zone_unusable from sysfs for given block group type in flags
 *
 * @fd:    descriptor handed to the sysfs_open_fsid_file() lookups
 * @flags: block group flags; DATA is checked first, then METADATA, then
 *         SYSTEM
 *
 * Returns the parsed value, or DEVICE_ZONE_UNUSABLE_UNKNOWN when the
 * "features/zoned" file is missing, no known type is set in @flags, or the
 * sysfs file cannot be opened or read.
 */
u64 device_get_zone_unusable(int fd, u64 flags)
{
	char buf[64] = {0};
	int sys_fd;
	u64 unusable = DEVICE_ZONE_UNUSABLE_UNKNOWN;

	/* Don't report it for a regular fs */
	sys_fd = sysfs_open_fsid_file(fd, "features/zoned");
	if (sys_fd < 0)
		return DEVICE_ZONE_UNUSABLE_UNKNOWN;
	close(sys_fd);
	sys_fd = -1;

	if ((flags & BTRFS_BLOCK_GROUP_DATA) == BTRFS_BLOCK_GROUP_DATA)
		sys_fd = sysfs_open_fsid_file(fd, "allocation/data/bytes_zone_unusable");
	else if ((flags & BTRFS_BLOCK_GROUP_METADATA) == BTRFS_BLOCK_GROUP_METADATA)
		sys_fd = sysfs_open_fsid_file(fd, "allocation/metadata/bytes_zone_unusable");
	else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) == BTRFS_BLOCK_GROUP_SYSTEM)
		sys_fd = sysfs_open_fsid_file(fd, "allocation/system/bytes_zone_unusable");

	if (sys_fd < 0)
		return DEVICE_ZONE_UNUSABLE_UNKNOWN;

	/*
	 * Only parse what was actually read.  One byte is held back so the
	 * zeroed tail of the buffer terminates the string for strtoull().
	 */
	if (sysfs_read_file(sys_fd, buf, sizeof(buf) - 1) > 0)
		unusable = strtoull(buf, NULL, 10);
	close(sys_fd);

	return unusable;
}
/*
 * Read information about zone size of the given device (short @name) from a
 * given filesystem fd
 *
 * The value is whatever devices/<name>/queue/chunk_sectors of the filesystem
 * contains.  Returns 0 if the device is not listed or anything along the way
 * fails.
 */
u64 device_get_zone_size(int fd, const char *name)
{
	DIR *dir;
	struct dirent *de;
	int sysfs_fd;
	u64 ret = 0;

	sysfs_fd = sysfs_open_fsid_dir(fd, "devices");
	if (sysfs_fd < 0)
		return 0;

	dir = fdopendir(sysfs_fd);
	if (!dir) {
		/* The stream was not created, the descriptor is still ours */
		close(sysfs_fd);
		return 0;
	}

	while ((de = readdir(dir)) != NULL) {
		int queue_fd;
		char queue[PATH_MAX];
		char buf[128] = {0};

		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
			continue;
		if (strcmp(name, de->d_name) != 0)
			continue;

		/* /sys/fs/btrfs/FSID/devices/NAME/queue/chunk_sectors */
		if (path_cat3_out(queue, "devices", de->d_name,
				  "queue/chunk_sectors") < 0)
			break;

		queue_fd = sysfs_open_fsid_file(fd, queue);
		if (queue_fd < 0)
			break;

		sysfs_read_file(queue_fd, buf, sizeof(buf));
		ret = atoll(buf);
		close(queue_fd);
		break;
	}

	/*
	 * closedir() also closes the descriptor that was handed to
	 * fdopendir(), so sysfs_fd must not be closed a second time.
	 */
	closedir(dir);
	return ret;
}
/*
 * Read the first byte of the "rotational" queue parameter of @file.
 *
 * Returns 1 if that byte is '0', and 0 if it is anything else or cannot be
 * read.
 *
 * NOTE(review): despite the name, a non-zero result means the attribute reads
 * '0' -- confirm that callers expect this polarity.
 */
int device_get_rotational(const char *file)
{
	char value;

	if (device_get_queue_param(file, "rotational", &value, 1) < 1)
		return 0;

	return value == '0' ? 1 : 0;
}
/*
 * pread() variant that copes with alignment requirements by reading through
 * an aligned bounce buffer when needed.
 *
 * The required alignment is queried from @fd: BLKSSZGET for block devices,
 * FIGETBSZ otherwise.  If @buf and @count are both aligned, the read goes
 * straight into @buf.  Otherwise @count is rounded up to the alignment, the
 * read is done into a bounce buffer, and only the bytes actually read (at
 * most @count) are copied back to @buf.
 *
 * Returns the number of bytes placed in @buf (possibly fewer than @count), a
 * negative errno value if fstat, the ioctl or the allocation fails, or -1
 * with errno set if pread() itself fails.
 */
ssize_t btrfs_direct_pread(int fd, void *buf, size_t count, off_t offset)
{
	int alignment;
	size_t iosize;
	void *bounce_buf = NULL;
	struct stat stat_buf;
	unsigned long req;
	ssize_t nread;
	int saved_errno;
	int ret;

	if (fstat(fd, &stat_buf) == -1) {
		/* Grab errno first, printing the message may change it */
		ret = -errno;
		error("fstat failed: %m");
		return ret;
	}

	if ((stat_buf.st_mode & S_IFMT) == S_IFBLK)
		req = BLKSSZGET;
	else
		req = FIGETBSZ;

	if (ioctl(fd, req, &alignment)) {
		ret = -errno;
		error("failed to get block size: %m");
		return ret;
	}

	if (IS_ALIGNED((size_t)buf, alignment) && IS_ALIGNED(count, alignment))
		return pread(fd, buf, count, offset);

	iosize = round_up(count, alignment);

	ret = posix_memalign(&bounce_buf, alignment, iosize);
	if (ret) {
		error_msg(ERROR_MSG_MEMORY, "bounce buffer");
		errno = ret;
		return -ret;
	}

	/*
	 * Keep the result signed and full width: comparing a negative value
	 * against the unsigned @count would make a failed read look complete.
	 */
	nread = pread(fd, bounce_buf, iosize, offset);
	if (nread > 0) {
		/* The caller asked for @count bytes, not the rounded up size */
		if ((size_t)nread > count)
			nread = count;
		/* Copy back only what was really read */
		memcpy(buf, bounce_buf, nread);
	}

	/* Do not let free() disturb the errno of a failed pread() */
	saved_errno = errno;
	free(bounce_buf);
	errno = saved_errno;

	return nread;
}
/*
 * pwrite() variant that copes with an unaligned source buffer by writing
 * through an aligned bounce buffer.
 *
 * The required alignment is queried from @fd: BLKSSZGET for block devices,
 * FIGETBSZ otherwise.  @count itself must be aligned; a misaligned @buf is
 * handled by copying the data into an aligned bounce buffer first.
 *
 * Returns the result of pwrite() (bytes written, or -1 with errno set), or a
 * negative errno value if fstat, the ioctl or the allocation fails, or
 * -EINVAL if @count is not aligned.
 */
ssize_t btrfs_direct_pwrite(int fd, const void *buf, size_t count, off_t offset)
{
	int alignment;
	void *bounce_buf = NULL;
	struct stat stat_buf;
	unsigned long req;
	ssize_t written;
	int saved_errno;
	int ret;

	if (fstat(fd, &stat_buf) == -1) {
		/* Grab errno first, printing the message may change it */
		ret = -errno;
		error("fstat failed: %m");
		return ret;
	}

	if ((stat_buf.st_mode & S_IFMT) == S_IFBLK)
		req = BLKSSZGET;
	else
		req = FIGETBSZ;

	if (ioctl(fd, req, &alignment)) {
		ret = -errno;
		error("failed to get block size: %m");
		return ret;
	}

	if (IS_ALIGNED((size_t)buf, alignment) && IS_ALIGNED(count, alignment))
		return pwrite(fd, buf, count, offset);

	/* Cannot do anything if the write size is not aligned */
	if (!IS_ALIGNED(count, alignment)) {
		error("%zu is not aligned to %d", count, alignment);
		return -EINVAL;
	}

	/*
	 * Only @buf is misaligned here, so the bounce buffer needs exactly
	 * @count bytes and no rounding up.
	 */
	ret = posix_memalign(&bounce_buf, alignment, count);
	if (ret) {
		error_msg(ERROR_MSG_MEMORY, "bounce buffer");
		errno = ret;
		return -ret;
	}

	memcpy(bounce_buf, buf, count);
	/* Keep the full width result, an int would truncate large writes */
	written = pwrite(fd, bounce_buf, count, offset);

	/* Do not let free() disturb the errno of a failed pwrite() */
	saved_errno = errno;
	free(bounce_buf);
	errno = saved_errno;

	return written;
}