btrfs-progs/kernel-shared/zoned.c

1203 lines
30 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include "kerncompat.h"
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include "kernel-lib/list.h"
#include "kernel-shared/volumes.h"
#include "kernel-shared/zoned.h"
#include "kernel-shared/accessors.h"
#include "kernel-shared/ctree.h"
#include "kernel-shared/extent_io.h"
#include "kernel-shared/uapi/btrfs.h"
#include "kernel-shared/uapi/btrfs_tree.h"
#include "common/utils.h"
#include "common/device-utils.h"
#include "common/extent-cache.h"
#include "common/internal.h"
#include "common/parse-utils.h"
#include "common/messages.h"
#include "mkfs/common.h"
/* Maximum number of zones to report per ioctl(BLKREPORTZONE) call */
#define BTRFS_REPORT_NR_ZONES 4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV ((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL ((u64)-2)
/*
 * Zone size used for emulated zones on non-zoned devices unless the
 * "zone-size" parameter overrides it (see zone_size()).
 */
#define DEFAULT_EMULATED_ZONE_SIZE SZ_256M
/* Emulated zone size currently in effect; updated by zone_size(). */
static u64 emulated_zone_size = DEFAULT_EMULATED_ZONE_SIZE;
/*
* Minimum / maximum supported zone size. Currently, SMR disks have a zone size
* of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do
* not expect the zone size to become larger than 8GiB or smaller than 4MiB in
* the near future.
*/
#define BTRFS_MAX_ZONE_SIZE (8ULL * SZ_1G)
#define BTRFS_MIN_ZONE_SIZE (SZ_4M)
/*
 * Forward declaration: the definition comes after its first user,
 * btrfs_get_dev_zone_info_all_devices().
 */
static int btrfs_get_dev_zone_info(struct btrfs_device *device);
/*
 * Determine the zoned model of @file.
 *
 * Anything that is not a block device, or whose "zoned" queue parameter
 * cannot be read, is reported as ZONED_NONE. The parameter value is matched
 * by prefix against "host-aware" and "host-managed".
 *
 * If @file cannot be stat'ed an error is printed and -ENOENT is returned
 * (converted to the enum return type).
 */
enum btrfs_zoned_model zoned_model(const char *file)
{
	const char aware_str[] = "host-aware";
	const char managed_str[] = "host-managed";
	char value[32];
	struct stat sb;

	if (stat(file, &sb) < 0) {
		error("zoned: unable to stat %s", file);
		return -ENOENT;
	}

	/* Consider a regular file as non-zoned device */
	if (!S_ISBLK(sb.st_mode))
		return ZONED_NONE;

	if (device_get_queue_param(file, "zoned", value, sizeof(value)) <= 0)
		return ZONED_NONE;

	/* sizeof() - 1 is the string length without the terminating NUL */
	if (!strncmp(value, aware_str, sizeof(aware_str) - 1))
		return ZONED_HOST_AWARE;
	if (!strncmp(value, managed_str, sizeof(managed_str) - 1))
		return ZONED_HOST_MANAGED;

	return ZONED_NONE;
}
u64 zone_size(const char *file)
{
char chunk[32];
int ret;
/* Zoned emulation on regular device */
if (zoned_model(file) == ZONED_NONE) {
const char *tmp;
u64 size = DEFAULT_EMULATED_ZONE_SIZE;
tmp = bconf_param_value("zone-size");
if (tmp) {
size = parse_size_from_string(tmp);
if (!is_power_of_2(size) || size < BTRFS_MIN_ZONE_SIZE ||
size > BTRFS_MAX_ZONE_SIZE) {
error("invalid emulated zone size %llu", size);
exit(1);
}
}
emulated_zone_size = size;
return emulated_zone_size;
}
ret = device_get_queue_param(file, "chunk_sectors", chunk, sizeof(chunk));
if (ret <= 0)
return 0;
return strtoull((const char *)chunk, NULL, 10) << SECTOR_SHIFT;
}
/*
 * Read the "zone_append_max_bytes" queue parameter of @file.
 *
 * Returns the parsed value, or 0 if the parameter cannot be read.
 */
static u64 max_zone_append_size(const char *file)
{
	char value[32];

	if (device_get_queue_param(file, "zone_append_max_bytes", value,
				   sizeof(value)) <= 0)
		return 0;

	return strtoull((const char *)value, NULL, 10);
}
#ifdef BTRFS_ZONED
/*
 * Emulate blkdev_report_zones() for a non-zoned device. The device is sliced
 * into fixed-size chunks of emulated_zone_size bytes and each chunk is
 * described as a conventional zone.
 *
 * @file:     device path, used for error messages only
 * @fd:       open file descriptor of the device
 * @pos:      byte offset of the first zone to describe
 * @zones:    output array with room for @nr_zones entries
 * @nr_zones: maximum number of zones to fill in
 *
 * Filling stops after the first zone whose end reaches the device size.
 * Returns the number of zones filled in, or -EIO if fstat() fails.
 */
static int emulate_report_zones(const char *file, int fd, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = emulated_zone_size >> SECTOR_SHIFT;
	const u64 first_sector = pos >> SECTOR_SHIFT;
	struct stat sb;
	sector_t dev_sectors;
	unsigned int filled = 0;

	if (fstat(fd, &sb) < 0) {
		error("unable to stat %s: %m", file);
		return -EIO;
	}

	dev_sectors = device_get_partition_size_fd_stat(fd, &sb) >> SECTOR_SHIFT;

	while (filled < nr_zones) {
		struct blk_zone *z = &zones[filled];

		z->start = filled * zone_sectors + first_sector;
		z->len = zone_sectors;
		z->capacity = zone_sectors;
		z->wp = z->start + zone_sectors;
		z->type = BLK_ZONE_TYPE_CONVENTIONAL;
		z->cond = BLK_ZONE_COND_NOT_WP;
		filled++;

		/* This zone already covers the end of the device */
		if (z->wp >= dev_sectors)
			break;
	}

	return filled;
}
/*
 * Work out the superblock log write position for a pair of sequential zones.
 *
 * @fd:     open file descriptor of the device
 * @zones:  the two superblock log zones
 * @wp_ret: set to the write position in bytes
 *
 * Returns 0 on success, -ENOENT when both zones are empty (*wp_ret is then
 * the start of zones[0]), -EIO when a superblock cannot be read in full, and
 * -EUCLEAN for a zone state combination that is not expected.
 */
static int sb_write_pointer(int fd, struct blk_zone *zones, u64 *wp_ret)
{
	const bool z0_empty = (zones[0].cond == BLK_ZONE_COND_EMPTY);
	const bool z1_empty = (zones[1].cond == BLK_ZONE_COND_EMPTY);
	const bool z0_full = (zones[0].cond == BLK_ZONE_COND_FULL);
	const bool z1_full = (zones[1].cond == BLK_ZONE_COND_FULL);
	sector_t sector;

	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          x        0
	 * In use[1]        0          x        0
	 * Full[1]          1          1        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */
	if (z0_empty && z1_empty) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = (zones[0].start << SECTOR_SHIFT);
		return -ENOENT;
	}

	if (z0_full && z1_full) {
		/* Compare the last super block stored in each zone */
		u8 buf[BTRFS_NR_SB_LOG_ZONES][BTRFS_SUPER_INFO_SIZE];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
		int idx;

		for (idx = 0; idx < BTRFS_NR_SB_LOG_ZONES; idx++) {
			const u64 zone_end = (zones[idx].start + zones[idx].len)
					     << SECTOR_SHIFT;
			int nread;

			nread = pread(fd, buf[idx], BTRFS_SUPER_INFO_SIZE,
				      zone_end - BTRFS_SUPER_INFO_SIZE);
			if (nread != BTRFS_SUPER_INFO_SIZE)
				return -EIO;
			super[idx] = (struct btrfs_super_block *)&buf[idx];
		}

		/* The zone holding the older copy is the one to write next */
		sector = (super[0]->generation > super[1]->generation) ?
			 zones[1].start : zones[0].start;
	} else if (!z0_full && (z1_empty || z1_full)) {
		sector = zones[0].wp;
	} else if (z0_full) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}

	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}
/*
 * Reset the write pointer of @zone on the device behind @fd.
 *
 * Conventional zones and zones that are already empty are left alone. On
 * success the cached state in @zone is updated to an empty zone whose write
 * pointer is at its start.
 *
 * Returns 0 on success or -errno if the BLKRESETZONE ioctl fails.
 */
int btrfs_reset_dev_zone(int fd, struct blk_zone *zone)
{
	struct blk_zone_range range;
	const bool needs_reset = zone->type != BLK_ZONE_TYPE_CONVENTIONAL &&
				 zone->cond != BLK_ZONE_COND_EMPTY;

	/* Nothing to do if it is already empty */
	if (!needs_reset)
		return 0;

	range.sector = zone->start;
	range.nr_sectors = zone->len;
	if (ioctl(fd, BLKRESETZONE, &range) < 0)
		return -errno;

	zone->wp = zone->start;
	zone->cond = BLK_ZONE_COND_EMPTY;
	return 0;
}
/*
 * Collect the zone layout of a device into @zinfo.
 *
 * @fd:    open file descriptor of the device
 * @file:  path of the device, used for sysfs lookups and messages
 * @zinfo: zone information to fill in; zinfo->model must already be set
 *
 * Validates the zone size, allocates zinfo->zones with one entry per zone
 * (a trailing partial zone counts as one zone) and fills it either from
 * ioctl(BLKREPORTZONE) or, when zinfo->model is ZONED_NONE, from
 * emulate_report_zones().
 *
 * Returns 0 on success and -EIO if fstat() fails. All other failures are
 * fatal and terminate the program with exit(1).
 */
static int report_zones(int fd, const char *file,
			struct btrfs_zoned_device_info *zinfo)
{
	u64 device_size;
	u64 zone_bytes = zone_size(file);
	size_t rep_size;
	u64 sector = 0;
	struct stat st;
	struct blk_zone_report *rep;
	struct blk_zone *zone;
	unsigned int i, n = 0;
	int ret;

	/*
	 * Zones are guaranteed (by kernel) to be a power of 2 number of
	 * sectors. Check this here and make sure that zones are not too small.
	 */
	if (!zone_bytes || !is_power_of_2(zone_bytes)) {
		error("zoned: illegal zone size %llu (not a power of 2)",
		      zone_bytes);
		exit(1);
	}

	/*
	 * The zone size must be large enough to hold the initial system
	 * block group for mkfs time.
	 */
	if (zone_bytes < BTRFS_MKFS_SYSTEM_GROUP_SIZE) {
		error("zoned: illegal zone size %llu (smaller than %d)",
		      zone_bytes, BTRFS_MKFS_SYSTEM_GROUP_SIZE);
		exit(1);
	}

	ret = fstat(fd, &st);
	if (ret < 0) {
		error("error when reading zone info on %s: %m", file);
		return -EIO;
	}

	device_size = device_get_partition_size_fd_stat(fd, &st);
	if (device_size == 0) {
		error("zoned: failed to read size of %s: %m", file);
		exit(1);
	}

	/* Allocate the zone information array */
	zinfo->zone_size = zone_bytes;
	zinfo->nr_zones = device_size / zone_bytes;

	if (zinfo->zone_size > BTRFS_MAX_ZONE_SIZE) {
		error("zoned: zone size %llu larger than supported maximum %llu",
		      zinfo->zone_size, BTRFS_MAX_ZONE_SIZE);
		exit(1);
	} else if (zinfo->zone_size < BTRFS_MIN_ZONE_SIZE) {
		error("zoned: zone size %llu smaller than supported minimum %u",
		      zinfo->zone_size, BTRFS_MIN_ZONE_SIZE);
		exit(1);
	}

	/* A trailing partial zone still counts as a zone */
	if (device_size & (zone_bytes - 1))
		zinfo->nr_zones++;

	if (zoned_model(file) != ZONED_NONE && max_zone_append_size(file) == 0) {
		error(
		"zoned: device %s does not support ZONE_APPEND command", file);
		exit(1);
	}

	zinfo->zones = calloc(zinfo->nr_zones, sizeof(struct blk_zone));
	if (!zinfo->zones) {
		error_msg(ERROR_MSG_MEMORY, "zone information");
		exit(1);
	}

	/* Allocate a zone report */
	rep_size = sizeof(struct blk_zone_report) +
		   sizeof(struct blk_zone) * BTRFS_REPORT_NR_ZONES;
	rep = kmalloc(rep_size, GFP_KERNEL);
	if (!rep) {
		error_msg(ERROR_MSG_MEMORY, "zone report");
		exit(1);
	}

	/* Get zone information; the zone array follows the report header */
	zone = (struct blk_zone *)(rep + 1);
	while (n < zinfo->nr_zones) {
		memset(rep, 0, rep_size);
		rep->sector = sector;
		rep->nr_zones = BTRFS_REPORT_NR_ZONES;

		if (zinfo->model != ZONED_NONE) {
			ret = ioctl(fd, BLKREPORTZONE, rep);
			if (ret != 0) {
				error("zoned: ioctl BLKREPORTZONE failed (%m)");
				exit(1);
			}
			zinfo->emulated = false;
		} else {
			ret = emulate_report_zones(file, fd,
						   sector << SECTOR_SHIFT,
						   zone, BTRFS_REPORT_NR_ZONES);
			if (ret < 0) {
				error("zoned: failed to emulate BLKREPORTZONE");
				exit(1);
			}
			/*
			 * Record how many zones were really emulated, so that
			 * the copy loop and the next start sector below only
			 * look at entries that were filled in.
			 */
			rep->nr_zones = ret;
			zinfo->emulated = true;
		}

		if (!rep->nr_zones)
			break;

		for (i = 0; i < rep->nr_zones; i++) {
			if (n >= zinfo->nr_zones)
				break;
			memcpy(&zinfo->zones[n], &zone[i],
			       sizeof(struct blk_zone));
			n++;
		}

		/* Continue right after the last zone of this report */
		sector = zone[rep->nr_zones - 1].start +
			 zone[rep->nr_zones - 1].len;
	}

	kfree(rep);
	return 0;
}
/*
* Discard blocks in the zones of a zoned block device. Process this with zone
* size granularity so that blocks in conventional zones are discarded using
* discard_range and blocks in sequential zones are reset though a zone reset.
*/
int btrfs_reset_all_zones(int fd, struct btrfs_zoned_device_info *zinfo)
{
unsigned int i;
int ret = 0;
ASSERT(zinfo);
/* Zone size granularity */
for (i = 0; i < zinfo->nr_zones; i++) {
if (zinfo->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
ret = device_discard_blocks(fd,
zinfo->zones[i].start << SECTOR_SHIFT,
zinfo->zone_size);
if (ret == EOPNOTSUPP)
ret = 0;
} else if (zinfo->zones[i].cond != BLK_ZONE_COND_EMPTY) {
ret = btrfs_reset_dev_zone(fd, &zinfo->zones[i]);
} else {
ret = 0;
}
if (ret)
return ret;
}
return fsync(fd);
}
/*
 * Zero the byte range [@start, @start + @len) on @fd, skipping every part
 * that lies in a sequential zone.
 *
 * The range is processed in pieces that never cross a zone boundary, so each
 * call to device_zero_blocks() touches a single zone. Returns 0 on success or
 * the first non-zero result of device_zero_blocks().
 */
int zero_zone_blocks(int fd, struct btrfs_zoned_device_info *zinfo, off_t start,
		     size_t len)
{
	const size_t zone_len = zinfo->zone_size;
	off_t cur = start;

	/* Make sure that device_zero_blocks does not write sequential zones */
	while (len > 0) {
		/* Bytes left between the current offset and the zone end */
		const size_t room = zone_len - (cur & (zone_len - 1));
		/* Limit device_zero_blocks to a single zone */
		size_t chunk = min_t(size_t, len, zone_len);
		int ret;

		if (chunk > room)
			chunk = room;

		if (!zone_is_sequential(zinfo, cur)) {
			ret = device_zero_blocks(fd, cur, chunk, true);
			if (ret != 0)
				return ret;
		}

		cur += chunk;
		len -= chunk;
	}

	return 0;
}
/*
 * Translate a pair of superblock log zones into the byte offset to access.
 *
 * @fd:         open file descriptor of the device
 * @zones:      the two superblock log zones
 * @rw:         READ or WRITE
 * @bytenr_ret: set to the byte offset to read from or write to
 *
 * If one of the zones is conventional, its start is used directly. Otherwise
 * the position comes from sb_write_pointer(): for WRITE, a non-empty zone
 * that is about to be written from its start is reset first; for READ, the
 * position is moved back to the previously written superblock.
 *
 * Returns 0 on success or a negative errno from sb_write_pointer() (other
 * than -ENOENT) or from btrfs_reset_dev_zone().
 */
static int sb_log_location(int fd, struct blk_zone *zones, int rw, u64 *bytenr_ret)
{
	const u64 zone0_start = zones[0].start << SECTOR_SHIFT;
	const u64 zone1_start = zones[1].start << SECTOR_SHIFT;
	u64 wp;
	int ret;

	/* Use the head of the zones if either zone is conventional */
	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zone0_start;
		return 0;
	}
	if (zones[1].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zone1_start;
		return 0;
	}

	ret = sb_write_pointer(fd, zones, &wp);
	if (ret < 0 && ret != -ENOENT)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *target = NULL;

		/* Writing at the head of a zone: it may need a reset first */
		if (wp == zone0_start)
			target = &zones[0];
		else if (wp == zone1_start)
			target = &zones[1];

		if (target && target->cond != BLK_ZONE_COND_EMPTY) {
			ASSERT(target->cond == BLK_ZONE_COND_FULL);

			ret = btrfs_reset_dev_zone(fd, target);
			if (ret)
				return ret;
		}
	} else if (ret != -ENOENT) {
		/* For READ, we want the previous one */
		if (wp == zone0_start)
			wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;
	return 0;
}
/*
 * Map a superblock byte offset to the number of its superblock log zone.
 *
 * @bytenr must equal btrfs_sb_offset() of one of the superblock mirrors; this
 * is asserted. The matching mirror index is passed to sb_zone_number()
 * together with @zone_size_shift.
 */
static u32 sb_bytenr_to_sb_zone(u64 bytenr, int zone_size_shift)
{
	int mirror = -1;
	int idx = 0;

	/* Stop at the first mirror whose offset matches */
	while (mirror == -1 && idx < BTRFS_SUPER_MIRROR_MAX) {
		if (btrfs_sb_offset(idx) == bytenr)
			mirror = idx;
		idx++;
	}
	ASSERT(mirror != -1);

	return sb_zone_number(zone_size_shift, mirror);
}
/*
 * Read or write one superblock copy, taking zoned devices into account.
 *
 * @fd:     open file descriptor of the device or image file
 * @buf:    buffer of BTRFS_SUPER_INFO_SIZE bytes
 * @offset: regular (non-zoned) byte offset of the superblock copy
 * @rw:     READ or WRITE
 *
 * On regular files and on block devices without a zone size the I/O is done
 * with pread()/pwrite() at @offset. Otherwise the two superblock log zones
 * belonging to @offset are queried and the I/O is redirected to the position
 * given by sb_log_location(). A successful write is followed by fsync().
 *
 * Returns the number of bytes transferred. If sb_log_location() fails, its
 * negative error code is returned converted to size_t. Failures of fstat(),
 * the zone ioctls, allocation and fsync() terminate the program.
 */
size_t btrfs_sb_io(int fd, void *buf, off_t offset, int rw)
{
	size_t count = BTRFS_SUPER_INFO_SIZE;
	struct stat stat_buf;
	struct blk_zone_report *rep;
	struct blk_zone *zones;
	const u64 sb_size_sector = (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
	u64 mapped = U64_MAX;
	u32 zone_num;
	u32 zone_size_sector;
	size_t rep_size;
	int ret;
	size_t ret_sz;

	ASSERT(rw == READ || rw == WRITE);

	if (fstat(fd, &stat_buf) == -1) {
		error("fstat failed: %m");
		exit(1);
	}

	/* Do not call ioctl(BLKGETZONESZ) on a regular file. */
	if ((stat_buf.st_mode & S_IFMT) == S_IFBLK) {
		ret = ioctl(fd, BLKGETZONESZ, &zone_size_sector);
		if (ret < 0) {
			if (errno == ENOTTY || errno == EINVAL) {
				/*
				 * No kernel support, assuming non-zoned device.
				 *
				 * Note: older kernels before 5.11 could return
				 * EINVAL in case the ioctl is not available,
				 * which is wrong.
				 */
				zone_size_sector = 0;
			} else {
				error("zoned: ioctl BLKGETZONESZ failed: %m");
				exit(1);
			}
		}
	} else {
		zone_size_sector = 0;
	}

	/* We can call pread/pwrite if 'fd' is non-zoned device/file */
	if (zone_size_sector == 0) {
		if (rw == READ)
			return pread(fd, buf, count, offset);
		return pwrite(fd, buf, count, offset);
	}

	ASSERT(IS_ALIGNED(zone_size_sector, sb_size_sector));

	zone_num = sb_bytenr_to_sb_zone(offset, ilog2(zone_size_sector) + SECTOR_SHIFT);

	/* Room for the report header plus the two superblock log zones */
	rep_size = sizeof(struct blk_zone_report) + sizeof(struct blk_zone) * 2;
	rep = calloc(1, rep_size);
	if (!rep) {
		error_msg(ERROR_MSG_MEMORY, "zone report");
		exit(1);
	}

	rep->sector = zone_num * (sector_t)zone_size_sector;
	rep->nr_zones = 2;

	ret = ioctl(fd, BLKREPORTZONE, rep);
	if (ret) {
		if (errno == ENOTTY || errno == EINVAL) {
			/*
			 * Note: older kernels before 5.11 could return EINVAL
			 * in case the ioctl is not available, which is wrong.
			 */
			error("zoned: BLKREPORTZONE failed but BLKGETZONESZ works: %m");
			exit(1);
		}
		error("zoned: ioctl BLKREPORTZONE failed: %m");
		exit(1);
	}

	if (rep->nr_zones != 2) {
		if (errno == ENOENT || errno == 0) {
			/* Release the report on this early return as well */
			kfree(rep);
			return (rw == WRITE ? count : 0);
		}
		error("zoned: failed to read zone info of %u and %u: %m",
		      zone_num, zone_num + 1);
		kfree(rep);
		return 0;
	}

	/* The zone descriptors follow the report header */
	zones = (struct blk_zone *)(rep + 1);

	ret = sb_log_location(fd, zones, rw, &mapped);
	kfree(rep);
	/*
	 * Special case: no superblock found in the zones. This case happens
	 * when initializing a file-system.
	 */
	if (rw == READ && ret == -ENOENT) {
		memset(buf, 0, count);
		return count;
	}
	if (ret)
		return ret;

	if (rw == READ)
		ret_sz = btrfs_pread(fd, buf, count, mapped, true);
	else
		ret_sz = btrfs_pwrite(fd, buf, count, mapped, true);

	if (ret_sz != count)
		return ret_sz;

	/* Call fsync() to force the write order */
	if (rw == WRITE && fsync(fd)) {
		error("failed to synchronize superblock: %m");
		exit(1);
	}

	return ret_sz;
}
/**
 * btrfs_find_allocatable_zones - find allocatable zones within a given region
 *
 * @device:	the device to allocate a region on
 * @hole_start: the position of the hole to allocate the region
 * @hole_end:	the end of the hole
 * @num_bytes:	size of wanted region
 * @return:	position of allocatable zones, or @hole_end if the wanted
 *		region would extend past the last zone of the device
 *
 * Allocatable region should not contain any superblock locations.
 *
 * @hole_start and @num_bytes must be aligned to the zone size. All zones of
 * the returned region are of one kind: either all conventional, or all
 * sequential and empty.
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	int shift = ilog2(zinfo->zone_size);
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool is_sequential;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		bool usable = true;

		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)
			return hole_end;

		/*
		 * The zones must be all sequential (and empty), or
		 * conventional
		 */
		is_sequential = btrfs_dev_is_sequential(device, pos);
		for (i = 0; i < end - begin; i++) {
			u64 zone_offset = pos + ((u64)i << shift);

			if ((is_sequential &&
			     !btrfs_dev_is_empty_zone(device, zone_offset)) ||
			    (is_sequential !=
			     btrfs_dev_is_sequential(device, zone_offset))) {
				usable = false;
				break;
			}
		}

		/*
		 * One of the zones cannot be used: retry the whole check one
		 * zone further, so that begin/end are recomputed for the new
		 * position.
		 */
		if (!usable) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			/* Skip past the superblock log zones if they overlap */
			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		if (!have_sb)
			break;
	}

	return pos;
}
/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. It points to the end
 * of the highest addressed extent in the block group, expressed as an offset
 * from the block group start.
 *
 * @offset_ret is set to 0 when btrfs_previous_extent_item() reports that
 * there is no extent item (return value 1).
 *
 * Returns 0 on success, -ENOMEM if no path can be allocated, -EUCLEAN if an
 * item exists exactly at the block group end or the found extent does not
 * lie within the block group, or another negative error from the tree search.
 */
static int calculate_alloc_pointer(struct btrfs_fs_info *fs_info,
				   struct btrfs_block_group *cache,
				   u64 *offset_ret)
{
	struct btrfs_root *root = btrfs_extent_root(fs_info, cache->start);
	const u64 bg_end = cache->start + cache->length;
	struct btrfs_key search_key = {
		.objectid = bg_end,
		.type = 0,
		.offset = 0,
	};
	struct btrfs_key found;
	struct btrfs_path *path;
	u64 extent_len;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	/* There should be no exact match (ie. an extent) at this address */
	if (ret == 0)
		ret = -EUCLEAN;
	if (ret < 0)
		goto out;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret == 1) {
		/* No extent item found: nothing allocated yet */
		*offset_ret = 0;
		ret = 0;
		goto out;
	}
	if (ret)
		goto out;

	btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);

	/* Anything but an EXTENT_ITEM is taken to be nodesize bytes long */
	extent_len = (found.type == BTRFS_EXTENT_ITEM_KEY) ?
		     found.offset : fs_info->nodesize;

	if (found.objectid < cache->start ||
	    found.objectid + extent_len > bg_end) {
		ret = -EUCLEAN;
		goto out;
	}

	*offset_ret = found.objectid + extent_len - cache->start;
	ret = 0;

out:
	btrfs_free_path(path);
	return ret;
}
/*
 * Tell whether a block group profile can be used in zoned mode.
 *
 * @map_type: block group flags (type and profile bits)
 * @rst:      true if a raid-stripe-tree is available
 *
 * SINGLE is always supported. DUP, RAID1*, RAID0 and RAID10 are supported for
 * metadata/system block groups, and for data block groups only when @rst is
 * set. All other profiles are not supported yet.
 */
bool zoned_profile_supported(u64 map_type, bool rst)
{
	const u64 known_profiles = BTRFS_BLOCK_GROUP_DUP |
				   BTRFS_BLOCK_GROUP_RAID1_MASK |
				   BTRFS_BLOCK_GROUP_RAID0 |
				   BTRFS_BLOCK_GROUP_RAID10;
	const u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	const bool is_data = !!(map_type & BTRFS_BLOCK_GROUP_DATA);

	/* SINGLE */
	if (profile == 0)
		return true;

	/* All other profiles are not supported yet */
	if (!(profile & known_profiles))
		return false;

	/* Data block groups with these profiles need a raid-stripe-tree. */
	return is_data ? rst : true;
}
/*
 * Initialize the allocation and write offsets of a block group from the state
 * of the zones it is mapped to.
 *
 * Does nothing on a non-zoned filesystem. For every stripe of the chunk an
 * allocation offset is derived: a marker for missing devices and conventional
 * zones, or a value taken from the zone condition and write pointer. When at
 * least one stripe is on a conventional zone, the offset is also calculated
 * from the extent tree by calculate_alloc_pointer().
 *
 * Returns 0 on success, or -EIO, -ENOENT, -ENOMEM, -EINVAL or an error from
 * calculate_alloc_pointer() on failure.
 */
int btrfs_load_block_group_zone_info(struct btrfs_fs_info *fs_info,
				     struct btrfs_block_group *cache)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	const u64 logical = cache->start;
	const u64 length = cache->length;
	struct cache_extent *ce;
	struct map_lookup *map;
	u64 *alloc_offsets = NULL;
	u64 last_alloc = 0;
	u32 num_conventional = 0;
	int ret = 0;
	int i;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (logical == BTRFS_BLOCK_RESERVED_1M_FOR_SUPER) {
		if (length + SZ_1M != fs_info->zone_size) {
			error("zoned: unaligned initial system block group");
			return -EIO;
		}
	} else if (!IS_ALIGNED(length, fs_info->zone_size)) {
		error("zoned: unaligned block group at %llu + %llu", logical,
		      length);
		return -EIO;
	}

	/* Get the chunk mapping */
	ce = search_cache_extent(&map_tree->cache_tree, logical);
	if (!ce) {
		error("zoned: failed to find block group at %llu", logical);
		return -ENOENT;
	}
	map = container_of(ce, struct map_lookup, ce);

	alloc_offsets = calloc(map->num_stripes, sizeof(*alloc_offsets));
	if (!alloc_offsets) {
		error_msg(ERROR_MSG_MEMORY, "zone offsets");
		return -ENOMEM;
	}

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 physical = map->stripes[i].physical;
		struct blk_zone zone;

		if (device->fd == -1) {
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		}

		if (!btrfs_dev_is_sequential(device, physical)) {
			num_conventional++;
			alloc_offsets[i] = WP_CONVENTIONAL;
			continue;
		}

		/*
		 * The group is mapped to a sequential zone. Get the zone write
		 * pointer to determine the allocation offset within the zone.
		 */
		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
		zone = device->zone_info->zones[physical / fs_info->zone_size];

		switch (zone.cond) {
		case BLK_ZONE_COND_OFFLINE:
		case BLK_ZONE_COND_READONLY:
			error(
		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
			      physical / fs_info->zone_size, device->name,
			      device->devid);
			alloc_offsets[i] = WP_MISSING_DEV;
			break;
		case BLK_ZONE_COND_EMPTY:
			alloc_offsets[i] = 0;
			break;
		case BLK_ZONE_COND_FULL:
			alloc_offsets[i] = fs_info->zone_size;
			break;
		default:
			/* Partially used zone */
			alloc_offsets[i] =
				((zone.wp - zone.start) << SECTOR_SHIFT);
			break;
		}
	}

	if (num_conventional > 0) {
		ret = calculate_alloc_pointer(fs_info, cache, &last_alloc);
		if (ret) {
			error(
		"zoned: failed to determine allocation offset of block group %llu",
			      cache->start);
			goto out;
		}
		/* Only conventional zones: the extent tree is authoritative */
		if (map->num_stripes == num_conventional) {
			cache->alloc_offset = last_alloc;
			goto out;
		}
	}

	if (!zoned_profile_supported(map->type, !!fs_info->stripe_root)) {
		error("zoned: profile %s not yet supported",
		      btrfs_group_profile_str(map->type));
		ret = -EINVAL;
		goto out;
	}
	cache->alloc_offset = alloc_offsets[0];

out:
	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
		error(
		"zoned: got wrong write pointer in block group %llu: %llu > %llu",
		      logical, last_alloc, cache->alloc_offset);
		ret = -EIO;
	}

	if (!ret)
		cache->write_offset = cache->alloc_offset;

	kfree(alloc_offsets);
	return ret;
}
/*
 * Keep tree block writes in line with the write position of a block group.
 *
 * @start, @end: inclusive byte range that is about to be written
 *
 * Returns false on a non-zoned filesystem. If the write position of the block
 * group containing @start is before @start, the tree block at the write
 * position is marked dirty and true is returned. Otherwise the write position
 * is advanced by the length of the range and false is returned.
 */
bool btrfs_redirty_extent_buffer_for_zoned(struct btrfs_fs_info *fs_info,
					   u64 start, u64 end)
{
	struct btrfs_block_group *bg;
	struct extent_buffer *eb;
	u64 write_pos;

	if (!btrfs_is_zoned(fs_info))
		return false;

	bg = btrfs_lookup_first_block_group(fs_info, start);
	BUG_ON(!bg);

	write_pos = bg->start + bg->write_offset;
	if (write_pos >= start) {
		bg->write_offset += (end + 1 - start);
		return false;
	}

	/* There is a gap before @start: dirty the block at the write position */
	BUG_ON(write_pos + fs_info->nodesize > start);
	eb = btrfs_find_create_tree_block(fs_info, write_pos);
	btrfs_mark_buffer_dirty(eb);
	free_extent_buffer(eb);

	return true;
}
/*
 * Reset the zone at byte @offset on the device with id @devid.
 *
 * Devices with a different id are skipped, and nothing is done if the zone
 * at @offset is not sequential. @length is not used by this function.
 *
 * Returns 0 on success or -EIO if the zone reset fails.
 */
int btrfs_reset_chunk_zones(struct btrfs_fs_info *fs_info, u64 devid,
			    u64 offset, u64 length)
{
	struct btrfs_device *device;

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		struct btrfs_zoned_device_info *zinfo = device->zone_info;
		u64 zone_idx;

		if (device->devid != devid || !zone_is_sequential(zinfo, offset))
			continue;

		zone_idx = offset / zinfo->zone_size;
		if (btrfs_reset_dev_zone(device->fd, &zinfo->zones[zone_idx])) {
			error("zoned: failed to reset zone %llu: %m",
			      offset / zinfo->zone_size);
			return -EIO;
		}
	}

	return 0;
}
/*
 * Reset the first zone of every device in @fs_devices that has zone
 * information attached.
 *
 * Returns 0 on success, or the first error from btrfs_reset_dev_zone().
 */
int btrfs_wipe_temporary_sb(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
		int ret;

		/* Devices without zone information are skipped */
		if (!dev->zone_info)
			continue;

		ret = btrfs_reset_dev_zone(dev->fd, &dev->zone_info->zones[0]);
		if (ret)
			return ret;
	}

	return 0;
}
/*
 * Check that both superblock log zones belonging to the superblock copy at
 * @bytenr exist on @device, i.e. that the zone following the first log zone
 * is still within the device's zones.
 */
bool btrfs_sb_zone_exists(struct btrfs_device *device, u64 bytenr)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u32 first_zone = sb_bytenr_to_sb_zone(bytenr,
						    ilog2(zinfo->zone_size));

	return first_zone + 1 <= zinfo->nr_zones - 1;
}
#endif
/*
 * Load zone information for every present device of a ZONED filesystem.
 *
 * Returns 0 if the filesystem does not have the ZONED incompat flag or if all
 * devices were processed, otherwise the first error from
 * btrfs_get_dev_zone_info().
 */
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *device;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		int ret;

		/* We can skip reading of zone info for missing devices */
		if (device->fd == -1)
			continue;

		ret = btrfs_get_dev_zone_info(device);
		if (ret)
			return ret;
	}

	return 0;
}
/*
 * Load zone information for one device, unless the filesystem is not ZONED
 * or the device already has zone information attached.
 *
 * Returns 0 when there is nothing to do, otherwise the result of
 * btrfs_get_zone_info().
 */
static int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * yet be set.
	 */
	if (!btrfs_fs_incompat(device->fs_info, ZONED) || device->zone_info)
		return 0;

	return btrfs_get_zone_info(device->fd, device->name, &device->zone_info);
}
/*
 * Collect zone information for the device @file, opened as @fd.
 *
 * *zinfo_ret is reset to NULL first. When built with BTRFS_ZONED, a new
 * btrfs_zoned_device_info is allocated, tagged with the device's zoned model,
 * filled in by report_zones() and handed back through @zinfo_ret; if
 * report_zones() fails, the structure is freed and its error is returned.
 * An allocation failure terminates the program.
 *
 * When built without BTRFS_ZONED no information is collected: -EOPNOTSUPP is
 * returned for host-managed devices and 0 otherwise, with *zinfo_ret left
 * NULL.
 */
int btrfs_get_zone_info(int fd, const char *file,
struct btrfs_zoned_device_info **zinfo_ret)
{
#ifdef BTRFS_ZONED
struct btrfs_zoned_device_info *zinfo;
int ret;
#endif
enum btrfs_zoned_model model;
*zinfo_ret = NULL;
/* Check zone model */
model = zoned_model(file);
#ifdef BTRFS_ZONED
zinfo = calloc(1, sizeof(*zinfo));
if (!zinfo) {
error_msg(ERROR_MSG_MEMORY, "zone information");
exit(1);
}
/* report_zones() reads zinfo->model to choose real or emulated reporting */
zinfo->model = model;
/* Get zone information */
ret = report_zones(fd, file, zinfo);
if (ret != 0) {
kfree(zinfo);
return ret;
}
*zinfo_ret = zinfo;
#else
/*
 * NOTE(review): both messages below are printed for every model other
 * than host-managed, including ZONED_NONE - confirm this is intended.
 */
error("zoned: %s: unsupported host-%s zoned block device", file,
model == ZONED_HOST_MANAGED ? "managed" : "aware");
if (model == ZONED_HOST_MANAGED)
return -EOPNOTSUPP;
error("zoned: %s: handling host-aware block device as a regular disk",
file);
#endif
return 0;
}
/*
 * Verify that the devices of a filesystem are usable in (or outside of)
 * zoned mode and set up fs_info for zoned operation.
 *
 * Devices that are not open (fd == -1) are ignored. The check fails with
 * -EINVAL when:
 *  - a device that must be treated as zoned has no zone information although
 *    the ZONED incompat flag is set,
 *  - the zone sizes of the devices differ,
 *  - the ZONED flag is set but no zoned device was found, or vice versa,
 *  - zoned and regular devices are mixed,
 *  - the zone size is not aligned to BTRFS_STRIPE_LEN,
 *  - the MIXED_GROUPS incompat flag is set.
 *
 * On success for a zoned filesystem, fs_info->zone_size and the chunk
 * allocation policy are set. Returns 0 on success (also when there is nothing
 * zoned at all).
 */
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 zoned_devices = 0;
	u64 nr_devices = 0;
	u64 zone_size = 0;
	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
	int ret = 0;

	/* Count zoned devices */
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		enum btrfs_zoned_model model;

		if (device->fd == -1)
			continue;

		model = zoned_model(device->name);
		/*
		 * A Host-Managed zoned device must be used as a zoned device.
		 * A Host-Aware zoned device and a non-zoned devices can be
		 * treated as a zoned device, if ZONED flag is enabled in the
		 * superblock.
		 */
		if (model == ZONED_HOST_MANAGED ||
		    (model == ZONED_HOST_AWARE && incompat_zoned) ||
		    (model == ZONED_NONE && incompat_zoned)) {
			struct btrfs_zoned_device_info *zone_info =
				device->zone_info;

			zoned_devices++;
			if (!zone_info) {
				/*
				 * Zone information is only loaded when the
				 * ZONED flag is set, and it may be absent
				 * altogether in a build without zoned
				 * support. Without the flag the device is
				 * still counted, so that the mismatch is
				 * reported after the loop.
				 */
				if (incompat_zoned) {
					error(
		"zoned: no zone information for device %s",
					      device->name);
					ret = -EINVAL;
					goto out;
				}
			} else if (!zone_size) {
				zone_size = zone_info->zone_size;
			} else if (zone_info->zone_size != zone_size) {
				error(
		"zoned: unequal block device zone sizes: have %llu found %llu",
				      device->zone_info->zone_size,
				      zone_size);
				ret = -EINVAL;
				goto out;
			}
		}
		nr_devices++;
	}

	if (!zoned_devices && !incompat_zoned)
		goto out;

	if (!zoned_devices && incompat_zoned) {
		/* No zoned block device found on ZONED filesystem */
		error("zoned: no zoned devices found on a zoned filesystem");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices && !incompat_zoned) {
		error("zoned: mode not enabled but zoned device found");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices != nr_devices) {
		error("zoned: cannot mix zoned and regular devices");
		ret = -EINVAL;
		goto out;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		error("zoned: zone size %llu not aligned to stripe %u",
		      zone_size, BTRFS_STRIPE_LEN);
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		error("zoned: mixed block groups not supported");
		ret = -EINVAL;
		goto out;
	}

	fs_info->zone_size = zone_size;
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;

out:
	return ret;
}