btrfs-progs/kernel-shared/zoned.c

// SPDX-License-Identifier: GPL-2.0
/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include "kerncompat.h"
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include "kernel-lib/list.h"
#include "kernel-shared/volumes.h"
#include "kernel-shared/zoned.h"
#include "kernel-shared/accessors.h"
#include "kernel-shared/ctree.h"
#include "kernel-shared/extent_io.h"
#include "kernel-shared/uapi/btrfs.h"
#include "kernel-shared/uapi/btrfs_tree.h"
#include "common/utils.h"
#include "common/device-utils.h"
#include "common/extent-cache.h"
#include "common/internal.h"
#include "common/string-utils.h"
#include "common/messages.h"
#include "mkfs/common.h"

/* Maximum number of zones to report per ioctl(BLKREPORTZONE) call */
#define BTRFS_REPORT_NR_ZONES  		4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV			((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL			((u64)-2)

/* Zone size used for emulation when no "zone-size" parameter is configured */
#define DEFAULT_EMULATED_ZONE_SIZE		SZ_256M

/*
 * Zone size (in bytes) used when emulating zones on a non-zoned device.
 * Updated by zone_size() and consumed by emulate_report_zones().
 */
static u64 emulated_zone_size = DEFAULT_EMULATED_ZONE_SIZE;

/*
 * Minimum / maximum supported zone size. Currently, SMR disks have a zone size
 * of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.  We do
 * not expect the zone size to become larger than 8GiB or smaller than 4MiB in
 * the near future.
 */
#define BTRFS_MAX_ZONE_SIZE		(8ULL * SZ_1G)
#define BTRFS_MIN_ZONE_SIZE		(SZ_4M)

/* Defined near the end of the file, used by btrfs_get_dev_zone_info_all_devices() */
static int btrfs_get_dev_zone_info(struct btrfs_device *device);

/*
 * Report the zoned model of @file.
 *
 * Anything that is not a block device, or whose "zoned" queue parameter
 * cannot be read, is reported as ZONED_NONE. The parameter value is matched
 * by prefix against "host-aware" and "host-managed".
 *
 * If stat() fails an error is printed and -ENOENT is returned through the
 * enum return type.
 */
enum btrfs_zoned_model zoned_model(const char *file)
{
	static const char managed_str[] = "host-managed";
	static const char aware_str[] = "host-aware";
	char value[32];
	struct stat sb;

	if (stat(file, &sb) < 0) {
		error("zoned: unable to stat %s", file);
		return -ENOENT;
	}

	/* Only block devices can be zoned, regular files never are */
	if (!S_ISBLK(sb.st_mode))
		return ZONED_NONE;

	if (device_get_queue_param(file, "zoned", value, sizeof(value)) <= 0)
		return ZONED_NONE;

	/* The two prefixes are mutually exclusive, so the order is irrelevant */
	if (!strncmp(value, managed_str, sizeof(managed_str) - 1))
		return ZONED_HOST_MANAGED;
	if (!strncmp(value, aware_str, sizeof(aware_str) - 1))
		return ZONED_HOST_AWARE;

	return ZONED_NONE;
}

u64 zone_size(const char *file)
{
	char chunk[32];
	int ret;

	/* Zoned emulation on regular device */
	if (zoned_model(file) == ZONED_NONE) {
		const char *tmp;
		u64 size = DEFAULT_EMULATED_ZONE_SIZE;

		tmp = bconf_param_value("zone-size");
		if (tmp) {
			size = arg_strtou64_with_suffix(tmp);
			if (!is_power_of_2(size) || size < BTRFS_MIN_ZONE_SIZE ||
			    size > BTRFS_MAX_ZONE_SIZE) {
				error("invalid emulated zone size %llu", size);
				exit(1);
			}
		}
		emulated_zone_size = size;
		return emulated_zone_size;
	}

	ret = device_get_queue_param(file, "chunk_sectors", chunk, sizeof(chunk));
	if (ret <= 0)
		return 0;

	return strtoull((const char *)chunk, NULL, 10) << SECTOR_SHIFT;
}

/*
 * Read the "zone_append_max_bytes" queue parameter of @file.
 *
 * Returns the parsed value, or 0 when the parameter cannot be read.
 */
static u64 max_zone_append_size(const char *file)
{
	char value[32];

	if (device_get_queue_param(file, "zone_append_max_bytes", value,
				   sizeof(value)) > 0)
		return strtoull((const char *)value, NULL, 10);

	return 0;
}

#ifdef BTRFS_ZONED
/*
 * Stand-in for a zone report on a non-zoned device.
 *
 * Starting at byte offset @pos, fill @zones with up to @nr_zones consecutive
 * conventional zones of emulated_zone_size bytes each. Filling stops after
 * the first zone whose end reaches or passes the device size.
 *
 * Returns the number of entries filled, or -EIO if fstat() fails.
 */
static int emulate_report_zones(const char *file, int fd, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = emulated_zone_size >> SECTOR_SHIFT;
	const u64 first_sector = pos >> SECTOR_SHIFT;
	unsigned int filled = 0;
	sector_t bdev_size;
	struct stat st;

	if (fstat(fd, &st) < 0) {
		error("unable to stat %s: %m", file);
		return -EIO;
	}

	bdev_size = device_get_partition_size_fd_stat(fd, &st) >> SECTOR_SHIFT;

	while (filled < nr_zones) {
		struct blk_zone *zone = &zones[filled];

		zone->start = filled * zone_sectors + first_sector;
		zone->len = zone_sectors;
		zone->capacity = zone_sectors;
		/* Emulated zones are reported with the pointer at the zone end */
		zone->wp = zone->start + zone_sectors;
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->cond = BLK_ZONE_COND_NOT_WP;
		filled++;

		/* This zone already covers the end of the device */
		if (zone->wp >= bdev_size)
			break;
	}

	return filled;
}

/*
 * Work out the write position of a superblock log kept in two sequential
 * zones (@zones[0] and @zones[1]).
 *
 * Possible states of log buffer zones
 *
 *           Empty[0]  In use[0]  Full[0]
 * Empty[1]         *          x        0
 * In use[1]        0          x        0
 * Full[1]          1          1        C
 *
 * Log position:
 *   *: Special case, no superblock is written
 *   0: Use write pointer of zones[0]
 *   1: Use write pointer of zones[1]
 *   C: Compare super blocks from zones[0] and zones[1], use the latest
 *      one determined by generation
 *   x: Invalid state
 *
 * On success the position is stored in @wp_ret (bytes) and 0 is returned.
 * When both zones are empty @wp_ret is set to the start of zones[0] and
 * -ENOENT is returned. -EIO is returned when a superblock cannot be read and
 * -EUCLEAN for an invalid zone state combination.
 */
static int sb_write_pointer(int fd, struct blk_zone *zones, u64 *wp_ret)
{
	bool first_empty, second_empty;
	bool first_full, second_full;
	sector_t sector;

	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);

	first_empty = (zones[0].cond == BLK_ZONE_COND_EMPTY);
	second_empty = (zones[1].cond == BLK_ZONE_COND_EMPTY);
	first_full = (zones[0].cond == BLK_ZONE_COND_FULL);
	second_full = (zones[1].cond == BLK_ZONE_COND_FULL);

	if (first_empty && second_empty) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = (zones[0].start << SECTOR_SHIFT);
		return -ENOENT;
	}

	if (first_full && second_full) {
		/* Both zones are full, the last block of each holds a super block */
		u8 raw[BTRFS_NR_SB_LOG_ZONES][BTRFS_SUPER_INFO_SIZE];
		struct btrfs_super_block *sb_first;
		struct btrfs_super_block *sb_second;
		int idx;

		for (idx = 0; idx < BTRFS_NR_SB_LOG_ZONES; idx++) {
			const u64 zone_end = (zones[idx].start + zones[idx].len)
					     << SECTOR_SHIFT;
			int nread;

			nread = pread(fd, raw[idx], BTRFS_SUPER_INFO_SIZE,
				      zone_end - BTRFS_SUPER_INFO_SIZE);
			if (nread != BTRFS_SUPER_INFO_SIZE)
				return -EIO;
		}
		sb_first = (struct btrfs_super_block *)&raw[0];
		sb_second = (struct btrfs_super_block *)&raw[1];

		/*
		 * The zone holding the older copy is the one to write next.
		 * NOTE(review): the generation fields are compared as stored,
		 * without an endianness accessor - confirm this is intended.
		 */
		if (sb_first->generation > sb_second->generation)
			sector = zones[1].start;
		else
			sector = zones[0].start;
	} else if (first_full) {
		/* Only zones[0] is full, the log continues in zones[1] */
		sector = zones[1].wp;
	} else if (second_empty || second_full) {
		/* zones[0] is not full while zones[1] is either empty or full */
		sector = zones[0].wp;
	} else {
		return -EUCLEAN;
	}

	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}

/*
 * Reset a single zone through ioctl(BLKRESETZONE).
 *
 * Conventional zones and zones already in the empty condition are left
 * untouched. After a successful reset the cached descriptor @zone is updated
 * to the empty condition with its write pointer at the zone start.
 *
 * Returns 0 on success or when nothing had to be done, -errno if the ioctl
 * fails.
 */
int btrfs_reset_dev_zone(int fd, struct blk_zone *zone)
{
	struct blk_zone_range range;
	const bool needs_reset = zone->type != BLK_ZONE_TYPE_CONVENTIONAL &&
				 zone->cond != BLK_ZONE_COND_EMPTY;

	/* Nothing to do if it is already empty */
	if (!needs_reset)
		return 0;

	range.nr_sectors = zone->len;
	range.sector = zone->start;
	if (ioctl(fd, BLKRESETZONE, &range) < 0)
		return -errno;

	/* Keep the cached zone state in sync with the device */
	zone->wp = zone->start;
	zone->cond = BLK_ZONE_COND_EMPTY;

	return 0;
}

/*
 * Fill @zinfo with the zone layout of the device @file opened as @fd.
 *
 * Sets zinfo->zone_size, zinfo->nr_zones, zinfo->zones and zinfo->emulated.
 * zinfo->model must already be set by the caller: it selects between the
 * BLKREPORTZONE ioctl and the emulated report.
 *
 * Returns 0 on success and -EIO if fstat() fails. All other failures
 * (invalid zone size, unreadable device size, missing zone append support,
 * allocation failure, failing ioctl) terminate the program via exit(1).
 */
static int report_zones(int fd, const char *file,
			struct btrfs_zoned_device_info *zinfo)
{
	u64 device_size;
	u64 zone_bytes = zone_size(file);
	size_t rep_size;
	u64 sector = 0;
	struct stat st;
	struct blk_zone_report *rep;
	struct blk_zone *zone;
	unsigned int i, n = 0;
	int ret;

	/*
	 * Zones are guaranteed (by kernel) to be a power of 2 number of
	 * sectors. Check this here and make sure that zones are not too small.
	 * A zone size of 0 is rejected with the same message.
	 */
	if (!zone_bytes || !is_power_of_2(zone_bytes)) {
		error("zoned: illegal zone size %llu (not a power of 2)",
		      zone_bytes);
		exit(1);
	}
	/*
	 * The zone size must be large enough to hold the initial system
	 * block group for mkfs time.
	 */
	if (zone_bytes < BTRFS_MKFS_SYSTEM_GROUP_SIZE) {
		error("zoned: illegal zone size %llu (smaller than %d)",
		      zone_bytes, BTRFS_MKFS_SYSTEM_GROUP_SIZE);
		exit(1);
	}

	ret = fstat(fd, &st);
	if (ret < 0) {
		error("error when reading zone info on %s: %m", file);
		return -EIO;
	}

	device_size = device_get_partition_size_fd_stat(fd, &st);
	if (device_size == 0) {
		error("zoned: failed to read size of %s: %m", file);
		exit(1);
	}

	/* Allocate the zone information array */
	zinfo->zone_size = zone_bytes;
	zinfo->nr_zones = device_size / zone_bytes;

	if (zinfo->zone_size > BTRFS_MAX_ZONE_SIZE) {
		error("zoned: zone size %llu larger than supported maximum %llu",
		      zinfo->zone_size, BTRFS_MAX_ZONE_SIZE);
		exit(1);
	} else if (zinfo->zone_size < BTRFS_MIN_ZONE_SIZE) {
		error("zoned: zone size %llu smaller than supported minimum %u",
		      zinfo->zone_size, BTRFS_MIN_ZONE_SIZE);
		exit(1);
	}

	/* Count a trailing partial zone as well (zone_bytes is a power of 2) */
	if (device_size & (zone_bytes - 1))
		zinfo->nr_zones++;

	if (zoned_model(file) != ZONED_NONE && max_zone_append_size(file) == 0) {
		error(
		"zoned: device %s does not support ZONE_APPEND command", file);
		exit(1);
	}

	zinfo->zones = calloc(zinfo->nr_zones, sizeof(struct blk_zone));
	if (!zinfo->zones) {
		error_msg(ERROR_MSG_MEMORY, "zone information");
		exit(1);
	}

	/* Allocate a zone report */
	rep_size = sizeof(struct blk_zone_report) +
		   sizeof(struct blk_zone) * BTRFS_REPORT_NR_ZONES;
	rep = kmalloc(rep_size, GFP_KERNEL);
	if (!rep) {
		error_msg(ERROR_MSG_MEMORY, "zone report");
		exit(1);
	}

	/* Get zone information */
	/* The zone descriptors are stored right behind the report header */
	zone = (struct blk_zone *)(rep + 1);
	while (n < zinfo->nr_zones) {
		memset(rep, 0, rep_size);
		rep->sector = sector;
		rep->nr_zones = BTRFS_REPORT_NR_ZONES;

		if (zinfo->model != ZONED_NONE) {
			ret = ioctl(fd, BLKREPORTZONE, rep);
			if (ret != 0) {
				error("zoned: ioctl BLKREPORTZONE failed (%m)");
				exit(1);
			}
			zinfo->emulated = false;
		} else {
			/*
			 * The number of emulated zones returned here is not
			 * written back to rep->nr_zones, which keeps the
			 * requested BTRFS_REPORT_NR_ZONES. The copy loop below
			 * is bounded by zinfo->nr_zones instead.
			 */
			ret = emulate_report_zones(file, fd,
						   sector << SECTOR_SHIFT,
						   zone, BTRFS_REPORT_NR_ZONES);
			if (ret < 0) {
				error("zoned: failed to emulate BLKREPORTZONE");
				exit(1);
			}
			zinfo->emulated = true;
		}

		if (!rep->nr_zones)
			break;

		for (i = 0; i < rep->nr_zones; i++) {
			if (n >= zinfo->nr_zones)
				break;
			memcpy(&zinfo->zones[n], &zone[i],
			       sizeof(struct blk_zone));
			n++;
		}

		/* Continue the next report right after the last reported zone */
		sector = zone[rep->nr_zones - 1].start +
			 zone[rep->nr_zones - 1].len;
	}

	kfree(rep);

	return 0;
}

/*
 * Discard blocks in the zones of a zoned block device. Process this with zone
 * size granularity so that blocks in conventional zones are discarded using
 * discard_range and blocks in sequential zones are reset though a zone reset.
 */
int btrfs_reset_all_zones(int fd, struct btrfs_zoned_device_info *zinfo)
{
	unsigned int i;
	int ret = 0;

	ASSERT(zinfo);

	/* Zone size granularity */
	for (i = 0; i < zinfo->nr_zones; i++) {
		if (zinfo->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
			ret = device_discard_blocks(fd,
					     zinfo->zones[i].start << SECTOR_SHIFT,
					     zinfo->zone_size);
			if (ret == EOPNOTSUPP)
				ret = 0;
		} else if (zinfo->zones[i].cond != BLK_ZONE_COND_EMPTY) {
			ret = btrfs_reset_dev_zone(fd, &zinfo->zones[i]);
		} else {
			ret = 0;
		}

		if (ret)
			return ret;
	}
	return fsync(fd);
}

/*
 * Zero the byte range [@start, @start + @len) on @fd, one zone at a time.
 *
 * Zones for which zone_is_sequential() reports true are skipped, so only the
 * remaining zones are written with device_zero_blocks().
 *
 * Returns 0 on success or the first non-zero result of device_zero_blocks().
 */
int zero_zone_blocks(int fd, struct btrfs_zoned_device_info *zinfo, off_t start,
		     size_t len)
{
	const size_t zone_len = zinfo->zone_size;
	size_t remaining = len;
	off_t pos = start;

	/* Make sure that device_zero_blocks does not write sequential zones */
	while (remaining > 0) {
		/* Bytes left between pos and the end of the zone containing it */
		const size_t room = zone_len - (pos & (zone_len - 1));
		size_t chunk = min_t(size_t, remaining, zone_len);
		int ret;

		/* Never let a single call cross a zone boundary */
		if (chunk > room)
			chunk = room;

		if (!zone_is_sequential(zinfo, pos)) {
			ret = device_zero_blocks(fd, pos, chunk, true);
			if (ret != 0)
				return ret;
		}

		pos += chunk;
		remaining -= chunk;
	}

	return 0;
}

/*
 * Translate a pair of superblock log zones into the byte offset to use for a
 * superblock read or write (@rw) and store it in @bytenr_ret.
 *
 * If zones[0] is conventional its start is used; otherwise, if zones[1] is
 * conventional, the start of zones[1] is used. For two sequential zones the
 * position comes from sb_write_pointer():
 *  - WRITE: the write position itself; if it sits at the start of a
 *    non-empty zone, that zone is reset first.
 *  - otherwise: the superblock preceding the write position, unless
 *    sb_write_pointer() reported -ENOENT, in which case the position is used
 *    unchanged.
 *
 * Returns 0 on success, or a negative error from sb_write_pointer() (other
 * than -ENOENT) or from btrfs_reset_dev_zone().
 */
static int sb_log_location(int fd, struct blk_zone *zones, int rw, u64 *bytenr_ret)
{
	const u64 first_start = zones[0].start << SECTOR_SHIFT;
	const u64 second_start = zones[1].start << SECTOR_SHIFT;
	struct blk_zone *reset = NULL;
	u64 wp;
	int ret;

	/* Use the head of the zones if either zone is conventional */
	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = first_start;
		return 0;
	}
	if (zones[1].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = second_start;
		return 0;
	}

	ret = sb_write_pointer(fd, zones, &wp);
	if (ret < 0 && ret != -ENOENT)
		return ret;

	if (rw != WRITE) {
		if (ret != -ENOENT) {
			/* For READ, we want the previous one */
			if (wp == first_start)
				wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
			wp -= BTRFS_SUPER_INFO_SIZE;
		}
		*bytenr_ret = wp;
		return 0;
	}

	/* Writing at the head of a zone requires that zone to be empty */
	if (wp == first_start)
		reset = &zones[0];
	else if (wp == second_start)
		reset = &zones[1];

	if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
		ASSERT(reset->cond == BLK_ZONE_COND_FULL);

		ret = btrfs_reset_dev_zone(fd, reset);
		if (ret)
			return ret;
	}

	*bytenr_ret = wp;
	return 0;
}

/*
 * Map a superblock byte offset to the number of the zone holding its log.
 *
 * @bytenr must equal btrfs_sb_offset() of one of the superblock mirrors,
 * which is asserted. @zone_size_shift is passed on to sb_zone_number()
 * together with the matching mirror index.
 */
static u32 sb_bytenr_to_sb_zone(u64 bytenr, int zone_size_shift)
{
	int mirror = -1;
	int idx = 0;

	/* Stop at the first mirror whose offset matches */
	while (idx < BTRFS_SUPER_MIRROR_MAX && mirror == -1) {
		if (btrfs_sb_offset(idx) == bytenr)
			mirror = idx;
		idx++;
	}
	ASSERT(mirror != -1);

	return sb_zone_number(zone_size_shift, mirror);
}

/*
 * Read or write one superblock copy, taking the zoned superblock log into
 * account.
 *
 * @fd:     open file descriptor of the device or image
 * @buf:    buffer of BTRFS_SUPER_INFO_SIZE bytes to read into / write from
 * @offset: regular (non-zoned) byte offset of the superblock mirror
 * @rw:     READ or WRITE
 *
 * For regular files and block devices that report no zone size (or do not
 * support BLKGETZONESZ) this is a plain pread()/pwrite() at @offset.
 * Otherwise the two zones of the superblock log are queried and the I/O is
 * redirected to the location chosen by sb_log_location(). A successful write
 * is followed by fsync().
 *
 * Returns the number of bytes transferred. If the zone report does not
 * contain two zones, BTRFS_SUPER_INFO_SIZE (WRITE) or 0 (READ) is returned
 * without any I/O when errno is 0 or ENOENT, otherwise an error is printed
 * and 0 is returned. A negative error from sb_log_location() is returned
 * converted to size_t. Unexpected ioctl, allocation or fsync failures
 * terminate the program.
 */
size_t btrfs_sb_io(int fd, void *buf, off_t offset, int rw)
{
	size_t count = BTRFS_SUPER_INFO_SIZE;
	struct stat stat_buf;
	struct blk_zone_report *rep;
	struct blk_zone *zones;
	const u64 sb_size_sector = (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
	u64 mapped = U64_MAX;
	u32 zone_num;
	u32 zone_size_sector;
	size_t rep_size;
	int ret;
	size_t ret_sz;

	ASSERT(rw == READ || rw == WRITE);

	if (fstat(fd, &stat_buf) == -1) {
		error("fstat failed: %m");
		exit(1);
	}

	/* Do not call ioctl(BLKGETZONESZ) on a regular file. */
	if ((stat_buf.st_mode & S_IFMT) == S_IFBLK) {
		ret = ioctl(fd, BLKGETZONESZ, &zone_size_sector);
		if (ret < 0) {
			if (errno == ENOTTY || errno == EINVAL) {
				/*
				 * No kernel support, assuming non-zoned device.
				 *
				 * Note: older kernels before 5.11 could return
				 * EINVAL in case the ioctl is not available,
				 * which is wrong.
				 */
				zone_size_sector = 0;
			} else {
				error("zoned: ioctl BLKGETZONESZ failed: %m");
				exit(1);
			}
		}
	} else {
		zone_size_sector = 0;
	}

	/* We can call pread/pwrite if 'fd' is non-zoned device/file */
	if (zone_size_sector == 0) {
		if (rw == READ)
			return pread(fd, buf, count, offset);
		return pwrite(fd, buf, count, offset);
	}

	ASSERT(IS_ALIGNED(zone_size_sector, sb_size_sector));

	zone_num = sb_bytenr_to_sb_zone(offset, ilog2(zone_size_sector) + SECTOR_SHIFT);

	/* A superblock log occupies two consecutive zones */
	rep_size = sizeof(struct blk_zone_report) + sizeof(struct blk_zone) * 2;
	rep = calloc(1, rep_size);
	if (!rep) {
		error_msg(ERROR_MSG_MEMORY, "zone report");
		exit(1);
	}

	rep->sector = zone_num * (sector_t)zone_size_sector;
	rep->nr_zones = 2;

	ret = ioctl(fd, BLKREPORTZONE, rep);
	if (ret) {
		if (errno == ENOTTY || errno == EINVAL) {
			/*
			 * Note: older kernels before 5.11 could return EINVAL
			 * in case the ioctl is not available, which is wrong.
			 */
			error("zoned: BLKREPORTZONE failed but BLKGETZONESZ works: %m");
			exit(1);
		}
		error("zoned: ioctl BLKREPORTZONE failed: %m");
		exit(1);
	}
	if (rep->nr_zones != 2) {
		if (errno == ENOENT || errno == 0) {
			/* Release the report on this early exit as well */
			kfree(rep);
			return (rw == WRITE ? count : 0);
		}
		error("zoned: failed to read zone info of %u and %u: %m",
		      zone_num, zone_num + 1);
		kfree(rep);
		return 0;
	}

	/* The zone descriptors are stored right behind the report header */
	zones = (struct blk_zone *)(rep + 1);

	ret = sb_log_location(fd, zones, rw, &mapped);
	kfree(rep);
	/*
	 * Special case: no superblock found in the zones. This case happens
	 * when initializing a file-system.
	 */
	if (rw == READ && ret == -ENOENT) {
		memset(buf, 0, count);
		return count;
	}
	if (ret)
		return ret;

	if (rw == READ)
		ret_sz = btrfs_pread(fd, buf, count, mapped, true);
	else
		ret_sz = btrfs_pwrite(fd, buf, count, mapped, true);

	if (ret_sz != count)
		return ret_sz;

	/* Call fsync() to force the write order */
	if (rw == WRITE && fsync(fd)) {
		error("failed to synchronize superblock: %m");
		exit(1);
	}

	return ret_sz;
}

/**
 * btrfs_find_allocatable_zones - find allocatable zones within a given region
 *
 * @device:	the device to allocate a region on
 * @hole_start: the position of the hole to allocate the region
 * @hole_end:	the end of the hole
 * @num_bytes:	size of wanted region
 * @return:	position of allocatable zones
 *
 * Allocatable region should not contain any superblock locations.
 *
 * Both @hole_start and @num_bytes must be aligned to the zone size. The
 * candidate region is moved forward zone by zone until all of its zones are
 * of the same kind (all conventional, or all sequential and empty) and it
 * overlaps neither a superblock log zone nor a regular superblock position.
 * @hole_end is returned when the candidate would extend past the last zone of
 * the device; a return value that is not below @hole_end means the search ran
 * past the end of the hole.
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	int shift = ilog2(zinfo->zone_size);
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool is_sequential;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		bool usable = true;

		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)
			return hole_end;

		/*
		 * The zones must be all sequential (and empty), or
		 * conventional
		 */
		is_sequential = btrfs_dev_is_sequential(device, pos);
		for (i = 0; i < end - begin; i++) {
			u64 zone_offset = pos + ((u64)i << shift);

			if ((is_sequential &&
			     !btrfs_dev_is_empty_zone(device, zone_offset)) ||
			    (is_sequential !=
			     btrfs_dev_is_sequential(device, zone_offset))) {
				usable = false;
				break;
			}
		}
		/*
		 * Restart the whole search one zone further. This has to be
		 * done outside of the zone scan above so that begin/end are
		 * recomputed for the new position.
		 */
		if (!usable) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			/* Skip past a superblock log overlapping the region */
			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		if (!have_sb)
			break;
	}

	return pos;
}

/*
 * Derive the allocation offset of block group @cache from the extent tree.
 *
 * The last extent item located before the end of the block group is looked
 * up; its end, relative to the block group start, is stored in @offset_ret.
 * If btrfs_previous_extent_item() reports that there is no such item (return
 * value 1), @offset_ret is set to 0.
 *
 * Returns 0 on success, -ENOMEM if no path can be allocated, -EUCLEAN if an
 * item exists exactly at the block group end or the found extent does not lie
 * inside the block group, or another negative error from the tree search.
 */
static int calculate_alloc_pointer(struct btrfs_fs_info *fs_info,
				   struct btrfs_block_group *cache,
				   u64 *offset_ret)
{
	struct btrfs_root *root = btrfs_extent_root(fs_info, cache->start);
	const u64 bg_end = cache->start + cache->length;
	struct btrfs_key search_key;
	struct btrfs_key last_key;
	struct btrfs_path *path;
	u64 extent_len;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search_key.objectid = bg_end;
	search_key.type = 0;
	search_key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 0) {
		/* There should be no exact match (ie. an extent) at this address */
		ret = -EUCLEAN;
		goto out;
	}

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret == 1) {
		/* No extent item found: nothing is allocated yet */
		*offset_ret = 0;
		ret = 0;
		goto out;
	}
	if (ret)
		goto out;

	btrfs_item_key_to_cpu(path->nodes[0], &last_key, path->slots[0]);

	/* Keys other than EXTENT_ITEM are accounted with the node size */
	extent_len = (last_key.type == BTRFS_EXTENT_ITEM_KEY) ?
		     last_key.offset : fs_info->nodesize;

	if (last_key.objectid < cache->start ||
	    last_key.objectid + extent_len > cache->start + cache->length) {
		ret = -EUCLEAN;
		goto out;
	}

	*offset_ret = last_key.objectid + extent_len - cache->start;
	ret = 0;

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Tell whether the block group profile in @map_type can be used in zoned
 * mode.
 *
 * SINGLE is always supported. DUP, RAID1 (any RAID1_MASK bit), RAID0 and
 * RAID10 are supported unconditionally for metadata/system block groups; for
 * data block groups they additionally need a raid-stripe-tree (@rst). All
 * other profiles are not supported yet.
 */
bool zoned_profile_supported(u64 map_type, bool rst)
{
	const u64 known_profiles = BTRFS_BLOCK_GROUP_DUP |
				   BTRFS_BLOCK_GROUP_RAID1_MASK |
				   BTRFS_BLOCK_GROUP_RAID0 |
				   BTRFS_BLOCK_GROUP_RAID10;
	const u64 profile = (map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK);

	/* SINGLE */
	if (profile == 0)
		return true;

	/* All other profiles are not supported yet */
	if (!(profile & known_profiles))
		return false;

	/* Data block groups with a known profile need a raid-stripe-tree */
	if (map_type & BTRFS_BLOCK_GROUP_DATA)
		return rst;

	/* Metadata/system block groups can use the known profiles directly */
	return true;
}

/*
 * Initialize the zoned allocation state (alloc_offset and write_offset) of
 * block group @cache.
 *
 * Does nothing and returns 0 when the filesystem is not zoned. Otherwise the
 * chunk mapping of the block group is looked up and an allocation offset is
 * determined for every stripe from the zone it is located in. If at least one
 * stripe is in a conventional zone, the offset is also computed from the
 * extent tree with calculate_alloc_pointer().
 *
 * Returns 0 on success, -EIO for an unaligned block group or an extent found
 * past the write pointer, -ENOENT if the chunk mapping is missing, -ENOMEM on
 * allocation failure, -EINVAL for an unsupported profile, or the error
 * returned by calculate_alloc_pointer().
 */
int btrfs_load_block_group_zone_info(struct btrfs_fs_info *fs_info,
				     struct btrfs_block_group *cache)
{
	struct btrfs_device *device;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct cache_extent *ce;
	struct map_lookup *map;
	u64 logical = cache->start;
	u64 length = cache->length;
	u64 physical = 0;
	int ret = 0;
	int i;
	u64 *alloc_offsets = NULL;
	u64 last_alloc = 0;
	u32 num_conventional = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (logical == BTRFS_BLOCK_RESERVED_1M_FOR_SUPER) {
		/* The initial system block group is 1MiB shorter than a zone */
		if (length + SZ_1M != fs_info->zone_size) {
			error("zoned: unaligned initial system block group");
			return -EIO;
		}
	} else if (!IS_ALIGNED(length, fs_info->zone_size)) {
		error("zoned: unaligned block group at %llu + %llu", logical,
		      length);
		return -EIO;
	}

	/* Get the chunk mapping */
	ce = search_cache_extent(&map_tree->cache_tree, logical);
	if (!ce) {
		error("zoned: failed to find block group at %llu", logical);
		return -ENOENT;
	}
	map = container_of(ce, struct map_lookup, ce);

	alloc_offsets = calloc(map->num_stripes, sizeof(*alloc_offsets));
	if (!alloc_offsets) {
		error_msg(ERROR_MSG_MEMORY, "zone offsets");
		return -ENOMEM;
	}

	/* Determine the allocation offset of every stripe */
	for (i = 0; i < map->num_stripes; i++) {
		bool is_sequential;
		struct blk_zone zone;

		device = map->stripes[i].dev;
		physical = map->stripes[i].physical;

		/* A device without an open file descriptor is treated as missing */
		if (device->fd == -1) {
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		}

		is_sequential = btrfs_dev_is_sequential(device, physical);
		if (!is_sequential)
			num_conventional++;

		if (!is_sequential) {
			alloc_offsets[i] = WP_CONVENTIONAL;
			continue;
		}

		/*
		 * The group is mapped to a sequential zone. Get the zone write
		 * pointer to determine the allocation offset within the zone.
		 */
		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
		zone = device->zone_info->zones[physical / fs_info->zone_size];

		switch (zone.cond) {
		case BLK_ZONE_COND_OFFLINE:
		case BLK_ZONE_COND_READONLY:
			error(
		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
			      physical / fs_info->zone_size, device->name,
			      device->devid);
			alloc_offsets[i] = WP_MISSING_DEV;
			break;
		case BLK_ZONE_COND_EMPTY:
			alloc_offsets[i] = 0;
			break;
		case BLK_ZONE_COND_FULL:
			alloc_offsets[i] = fs_info->zone_size;
			break;
		default:
			/* Partially used zone */
			alloc_offsets[i] =
					((zone.wp - zone.start) << SECTOR_SHIFT);
			break;
		}
	}

	if (num_conventional > 0) {
		ret = calculate_alloc_pointer(fs_info, cache, &last_alloc);
		/*
		 * When every stripe is conventional the extent tree is the
		 * only source for the allocation offset, so we are done here.
		 */
		if (ret || map->num_stripes == num_conventional) {
			if (!ret)
				cache->alloc_offset = last_alloc;
			else
				error(
		"zoned: failed to determine allocation offset of block group %llu",
					  cache->start);
			goto out;
		}
	}

	if (!zoned_profile_supported(map->type, !!fs_info->stripe_root)) {
		error("zoned: profile %s not yet supported",
		      btrfs_group_profile_str(map->type));
		ret = -EINVAL;
		goto out;
	}
	/* Only the offset of the first stripe is used for the block group */
	cache->alloc_offset = alloc_offsets[0];

out:
	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
		error(
		"zoned: got wrong write pointer in block group %llu: %llu > %llu",
		      logical, last_alloc, cache->alloc_offset);
		ret = -EIO;
	}

	if (!ret)
		cache->write_offset = cache->alloc_offset;

	kfree(alloc_offsets);
	return ret;
}

/*
 * Keep tree block writes of a zoned block group in write-offset order.
 *
 * [@start, @end] is the range about to be written. If it starts beyond the
 * block group's current write offset, the tree block located at the write
 * offset is marked dirty and true is returned. Otherwise the write offset is
 * advanced by the size of the range and false is returned. Always returns
 * false on a non-zoned filesystem.
 */
bool btrfs_redirty_extent_buffer_for_zoned(struct btrfs_fs_info *fs_info,
					   u64 start, u64 end)
{
	struct btrfs_block_group *cache;
	struct extent_buffer *eb;
	u64 next;

	if (!btrfs_is_zoned(fs_info))
		return false;

	cache = btrfs_lookup_first_block_group(fs_info, start);
	BUG_ON(!cache);

	next = cache->start + cache->write_offset;
	if (next >= start) {
		/* No gap in front of the range: account it as written */
		cache->write_offset += (end + 1 - start);
		return false;
	}

	/* There is a gap: dirty the tree block at the write offset first */
	BUG_ON(next + fs_info->nodesize > start);
	eb = btrfs_find_create_tree_block(fs_info, next);
	btrfs_mark_buffer_dirty(eb);
	free_extent_buffer(eb);

	return true;
}

/*
 * Reset the zone containing byte @offset on the device with id @devid.
 *
 * Devices with a different id, and offsets for which zone_is_sequential()
 * reports false, are skipped. @length is currently not used: only the single
 * zone containing @offset is reset.
 *
 * Returns 0 on success and -EIO if the zone reset fails.
 */
int btrfs_reset_chunk_zones(struct btrfs_fs_info *fs_info, u64 devid,
			    u64 offset, u64 length)
{
	struct btrfs_device *device;

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		struct btrfs_zoned_device_info *zinfo;
		u64 zone_idx;

		if (device->devid != devid)
			continue;

		zinfo = device->zone_info;
		if (!zone_is_sequential(zinfo, offset))
			continue;

		zone_idx = offset / zinfo->zone_size;
		if (btrfs_reset_dev_zone(device->fd, &zinfo->zones[zone_idx])) {
			error("zoned: failed to reset zone %llu: %m",
			      offset / zinfo->zone_size);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Reset the first zone of every device in @fs_devices that has zone
 * information attached. Devices without zone information are skipped.
 *
 * Stops at the first failure and returns its error code, 0 otherwise.
 */
int btrfs_wipe_temporary_sb(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
		int ret;

		if (!dev->zone_info)
			continue;

		ret = btrfs_reset_dev_zone(dev->fd, &dev->zone_info->zones[0]);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Check whether the superblock log zones for the superblock at @bytenr exist
 * on @device, i.e. whether the zone following the first log zone is still
 * within the zones of the device.
 */
bool btrfs_sb_zone_exists(struct btrfs_device *device, u64 bytenr)
{
	const int zone_shift = ilog2(device->zone_info->zone_size);
	const u32 zone_num = sb_bytenr_to_sb_zone(bytenr, zone_shift);

	return zone_num + 1 <= device->zone_info->nr_zones - 1;
}

#endif

/*
 * Load the zone information of every opened device of the filesystem.
 *
 * Nothing is done unless the ZONED incompat flag is set. Devices without an
 * open file descriptor are skipped.
 *
 * Returns 0 on success or the first error from btrfs_get_dev_zone_info().
 */
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *device;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		int ret;

		/* We can skip reading of zone info for missing devices */
		if (device->fd == -1)
			continue;

		ret = btrfs_get_dev_zone_info(device);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Load the zone information of @device unless it is already present.
 *
 * Returns 0 without doing anything when the ZONED incompat flag is not set
 * or device->zone_info is already populated; otherwise the result of
 * btrfs_get_zone_info().
 */
static int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * yet be set.
	 */
	if (!btrfs_fs_incompat(device->fs_info, ZONED) || device->zone_info)
		return 0;

	return btrfs_get_zone_info(device->fd, device->name, &device->zone_info);
}

/*
 * Gather the zone information of the device @file opened as @fd.
 *
 * @zinfo_ret is always reset to NULL first.
 *
 * With BTRFS_ZONED defined, a new btrfs_zoned_device_info is allocated,
 * filled by report_zones() and handed to the caller through @zinfo_ret. The
 * error of report_zones() is returned on failure, and a failed allocation
 * terminates the program.
 *
 * Without BTRFS_ZONED, @zinfo_ret stays NULL: error messages are printed and
 * -EOPNOTSUPP is returned for a host-managed device, 0 for any other model.
 */
int btrfs_get_zone_info(int fd, const char *file,
			struct btrfs_zoned_device_info **zinfo_ret)
{
#ifdef BTRFS_ZONED
	struct btrfs_zoned_device_info *zinfo;
	int ret;
#endif
	enum btrfs_zoned_model model;

	*zinfo_ret = NULL;

	/* Check zone model */
	model = zoned_model(file);

#ifdef BTRFS_ZONED
	zinfo = calloc(1, sizeof(*zinfo));
	if (!zinfo) {
		error_msg(ERROR_MSG_MEMORY, "zone information");
		exit(1);
	}

	/* report_zones() relies on the model to pick the real or emulated report */
	zinfo->model = model;

	/* Get zone information */
	ret = report_zones(fd, file, zinfo);
	if (ret != 0) {
		kfree(zinfo);
		return ret;
	}
	*zinfo_ret = zinfo;
#else
	/* Built without zoned support: no zone information can be provided */
	error("zoned: %s: unsupported host-%s zoned block device", file,
	      model == ZONED_HOST_MANAGED ? "managed" : "aware");
	if (model == ZONED_HOST_MANAGED)
		return -EOPNOTSUPP;

	error("zoned: %s: handling host-aware block device as a regular disk",
	      file);
#endif

	return 0;
}

/*
 * Validate the device set of @fs_info against the ZONED incompat flag and, if
 * the filesystem is zoned, record the common zone size and switch the chunk
 * allocation policy to BTRFS_CHUNK_ALLOC_ZONED.
 *
 * Devices without an open file descriptor are ignored. Returns 0 when the
 * filesystem is consistently zoned or consistently non-zoned, and -EINVAL
 * when:
 *  - the devices used as zoned do not all have the same zone size,
 *  - the ZONED flag is set but no device is used as zoned,
 *  - a device must be used as zoned but the ZONED flag is not set,
 *  - zoned and regular devices are mixed,
 *  - the zone size is not aligned to BTRFS_STRIPE_LEN,
 *  - the MIXED_GROUPS incompat flag is set.
 */
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
	struct btrfs_device *device;
	u64 zoned_devices = 0;
	u64 nr_devices = 0;
	u64 common_size = 0;

	/* Count zoned devices */
	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		enum btrfs_zoned_model model;
		bool use_as_zoned;

		if (device->fd == -1)
			continue;

		model = zoned_model(device->name);
		/*
		 * A Host-Managed zoned device must be used as a zoned device.
		 * A Host-Aware zoned device and a non-zoned devices can be
		 * treated as a zoned device, if ZONED flag is enabled in the
		 * superblock.
		 */
		use_as_zoned = model == ZONED_HOST_MANAGED ||
			       (incompat_zoned && (model == ZONED_HOST_AWARE ||
						   model == ZONED_NONE));
		nr_devices++;
		if (!use_as_zoned)
			continue;

		zoned_devices++;
		if (!common_size) {
			common_size = device->zone_info->zone_size;
		} else if (device->zone_info->zone_size != common_size) {
			error(
		"zoned: unequal block device zone sizes: have %llu found %llu",
			      device->zone_info->zone_size,
			      common_size);
			return -EINVAL;
		}
	}

	if (!zoned_devices) {
		/* A regular filesystem on regular devices needs no setup */
		if (!incompat_zoned)
			return 0;

		/* No zoned block device found on ZONED filesystem */
		error("zoned: no zoned devices found on a zoned filesystem");
		return -EINVAL;
	}

	if (!incompat_zoned) {
		error("zoned: mode not enabled but zoned device found");
		return -EINVAL;
	}

	if (zoned_devices != nr_devices) {
		error("zoned: cannot mix zoned and regular devices");
		return -EINVAL;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(common_size, BTRFS_STRIPE_LEN)) {
		error("zoned: zone size %llu not aligned to stripe %u",
		      common_size, BTRFS_STRIPE_LEN);
		return -EINVAL;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		error("zoned: mixed block groups not supported");
		return -EINVAL;
	}

	fs_info->zone_size = common_size;
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;

	return 0;
}