btrfs-progs/scrub.c

/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
/*
* Main part to implement offline (unmounted) btrfs scrub
*/
#include <unistd.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "utils.h"
#include "kernel-lib/bitops.h"
#include "task-utils.h"
#include "kernel-lib/raid56.h"
/*
* For parity based profiles (RAID56).
* Mirror/stripe based profiles won't need this, as they are iterated by
* bytenr and mirror number.
*/
struct scrub_stripe {
/* For P/Q logical start will be BTRFS_RAID5/6_P/Q_STRIPE */
u64 logical;
u64 physical;
/* Device is missing */
unsigned int dev_missing:1;
/* Any tree/data csum mismatches */
unsigned int csum_mismatch:1;
/* Some data doesn't have csum (nodatasum) */
unsigned int csum_missing:1;
/* Device fd, to write correct data back to disk */
int fd;
char *data;
};
/*
* RAID56 full stripe (data stripes + P/Q)
*/
struct scrub_full_stripe {
u64 logical_start;
u64 logical_len;
u64 bg_type;
u32 nr_stripes;
u32 stripe_len;
/* Read error stripes */
u32 err_read_stripes;
/* Missing devices */
u32 err_missing_devs;
/* Csum error data stripes */
u32 err_csum_dstripes;
/* Missing csum data stripes */
u32 missing_csum_dstripes;
/* Corrupted stripe index */
int corrupted_index[2];
int nr_corrupted_stripes;
/* Already recovered once? */
unsigned int recovered:1;
struct scrub_stripe stripes[];
};
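/* Free a full stripe and the data buffers of all its stripes */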
static void free_full_stripe(struct scrub_full_stripe *fstripe)
{
int i;
for (i = 0; i < fstripe->nr_stripes; i++)
free(fstripe->stripes[i].data);
free(fstripe);
}
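/*
* Allocate a full stripe structure for @nr_stripes stripes, with a
* @stripe_len sized data buffer for each stripe.
* Returns NULL on allocation failure.
*/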
static struct scrub_full_stripe *alloc_full_stripe(int nr_stripes,
u32 stripe_len)
{
struct scrub_full_stripe *ret;
int size = sizeof(*ret) + sizeof(unsigned long *) +
nr_stripes * sizeof(struct scrub_stripe);
int i;
ret = malloc(size);
if (!ret)
return NULL;
memset(ret, 0, size);
ret->nr_stripes = nr_stripes;
ret->stripe_len = stripe_len;
ret->corrupted_index[0] = -1;
ret->corrupted_index[1] = -1;
/* Alloc data memory for each stripe */
for (i = 0; i < nr_stripes; i++) {
struct scrub_stripe *stripe = &ret->stripes[i];
stripe->data = malloc(stripe_len);
if (!stripe->data) {
free_full_stripe(ret);
return NULL;
}
}
return ret;
}
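/* Return 1 if @stripe is a data stripe, 0 if it is a P/Q stripe */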
static inline int is_data_stripe(struct scrub_stripe *stripe)
{
u64 bytenr = stripe->logical;
if (bytenr == BTRFS_RAID5_P_STRIPE || bytenr == BTRFS_RAID6_Q_STRIPE)
return 0;
return 1;
}
/*
* Check one tree mirror given by @bytenr and @mirror, or @data.
* If @data is not given (NULL), the function will try to read out tree block
* using @bytenr and @mirror.
* If @data is given, use it directly and don't read from disk.
*
* The extra @data parameter is handy for RAID5/6 recovery code to verify
* the recovered data.
*
* Return 0 if everything is OK.
* Return <0 if something goes wrong, and @scrub_ctx accounting will be updated
* if it's a data corruption.
*/
static int check_tree_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
char *data, u64 bytenr, int mirror)
{
struct extent_buffer *eb;
u32 nodesize = fs_info->nodesize;
int ret;
if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
/* Such error will be reported by check_tree_block() */
scrub_ctx->verify_errors++;
return -EIO;
}
eb = btrfs_find_create_tree_block(fs_info, bytenr);
if (!eb)
return -ENOMEM;
if (data) {
memcpy(eb->data, data, nodesize);
} else {
ret = read_whole_eb(fs_info, eb, mirror);
if (ret) {
scrub_ctx->read_errors++;
error("failed to read tree block %llu mirror %d",
bytenr, mirror);
goto out;
}
}
scrub_ctx->tree_bytes_scrubbed += nodesize;
if (csum_tree_block(fs_info, eb, 1)) {
error("tree block %llu mirror %d checksum mismatch", bytenr,
mirror);
scrub_ctx->csum_errors++;
ret = -EIO;
goto out;
}
ret = check_tree_block(fs_info, eb);
if (ret < 0) {
error("tree block %llu mirror %d is invalid", bytenr, mirror);
scrub_ctx->verify_errors++;
goto out;
}
scrub_ctx->tree_extents_scrubbed++;
out:
free_extent_buffer(eb);
return ret;
}
/*
* read_extent_data() helper
*
* This function will handle short read and update @scrub_ctx when read
* error happens.
*/
static int read_extent_data_loop(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
char *buf, u64 start, u64 len, int mirror)
{
int ret = 0;
u64 cur = 0;
while (cur < len) {
u64 read_len = len - cur;
ret = read_extent_data(fs_info, buf + cur,
start + cur, &read_len, mirror);
if (ret < 0) {
error("failed to read out data at bytenr %llu mirror %d",
start + cur, mirror);
scrub_ctx->read_errors++;
break;
}
cur += read_len;
}
return ret;
}
/*
* Recover all other (corrupted) mirrors for tree block.
*
* The method is quite simple, just read out the correct mirror specified by
* @good_mirror and write the correct data back to all other mirrors.
*/
static int recover_tree_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
u64 start, int good_mirror)
{
char *buf;
u32 nodesize = fs_info->nodesize;
int i;
int num_copies;
int ret;
buf = malloc(nodesize);
if (!buf)
return -ENOMEM;
ret = read_extent_data_loop(fs_info, scrub_ctx, buf, start, nodesize,
good_mirror);
if (ret < 0) {
error("failed to read tree block at bytenr %llu mirror %d",
start, good_mirror);
goto out;
}
num_copies = btrfs_num_copies(fs_info, start, nodesize);
for (i = 1; i <= num_copies; i++) {
if (i == good_mirror)
continue;
ret = write_data_to_disk(fs_info, buf, start, nodesize, i);
if (ret < 0) {
error("failed to write tree block at bytenr %llu mirror %d",
start, i);
goto out;
}
}
ret = 0;
out:
free(buf);
return ret;
}
/*
* Check one data mirror given by @start @len and @mirror, or @data
* If @data is not given, try to read it from disk.
* This function will try to read out all the data and then verify checksums.
*
* If @data is given, just use the data.
* This behavior is useful for RAID5/6 recovery code to verify recovered data.
*
* If @corrupt_bitmap is given, record corrupted sectors in that bitmap.
* This is useful for mirror based profiles to recover their data.
*
* Return 0 if everything is OK.
* Return <0 if something goes wrong, and @scrub_ctx accounting will be updated
* if it's a data corruption.
*/
static int check_data_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
char *data, u64 start, u64 len, int mirror,
unsigned long *corrupt_bitmap)
{
u32 sectorsize = fs_info->sectorsize;
u32 data_csum;
u32 *csums = NULL;
char *buf = NULL;
int ret = 0;
int err = 0;
int i;
unsigned long *csum_bitmap = NULL;
if (!data) {
buf = malloc(len);
if (!buf)
return -ENOMEM;
ret = read_extent_data_loop(fs_info, scrub_ctx, buf, start,
len, mirror);
if (ret < 0)
goto out;
scrub_ctx->data_bytes_scrubbed += len;
} else {
buf = data;
}
/* Alloc and check csums */
csums = malloc(len / sectorsize * sizeof(data_csum));
if (!csums) {
ret = -ENOMEM;
goto out;
}
csum_bitmap = malloc(calculate_bitmap_len(len / sectorsize));
if (!csum_bitmap) {
ret = -ENOMEM;
goto out;
}
if (corrupt_bitmap)
memset(corrupt_bitmap, 0,
calculate_bitmap_len(len / sectorsize));
ret = btrfs_read_data_csums(fs_info, start, len, csums, csum_bitmap);
if (ret < 0)
goto out;
for (i = 0; i < len / sectorsize; i++) {
if (!test_bit(i, csum_bitmap)) {
scrub_ctx->csum_discards++;
continue;
}
data_csum = ~(u32)0;
data_csum = btrfs_csum_data(buf + i * sectorsize, data_csum,
sectorsize);
btrfs_csum_final(data_csum, (u8 *)&data_csum);
if (memcmp(&data_csum, (char *)csums + i * sizeof(data_csum),
sizeof(data_csum))) {
error("data at bytenr %llu mirror %d csum mismatch, have 0x%08x expect 0x%08x",
start + i * sectorsize, mirror, data_csum,
*(u32 *)((char *)csums + i * sizeof(data_csum)));
err = 1;
scrub_ctx->csum_errors++;
if (corrupt_bitmap)
set_bit(i, corrupt_bitmap);
continue;
}
scrub_ctx->data_bytes_scrubbed += sectorsize;
}
out:
if (!data)
free(buf);
free(csums);
free(csum_bitmap);
if (!ret && err)
return -EIO;
return ret;
}
/* Helper to check all mirrors for a good copy */
static int has_good_mirror(unsigned long *corrupt_bitmaps[], int num_copies,
int bit, int *good_mirror)
{
int found_good = 0;
int i;
for (i = 0; i < num_copies; i++) {
if (!test_bit(bit, corrupt_bitmaps[i])) {
found_good = 1;
if (good_mirror)
*good_mirror = i + 1;
break;
}
}
return found_good;
}
/*
* Helper function to check @corrupt_bitmaps, to verify if it's recoverable
* for mirror based data extent.
*
* Return 1 for recoverable, and 0 for not recoverable
*/
static int check_data_mirror_recoverable(struct btrfs_fs_info *fs_info,
u64 start, u64 len, u32 sectorsize,
unsigned long *corrupt_bitmaps[])
{
int i;
int corrupted = 0;
int bit;
int num_copies = btrfs_num_copies(fs_info, start, len);
for (i = 0; i < num_copies; i++) {
for_each_set_bit(bit, corrupt_bitmaps[i], len / sectorsize) {
if (!has_good_mirror(corrupt_bitmaps, num_copies,
bit, NULL)) {
corrupted = 1;
goto out;
}
}
}
out:
return !corrupted;
}
/*
* Try to recover all corrupted sectors specified by @corrupt_bitmaps,
* by reading out the good sectors from other mirrors.
*/
static int recover_data_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
u64 start, u64 len,
unsigned long *corrupt_bitmaps[])
{
char *buf;
u32 sectorsize = fs_info->sectorsize;
int ret = 0;
int bit;
int i;
int bad_mirror;
int num_copies;
/* Don't bother to recover unrecoverable extents */
if (!check_data_mirror_recoverable(fs_info, start, len,
sectorsize, corrupt_bitmaps))
return -EIO;
buf = malloc(sectorsize);
if (!buf)
return -ENOMEM;
num_copies = btrfs_num_copies(fs_info, start, len);
for (i = 0; i < num_copies; i++) {
for_each_set_bit(bit, corrupt_bitmaps[i], len / sectorsize) {
u64 cur = start + bit * sectorsize;
int good;
/* Find good mirror */
ret = has_good_mirror(corrupt_bitmaps, num_copies, bit,
&good);
if (!ret) {
error("failed to find good mirror for bytenr %llu",
cur);
ret = -EIO;
goto out;
}
/* Read out good mirror */
ret = read_data_from_disk(fs_info, buf, cur,
sectorsize, good);
if (ret < 0) {
error("failed to read good mirror from bytenr %llu mirror %d",
cur, good);
goto out;
}
/* Write back to all other mirrors */
for (bad_mirror = 1; bad_mirror <= num_copies;
bad_mirror++) {
if (bad_mirror == good)
continue;
ret = write_data_to_disk(fs_info, buf, cur,
sectorsize, bad_mirror);
if (ret < 0) {
error("failed to recover mirror for bytenr %llu mirror %d",
cur, bad_mirror);
goto out;
}
}
}
}
out:
free(buf);
return ret;
}
/* Btrfs only supports up to 2 copies of data for now */
#define BTRFS_MAX_COPIES 2
/*
* Check all copies of range @start, @len.
* Caller must ensure the range is covered by EXTENT_ITEM/METADATA_ITEM
* specified by leaf of @path.
* And @start, @len must be a subset of the EXTENT_ITEM/METADATA_ITEM.
*
* Return 0 if the range is all OK or recovered or recoverable.
* Return <0 if the range can't be recovered.
*/
static int scrub_one_extent(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
struct btrfs_path *path, u64 start, u64 len,
int write)
{
struct btrfs_key key;
struct btrfs_extent_item *ei;
struct extent_buffer *leaf = path->nodes[0];
u32 sectorsize = fs_info->sectorsize;
unsigned long *corrupt_bitmaps[BTRFS_MAX_COPIES] = { NULL };
int slot = path->slots[0];
int num_copies;
int meta_corrupted = 0;
int meta_good_mirror = 0;
int data_bad_mirror = 0;
u64 extent_start;
u64 extent_len;
int metadata = 0;
int i;
int ret = 0;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.type != BTRFS_METADATA_ITEM_KEY &&
key.type != BTRFS_EXTENT_ITEM_KEY)
goto invalid_arg;
extent_start = key.objectid;
if (key.type == BTRFS_METADATA_ITEM_KEY) {
extent_len = fs_info->nodesize;
metadata = 1;
} else {
extent_len = key.offset;
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
if (btrfs_extent_flags(leaf, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK)
metadata = 1;
}
if (start >= extent_start + extent_len ||
start + len <= extent_start)
goto invalid_arg;
for (i = 0; i < BTRFS_MAX_COPIES; i++) {
corrupt_bitmaps[i] = malloc(
calculate_bitmap_len(len / sectorsize));
if (!corrupt_bitmaps[i]) {
ret = -ENOMEM;
goto out;
}
}
num_copies = btrfs_num_copies(fs_info, start, len);
for (i = 1; i <= num_copies; i++) {
if (metadata) {
ret = check_tree_mirror(fs_info, scrub_ctx,
NULL, extent_start, i);
scrub_ctx->tree_extents_scrubbed++;
if (ret < 0)
meta_corrupted++;
else
meta_good_mirror = i;
} else {
ret = check_data_mirror(fs_info, scrub_ctx, NULL, start,
len, i, corrupt_bitmaps[i - 1]);
scrub_ctx->data_extents_scrubbed++;
}
}
/* Metadata recover and report */
if (metadata) {
if (!meta_corrupted) {
goto out;
} else if (meta_corrupted && meta_corrupted < num_copies) {
if (write) {
ret = recover_tree_mirror(fs_info, scrub_ctx,
start, meta_good_mirror);
if (ret < 0) {
error("failed to recover tree block at bytenr %llu",
start);
goto out;
}
printf("extent %llu len %llu REPAIRED: has corrupted mirror, repaired\n",
start, len);
goto out;
}
printf("extent %llu len %llu RECOVERABLE: has corrupted mirror, but is recoverable\n",
start, len);
goto out;
} else {
error("extent %llu len %llu CORRUPTED: all mirror(s) corrupted, can't be recovered",
start, len);
ret = -EIO;
goto out;
}
}
/* Data recover and report */
for (i = 0; i < num_copies; i++) {
if (find_first_bit(corrupt_bitmaps[i], len / sectorsize) >=
len / sectorsize)
continue;
data_bad_mirror = i + 1;
}
/* All data sectors are good */
if (!data_bad_mirror) {
ret = 0;
goto out;
}
if (check_data_mirror_recoverable(fs_info, start, len,
sectorsize, corrupt_bitmaps)) {
if (write) {
ret = recover_data_mirror(fs_info, scrub_ctx, start,
len, corrupt_bitmaps);
if (ret < 0) {
error("failed to recover data extent at bytenr %llu len %llu",
start, len);
goto out;
}
printf("extent %llu len %llu REPARIED: has corrupted mirror, repaired\n",
start, len);
goto out;
}
printf("extent %llu len %llu RECOVERABLE: has corrupted mirror, recoverable\n",
start, len);
goto out;
}
error("extent %llu len %llu CORRUPTED, all mirror(s) corrupted, can't be repaired",
start, len);
ret = -EIO;
out:
for (i = 0; i < BTRFS_MAX_COPIES; i++)
kfree(corrupt_bitmaps[i]);
return ret;
invalid_arg:
error("invalid parameter for %s", __func__);
return -EINVAL;
}
/*
* Scrub one full data stripe of RAID5/6.
* This means it will check any data/metadata extent in the data stripe
* specified by @stripe and @stripe_len.
*
* This function only *CHECKS* whether the data stripe has any corruption;
* it won't repair anything.
*
* Return 0 if the full stripe is OK.
* Return <0 if any error is found.
* Note: Missing csum is not counted as an error (NODATASUM is valid)
*/
static int scrub_one_data_stripe(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
struct scrub_stripe *stripe, u32 stripe_len)
{
struct btrfs_path *path;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_key key;
u64 extent_start;
u64 extent_len;
u64 orig_csum_discards;
int ret;
if (!is_data_stripe(stripe))
return -EINVAL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = stripe->logical + stripe_len;
key.offset = 0;
key.type = 0;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (1) {
struct btrfs_extent_item *ei;
struct extent_buffer *eb;
char *data;
int slot;
int metadata = 0;
u64 check_start;
u64 check_len;
ret = btrfs_previous_extent_item(extent_root, path, 0);
if (ret > 0) {
ret = 0;
goto out;
}
if (ret < 0)
goto out;
eb = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &key, slot);
extent_start = key.objectid;
ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
/* tree block scrub */
if (key.type == BTRFS_METADATA_ITEM_KEY ||
btrfs_extent_flags(eb, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
extent_len = extent_root->fs_info->nodesize;
metadata = 1;
} else {
extent_len = key.offset;
metadata = 0;
}
/* Current extent is out of our range, loop comes to end */
if (extent_start + extent_len <= stripe->logical)
break;
if (metadata) {
/*
* Check crossing stripe first, which can't be scrubbed
*/
if (check_crossing_stripes(fs_info, extent_start,
extent_root->fs_info->nodesize)) {
error("tree block at %llu is crossing stripe boundary, unable to scrub",
extent_start);
ret = -EIO;
goto out;
}
data = stripe->data + extent_start - stripe->logical;
ret = check_tree_mirror(fs_info, scrub_ctx,
data, extent_start, 0);
/* Any csum/verify error means the stripe is screwed */
if (ret < 0) {
stripe->csum_mismatch = 1;
ret = -EIO;
goto out;
}
ret = 0;
continue;
}
/* Restrict the extent range to fit stripe range */
check_start = max(extent_start, stripe->logical);
check_len = min(extent_start + extent_len, stripe->logical +
stripe_len) - check_start;
/* Record original csum_discards to detect missing csum case */
orig_csum_discards = scrub_ctx->csum_discards;
data = stripe->data + check_start - stripe->logical;
ret = check_data_mirror(fs_info, scrub_ctx, data, check_start,
check_len, 0, NULL);
/* Csum mismatch, no need to continue anyway */
if (ret < 0) {
stripe->csum_mismatch = 1;
goto out;
}
/* Check if there is any missing csum for data */
if (scrub_ctx->csum_discards != orig_csum_discards)
stripe->csum_missing = 1;
/*
* Only increase data_extents_scrubbed if we are scrubbing the
* tail part of the data extent
*/
if (extent_start + extent_len <= stripe->logical + stripe_len)
scrub_ctx->data_extents_scrubbed++;
ret = 0;
}
out:
btrfs_free_path(path);
return ret;
}
/*
* Verify parities for RAID56
* Caller must fill @fstripe before calling this function
*
* Return 0 if the parities match.
* Return >0 for P or Q mismatch
* Return <0 for fatal error
*/
static int verify_parities(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
struct scrub_full_stripe *fstripe)
{
void **ptrs;
void *ondisk_p = NULL;
void *ondisk_q = NULL;
void *buf_p;
void *buf_q;
int nr_stripes = fstripe->nr_stripes;
int stripe_len = BTRFS_STRIPE_LEN;
int i;
int ret = 0;
ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
buf_p = malloc(fstripe->stripe_len);
buf_q = malloc(fstripe->stripe_len);
if (!ptrs || !buf_p || !buf_q) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < fstripe->nr_stripes; i++) {
struct scrub_stripe *stripe = &fstripe->stripes[i];
if (stripe->logical == BTRFS_RAID5_P_STRIPE) {
ondisk_p = stripe->data;
ptrs[i] = buf_p;
continue;
} else if (stripe->logical == BTRFS_RAID6_Q_STRIPE) {
ondisk_q = stripe->data;
ptrs[i] = buf_q;
continue;
} else {
ptrs[i] = stripe->data;
continue;
}
}
/* RAID6 */
if (ondisk_q) {
raid6_gen_syndrome(nr_stripes, stripe_len, ptrs);
if (memcmp(ondisk_q, ptrs[nr_stripes - 1], stripe_len) != 0 ||
memcmp(ondisk_p, ptrs[nr_stripes - 2], stripe_len))
ret = 1;
} else {
ret = raid5_gen_result(nr_stripes, stripe_len, nr_stripes - 1,
ptrs);
if (ret < 0)
goto out;
if (memcmp(ondisk_p, ptrs[nr_stripes - 1], stripe_len) != 0)
ret = 1;
}
out:
free(buf_p);
free(buf_q);
free(ptrs);
return ret;
}
/*
* Try to recover data stripe from P or Q stripe
*
* Return >0 if it can't be recovered any more.
* Return 0 for successful repair or no need to repair at all
* Return <0 for fatal error
*/
static int recover_from_parities(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
struct scrub_full_stripe *fstripe)
{
void **ptrs;
int nr_stripes = fstripe->nr_stripes;
int stripe_len = BTRFS_STRIPE_LEN;
int max_tolerance;
int i;
int ret;
/* No need to recover */
if (!fstripe->nr_corrupted_stripes)
return 0;
/* Already recovered once, no more chance */
if (fstripe->recovered)
return 1;
if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID5)
max_tolerance = 1;
else
max_tolerance = 2;
/* Out of repair */
if (fstripe->nr_corrupted_stripes > max_tolerance)
return 1;
ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
if (!ptrs)
return -ENOMEM;
/* Construct ptrs */
for (i = 0; i < nr_stripes; i++)
ptrs[i] = fstripe->stripes[i].data;
ret = raid56_recov(nr_stripes, stripe_len, fstripe->bg_type,
fstripe->corrupted_index[0],
fstripe->corrupted_index[1], ptrs);
fstripe->recovered = 1;
free(ptrs);
return ret;
}
/*
* Helper to write a full stripe to disk
* P/Q will be re-calculated.
*/
static int write_full_stripe(struct scrub_full_stripe *fstripe)
{
void **ptrs;
int nr_stripes = fstripe->nr_stripes;
int stripe_len = BTRFS_STRIPE_LEN;
int i;
int ret = 0;
ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
if (!ptrs)
return -ENOMEM;
for (i = 0; i < fstripe->nr_stripes; i++)
ptrs[i] = fstripe->stripes[i].data;
if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID6) {
raid6_gen_syndrome(nr_stripes, stripe_len, ptrs);
} else {
ret = raid5_gen_result(nr_stripes, stripe_len, nr_stripes - 1,
ptrs);
if (ret < 0)
goto out;
}
for (i = 0; i < fstripe->nr_stripes; i++) {
struct scrub_stripe *stripe = &fstripe->stripes[i];
ret = pwrite(stripe->fd, stripe->data, fstripe->stripe_len,
stripe->physical);
if (ret != fstripe->stripe_len) {
ret = -EIO;
goto out;
}
}
out:
free(ptrs);
return ret;
}
/*
* Return 0 if we still have a chance to recover
* Return <0 if we have no more chance
*/
static int report_recoverablity(struct scrub_full_stripe *fstripe)
{
int max_tolerance;
u64 start = fstripe->logical_start;
if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID5)
max_tolerance = 1;
else
max_tolerance = 2;
if (fstripe->nr_corrupted_stripes > max_tolerance) {
error(
"full stripe %llu CORRUPTED: too many read errors or corrupted devices",
start);
error(
"full stripe %llu: tolerance: %d, missing: %d, read error: %d, csum error: %d",
start, max_tolerance, fstripe->err_missing_devs,
fstripe->err_read_stripes, fstripe->err_csum_dstripes);
return -EIO;
}
return 0;
}
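/* Reset the corrupted stripe record so later checks can reuse it */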
static void clear_corrupted_stripe_record(struct scrub_full_stripe *fstripe)
{
fstripe->corrupted_index[0] = -1;
fstripe->corrupted_index[1] = -1;
fstripe->nr_corrupted_stripes = 0;
}
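/*
* Record a corrupted stripe by its @index.
* Only the first two indexes are kept (enough for RAID6 recovery), but
* nr_corrupted_stripes is always increased so recoverability checks can
* still detect when too many stripes are corrupted.
*/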
static void record_corrupted_stripe(struct scrub_full_stripe *fstripe,
int index)
{
int i = 0;
for (i = 0; i < 2; i++) {
if (fstripe->corrupted_index[i] == -1) {
fstripe->corrupted_index[i] = index;
break;
}
}
fstripe->nr_corrupted_stripes++;
}
/*
* Scrub one full stripe.
*
* If everything matches, that's good.
* If a data stripe is corrupted beyond repair, report it.
* If a data stripe is corrupted, try recovery first and recheck csums, to
* determine whether it's recoverable or beyond repair.
*/
static int scrub_one_full_stripe(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
u64 start, u64 *next_ret, int write)
{
struct scrub_full_stripe *fstripe;
struct btrfs_map_block *map_block = NULL;
u32 stripe_len = BTRFS_STRIPE_LEN;
u64 bg_type;
u64 len;
int i;
int ret;
if (!next_ret) {
error("invalid argument for %s", __func__);
return -EINVAL;
}
ret = __btrfs_map_block_v2(fs_info, WRITE, start, stripe_len,
&map_block);
if (ret < 0) {
/* Let the caller skip the whole block group */
*next_ret = (u64)-1;
return ret;
}
start = map_block->start;
len = map_block->length;
*next_ret = start + len;
/*
* Step 0: Check if we need to scrub the full stripe
*
* If no extent lies in the full stripe, there is no need to check it
*/
ret = btrfs_check_extent_exists(fs_info, start, len);
if (ret < 0) {
free(map_block);
return ret;
}
/* No extents in range, no need to check */
if (ret == 0) {
free(map_block);
return 0;
}
bg_type = map_block->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
if (bg_type != BTRFS_BLOCK_GROUP_RAID5 &&
bg_type != BTRFS_BLOCK_GROUP_RAID6) {
free(map_block);
return -EINVAL;
}
fstripe = alloc_full_stripe(map_block->num_stripes,
map_block->stripe_len);
if (!fstripe) {
free(map_block);
return -ENOMEM;
}
fstripe->logical_start = map_block->start;
fstripe->nr_stripes = map_block->num_stripes;
fstripe->stripe_len = stripe_len;
fstripe->bg_type = bg_type;
/*
* Step 1: Read out the whole full stripe
*
* Then we have the chance to exit early if too many devices are
* missing.
*/
for (i = 0; i < map_block->num_stripes; i++) {
struct scrub_stripe *s_stripe = &fstripe->stripes[i];
struct btrfs_map_stripe *m_stripe = &map_block->stripes[i];
s_stripe->logical = m_stripe->logical;
s_stripe->fd = m_stripe->dev->fd;
s_stripe->physical = m_stripe->physical;
if (m_stripe->dev->fd == -1) {
s_stripe->dev_missing = 1;
record_corrupted_stripe(fstripe, i);
fstripe->err_missing_devs++;
continue;
}
ret = pread(m_stripe->dev->fd, s_stripe->data, stripe_len,
m_stripe->physical);
if (ret < (int)stripe_len) {
record_corrupted_stripe(fstripe, i);
fstripe->err_read_stripes++;
continue;
}
}
ret = report_recoverablity(fstripe);
if (ret < 0)
goto out;
ret = recover_from_parities(fs_info, scrub_ctx, fstripe);
if (ret < 0) {
error("full stripe %llu CORRUPTED: failed to recover: %s\n",
fstripe->logical_start, strerror(-ret));
goto out;
}
/*
* Clear the corrupted stripe record, since those stripes are recovered,
* and later checks need to reuse these members to record csum mismatch
* stripes
*/
clear_corrupted_stripe_record(fstripe);
/*
* Step 2: Check each data stripes against csum
*/
for (i = 0; i < map_block->num_stripes; i++) {
struct scrub_stripe *stripe = &fstripe->stripes[i];
if (!is_data_stripe(stripe))
continue;
ret = scrub_one_data_stripe(fs_info, scrub_ctx, stripe,
stripe_len);
if (ret < 0) {
fstripe->err_csum_dstripes++;
record_corrupted_stripe(fstripe, i);
}
}
ret = report_recoverablity(fstripe);
if (ret < 0)
goto out;
/*
* Recovered before, but no csum error
*/
if (fstripe->err_csum_dstripes == 0 && fstripe->recovered) {
error(
"full stripe %llu RECOVERABLE: P/Q is good for recovery",
start);
ret = 0;
goto out;
}
/*
* No csum error, not recovered before.
*
* Only need to check if P/Q matches.
*/
if (fstripe->err_csum_dstripes == 0 && !fstripe->recovered) {
ret = verify_parities(fs_info, scrub_ctx, fstripe);
if (ret < 0) {
error(
"full stripe %llu CORRUPTED: failed to check P/Q: %s",
start, strerror(-ret));
goto out;
}
if (ret > 0) {
if (write) {
ret = write_full_stripe(fstripe);
if (ret < 0)
error("failed to write full stripe %llu: %s",
start, strerror(-ret));
else
printf("full stripe %llu REPARIED: only P/Q mismatches, repaired\n",
start);
goto out;
} else {
printf("full stripe %llu RECOVERABLE: only P/Q is corrupted\n",
start);
ret = 0;
}
}
goto out;
}
/*
* Still csum error after recovery
*
* There is no means to fix it further; it's beyond repair.
*/
if (fstripe->err_csum_dstripes && fstripe->recovered) {
error(
"full stripe %llu CORRUPTED: csum still mismatch after recovery",
start);
ret = -EIO;
goto out;
}
/* Csum mismatch, but we still have a chance to recover. */
ret = recover_from_parities(fs_info, scrub_ctx, fstripe);
if (ret < 0) {
error(
"full stripe %llu CORRUPTED: failed to recover: %s\n",
fstripe->logical_start, strerror(-ret));
goto out;
}
/* After recovery, recheck data stripe csum */
for (i = 0; i < 2; i++) {
int index = fstripe->corrupted_index[i];
struct scrub_stripe *stripe;
if (index == -1)
continue;
stripe = &fstripe->stripes[index];
ret = scrub_one_data_stripe(fs_info, scrub_ctx, stripe,
stripe_len);
if (ret < 0) {
error(
"full stripe %llu CORRUPTED: csum still mismatch after recovery",
start);
goto out;
}
}
if (write) {
ret = write_full_stripe(fstripe);
if (ret < 0)
error("failed to write full stripe %llu: %s",
start, strerror(-ret));
else
printf("full stripe %llu REPARIED: corrupted data with good P/Q, repaired\n",
start);
goto out;
}
printf(
"full stripe %llu RECOVERABLE: Data stripes corrupted, but P/Q is good\n",
start);
out:
free_full_stripe(fstripe);
free(map_block);
return ret;
}
/*
* Scrub one block group.
*
* This function handles all profiles btrfs currently supports.
* Return 0 if the block group was scrubbed; any errors found are recorded
* in scrub_ctx.
* Return <0 for fatal errors preventing the block group from being scrubbed.
*/
static int scrub_one_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_scrub_progress *scrub_ctx,
struct btrfs_block_group_cache *bg_cache,
int write)
{
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_path *path;
struct btrfs_key key;
u64 bg_start = bg_cache->key.objectid;
u64 bg_len = bg_cache->key.offset;
int ret;
if (bg_cache->flags &
(BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
u64 cur = bg_start;
u64 next;
while (cur < bg_start + bg_len) {
ret = scrub_one_full_stripe(fs_info, scrub_ctx, cur,
&next, write);
/* Ignore any non-fatal error */
if (ret < 0 && ret != -EIO) {
error("fatal error happens checking one full stripe at bytenr: %llu: %s",
cur, strerror(-ret));
return ret;
}
cur = next;
}
/* Ignore any -EIO error, such errors will be reported in the final summary */
return 0;
}
/* Non-parity based profiles, check extent by extent */
key.objectid = bg_start;
key.type = 0;
key.offset = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (1) {
struct extent_buffer *eb = path->nodes[0];
int slot = path->slots[0];
u64 extent_start;
u64 extent_len;
btrfs_item_key_to_cpu(eb, &key, slot);
if (key.objectid >= bg_start + bg_len)
break;
if (key.type != BTRFS_EXTENT_ITEM_KEY &&
key.type != BTRFS_METADATA_ITEM_KEY)
goto next;
extent_start = key.objectid;
if (key.type == BTRFS_METADATA_ITEM_KEY)
extent_len = extent_root->fs_info->nodesize;
else
extent_len = key.offset;
ret = scrub_one_extent(fs_info, scrub_ctx, path, extent_start,
extent_len, write);
if (ret < 0 && ret != -EIO) {
error("fatal error checking extent bytenr %llu len %llu: %s",
extent_start, extent_len, strerror(-ret));
goto out;
}
ret = 0;
next:
ret = btrfs_next_extent_item(extent_root, path, bg_start +
bg_len);
if (ret < 0)
goto out;
if (ret > 0) {
ret = 0;
break;
}
}
out:
btrfs_free_path(path);
return ret;
}
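/*
* Entry point of the offline scrub.
*
* Scrub all block groups of the filesystem. If @write is set, try to repair
* any corruption found. If @task is given, use it for progress reporting.
*
* Return 0 if the scrub found no errors, 1 if any read/verify/csum error was
* found, and <0 if no block group can be found to scrub.
*/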
int btrfs_scrub(struct btrfs_fs_info *fs_info, struct task_context *task,
int write)
{
u64 bg_nr = 0;
struct btrfs_block_group_cache *bg_cache;
struct btrfs_scrub_progress scrub_ctx = {0};
int ret = 0;
ASSERT(fs_info);
bg_cache = btrfs_lookup_first_block_group(fs_info, 0);
if (!bg_cache) {
error("no block group is found");
return -ENOENT;
}
++bg_nr;
if (task) {
/* get block group numbers for progress */
while (1) {
u64 bg_offset = bg_cache->key.objectid +
bg_cache->key.offset;
bg_cache = btrfs_lookup_first_block_group(fs_info,
bg_offset);
if (!bg_cache)
break;
++bg_nr;
}
task->all = bg_nr;
task->cur = 1;
task_start(task->info);
bg_cache = btrfs_lookup_first_block_group(fs_info, 0);
}
while (1) {
ret = scrub_one_block_group(fs_info, &scrub_ctx, bg_cache,
write);
if (ret < 0 && ret != -EIO)
break;
if (task)
task->cur++;
bg_cache = btrfs_lookup_first_block_group(fs_info,
bg_cache->key.objectid + bg_cache->key.offset);
if (!bg_cache)
break;
}
if (task)
task_stop(task->info);
printf("Scrub result:\n");
printf("Tree bytes scrubbed: %llu\n", scrub_ctx.tree_bytes_scrubbed);
printf("Tree extents scrubbed: %llu\n", scrub_ctx.tree_extents_scrubbed);
printf("Data bytes scrubbed: %llu\n", scrub_ctx.data_bytes_scrubbed);
printf("Data extents scrubbed: %llu\n", scrub_ctx.data_extents_scrubbed);
printf("Data bytes without csum: %llu\n", scrub_ctx.csum_discards *
fs_info->sectorsize);
printf("Read error: %llu\n", scrub_ctx.read_errors);
printf("Verify error: %llu\n", scrub_ctx.verify_errors);
printf("Csum error: %llu\n", scrub_ctx.csum_errors);
if (scrub_ctx.csum_errors || scrub_ctx.read_errors ||
scrub_ctx.uncorrectable_errors || scrub_ctx.verify_errors)
ret = 1;
else
ret = 0;
return ret;
}