2016-12-26 06:29:29 +00:00
|
|
|
/*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public
|
|
|
|
* License v2 as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* General Public License for more details.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Main part to implement offline(unmounted) btrfs scrub
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <unistd.h>
|
|
|
|
#include "ctree.h"
|
|
|
|
#include "volumes.h"
|
|
|
|
#include "disk-io.h"
|
|
|
|
#include "utils.h"
|
2016-12-26 06:29:31 +00:00
|
|
|
#include "kernel-lib/bitops.h"
|
2016-12-26 06:29:34 +00:00
|
|
|
#include "kernel-lib/raid56.h"
|
2016-12-26 06:29:29 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For parity based profile (RAID56)
|
|
|
|
* Mirror/stripe based on won't need this. They are iterated by bytenr and
|
|
|
|
* mirror number.
|
|
|
|
*/
|
|
|
|
struct scrub_stripe {
|
|
|
|
/* For P/Q logical start will be BTRFS_RAID5/6_P/Q_STRIPE */
|
|
|
|
u64 logical;
|
|
|
|
|
|
|
|
u64 physical;
|
|
|
|
|
|
|
|
/* Device is missing */
|
|
|
|
unsigned int dev_missing:1;
|
|
|
|
|
|
|
|
/* Any tree/data csum mismatches */
|
|
|
|
unsigned int csum_mismatch:1;
|
|
|
|
|
|
|
|
/* Some data doesn't have csum (nodatasum) */
|
|
|
|
unsigned int csum_missing:1;
|
|
|
|
|
|
|
|
/* Device fd, to write correct data back to disc */
|
|
|
|
int fd;
|
|
|
|
|
|
|
|
char *data;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RAID56 full stripe (data stripes + P/Q)
|
|
|
|
*/
|
|
|
|
struct scrub_full_stripe {
|
|
|
|
u64 logical_start;
|
|
|
|
u64 logical_len;
|
|
|
|
u64 bg_type;
|
|
|
|
u32 nr_stripes;
|
|
|
|
u32 stripe_len;
|
|
|
|
|
|
|
|
/* Read error stripes */
|
|
|
|
u32 err_read_stripes;
|
|
|
|
|
|
|
|
/* Missing devices */
|
|
|
|
u32 err_missing_devs;
|
|
|
|
|
|
|
|
/* Csum error data stripes */
|
|
|
|
u32 err_csum_dstripes;
|
|
|
|
|
|
|
|
/* Missing csum data stripes */
|
|
|
|
u32 missing_csum_dstripes;
|
|
|
|
|
|
|
|
/* currupted stripe index */
|
|
|
|
int corrupted_index[2];
|
|
|
|
|
|
|
|
int nr_corrupted_stripes;
|
|
|
|
|
|
|
|
/* Already recovered once? */
|
|
|
|
unsigned int recovered:1;
|
|
|
|
|
|
|
|
struct scrub_stripe stripes[];
|
|
|
|
};
|
|
|
|
|
|
|
|
static void free_full_stripe(struct scrub_full_stripe *fstripe)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < fstripe->nr_stripes; i++)
|
|
|
|
free(fstripe->stripes[i].data);
|
|
|
|
free(fstripe);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct scrub_full_stripe *alloc_full_stripe(int nr_stripes,
|
|
|
|
u32 stripe_len)
|
|
|
|
{
|
|
|
|
struct scrub_full_stripe *ret;
|
|
|
|
int size = sizeof(*ret) + sizeof(unsigned long *) +
|
|
|
|
nr_stripes * sizeof(struct scrub_stripe);
|
|
|
|
int i;
|
|
|
|
|
|
|
|
ret = malloc(size);
|
|
|
|
if (!ret)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
memset(ret, 0, size);
|
|
|
|
ret->nr_stripes = nr_stripes;
|
|
|
|
ret->stripe_len = stripe_len;
|
|
|
|
ret->corrupted_index[0] = -1;
|
|
|
|
ret->corrupted_index[1] = -1;
|
|
|
|
|
|
|
|
/* Alloc data memory for each stripe */
|
|
|
|
for (i = 0; i < nr_stripes; i++) {
|
|
|
|
struct scrub_stripe *stripe = &ret->stripes[i];
|
|
|
|
|
|
|
|
stripe->data = malloc(stripe_len);
|
|
|
|
if (!stripe->data) {
|
|
|
|
free_full_stripe(ret);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
2016-12-26 06:29:30 +00:00
|
|
|
|
|
|
|
static inline int is_data_stripe(struct scrub_stripe *stripe)
|
|
|
|
{
|
|
|
|
u64 bytenr = stripe->logical;
|
|
|
|
|
|
|
|
if (bytenr == BTRFS_RAID5_P_STRIPE || bytenr == BTRFS_RAID6_Q_STRIPE)
|
|
|
|
return 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check one tree mirror given by @bytenr and @mirror, or @data.
|
|
|
|
* If @data is not given (NULL), the function will try to read out tree block
|
|
|
|
* using @bytenr and @mirror.
|
|
|
|
* If @data is given, use data directly, won't try to read from disk.
|
|
|
|
*
|
|
|
|
* The extra @data prameter is handy for RAID5/6 recovery code to verify
|
|
|
|
* the recovered data.
|
|
|
|
*
|
|
|
|
* Return 0 if everything is OK.
|
|
|
|
* Return <0 something goes wrong, and @scrub_ctx accounting will be updated
|
|
|
|
* if it's a data corruption.
|
|
|
|
*/
|
|
|
|
static int check_tree_mirror(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
char *data, u64 bytenr, int mirror)
|
|
|
|
{
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
u32 nodesize = fs_info->nodesize;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
|
|
|
|
/* Such error will be reported by check_tree_block() */
|
|
|
|
scrub_ctx->verify_errors++;
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
|
|
|
eb = btrfs_find_create_tree_block(fs_info, bytenr, nodesize);
|
|
|
|
if (!eb)
|
|
|
|
return -ENOMEM;
|
|
|
|
if (data) {
|
|
|
|
memcpy(eb->data, data, nodesize);
|
|
|
|
} else {
|
|
|
|
ret = read_whole_eb(fs_info, eb, mirror);
|
|
|
|
if (ret) {
|
|
|
|
scrub_ctx->read_errors++;
|
|
|
|
error("failed to read tree block %llu mirror %d",
|
|
|
|
bytenr, mirror);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
scrub_ctx->tree_bytes_scrubbed += nodesize;
|
|
|
|
if (csum_tree_block(fs_info, eb, 1)) {
|
|
|
|
error("tree block %llu mirror %d checksum mismatch", bytenr,
|
|
|
|
mirror);
|
|
|
|
scrub_ctx->csum_errors++;
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = check_tree_block(fs_info, eb);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("tree block %llu mirror %d is invalid", bytenr, mirror);
|
|
|
|
scrub_ctx->verify_errors++;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
scrub_ctx->tree_extents_scrubbed++;
|
|
|
|
out:
|
|
|
|
free_extent_buffer(eb);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* read_extent_data() helper
|
|
|
|
*
|
|
|
|
* This function will handle short read and update @scrub_ctx when read
|
|
|
|
* error happens.
|
|
|
|
*/
|
|
|
|
static int read_extent_data_loop(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
char *buf, u64 start, u64 len, int mirror)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
u64 cur = 0;
|
|
|
|
|
|
|
|
while (cur < len) {
|
|
|
|
u64 read_len = len - cur;
|
|
|
|
|
|
|
|
ret = read_extent_data(fs_info, buf + cur,
|
|
|
|
start + cur, &read_len, mirror);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to read out data at bytenr %llu mirror %d",
|
|
|
|
start + cur, mirror);
|
|
|
|
scrub_ctx->read_errors++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
cur += read_len;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Recover all other (corrupted) mirrors for tree block.
|
|
|
|
*
|
|
|
|
* The method is quite simple, just read out the correct mirror specified by
|
|
|
|
* @good_mirror and write back correct data to all other blocks
|
|
|
|
*/
|
|
|
|
static int recover_tree_mirror(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
u64 start, int good_mirror)
|
|
|
|
{
|
|
|
|
char *buf;
|
|
|
|
u32 nodesize = fs_info->nodesize;
|
|
|
|
int i;
|
|
|
|
int num_copies;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
buf = malloc(nodesize);
|
|
|
|
if (!buf)
|
|
|
|
return -ENOMEM;
|
|
|
|
ret = read_extent_data_loop(fs_info, scrub_ctx, buf, start, nodesize,
|
|
|
|
good_mirror);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to read tree block at bytenr %llu mirror %d",
|
|
|
|
start, good_mirror);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
num_copies = btrfs_num_copies(fs_info, start, nodesize);
|
|
|
|
for (i = 0; i <= num_copies; i++) {
|
|
|
|
if (i == good_mirror)
|
|
|
|
continue;
|
|
|
|
ret = write_data_to_disk(fs_info, buf, start, nodesize, i);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to write tree block at bytenr %llu mirror %d",
|
|
|
|
start, i);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
free(buf);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-12-26 06:29:31 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check one data mirror given by @start @len and @mirror, or @data
|
|
|
|
* If @data is not given, try to read it from disk.
|
|
|
|
* This function will try to read out all the data then check sum.
|
|
|
|
*
|
|
|
|
* If @data is given, just use the data.
|
|
|
|
* This behavior is useful for RAID5/6 recovery code to verify recovered data.
|
|
|
|
*
|
|
|
|
* If @corrupt_bitmap is given, restore corrupted sector to that bitmap.
|
|
|
|
* This is useful for mirror based profiles to recover its data.
|
|
|
|
*
|
|
|
|
* Return 0 if everything is OK.
|
|
|
|
* Return <0 if something goes wrong, and @scrub_ctx accounting will be updated
|
|
|
|
* if it's a data corruption.
|
|
|
|
*/
|
|
|
|
static int check_data_mirror(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
char *data, u64 start, u64 len, int mirror,
|
|
|
|
unsigned long *corrupt_bitmap)
|
|
|
|
{
|
|
|
|
u32 sectorsize = fs_info->sectorsize;
|
|
|
|
u32 data_csum;
|
|
|
|
u32 *csums = NULL;
|
|
|
|
char *buf = NULL;
|
|
|
|
int ret = 0;
|
|
|
|
int err = 0;
|
|
|
|
int i;
|
|
|
|
unsigned long *csum_bitmap = NULL;
|
|
|
|
|
|
|
|
if (!data) {
|
|
|
|
buf = malloc(len);
|
|
|
|
if (!buf)
|
|
|
|
return -ENOMEM;
|
|
|
|
ret = read_extent_data_loop(fs_info, scrub_ctx, buf, start,
|
|
|
|
len, mirror);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
scrub_ctx->data_bytes_scrubbed += len;
|
|
|
|
} else {
|
|
|
|
buf = data;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Alloc and Check csums */
|
|
|
|
csums = malloc(len / sectorsize * sizeof(data_csum));
|
|
|
|
if (!csums) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
csum_bitmap = malloc(calculate_bitmap_len(len / sectorsize));
|
|
|
|
if (!csum_bitmap) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (corrupt_bitmap)
|
|
|
|
memset(corrupt_bitmap, 0,
|
|
|
|
calculate_bitmap_len(len / sectorsize));
|
|
|
|
ret = btrfs_read_data_csums(fs_info, start, len, csums, csum_bitmap);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
for (i = 0; i < len / sectorsize; i++) {
|
|
|
|
if (!test_bit(i, csum_bitmap)) {
|
|
|
|
scrub_ctx->csum_discards++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
data_csum = ~(u32)0;
|
|
|
|
data_csum = btrfs_csum_data(buf + i * sectorsize, data_csum,
|
|
|
|
sectorsize);
|
|
|
|
btrfs_csum_final(data_csum, (u8 *)&data_csum);
|
|
|
|
|
|
|
|
if (memcmp(&data_csum, (char *)csums + i * sizeof(data_csum),
|
|
|
|
sizeof(data_csum))) {
|
|
|
|
error("data at bytenr %llu mirror %d csum mismatch, have 0x%08x expect 0x%08x",
|
|
|
|
start + i * sectorsize, mirror, data_csum,
|
|
|
|
*(u32 *)((char *)csums + i * sizeof(data_csum)));
|
|
|
|
err = 1;
|
|
|
|
scrub_ctx->csum_errors++;
|
|
|
|
if (corrupt_bitmap)
|
|
|
|
set_bit(i, corrupt_bitmap);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
scrub_ctx->data_bytes_scrubbed += sectorsize;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
if (!data)
|
|
|
|
free(buf);
|
|
|
|
free(csums);
|
|
|
|
free(csum_bitmap);
|
|
|
|
|
|
|
|
if (!ret && err)
|
|
|
|
return -EIO;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Look for a mirror whose copy of sector @bit is not marked corrupted.
 *
 * @corrupt_bitmaps: one bitmap per mirror, indexed by mirror number - 1
 * @good_mirror:     if not NULL, set to the 1-based number of the first
 *                   good mirror found; untouched when there is none
 *
 * Return 1 if a good mirror exists, 0 otherwise.
 */
static int has_good_mirror(unsigned long *corrupt_bitmaps[], int num_copies,
			   int bit, int *good_mirror)
{
	int idx;

	for (idx = 0; idx < num_copies; idx++) {
		if (test_bit(bit, corrupt_bitmaps[idx]))
			continue;

		if (good_mirror)
			*good_mirror = idx + 1;
		return 1;
	}
	return 0;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function to check @corrupt_bitmaps, to verify if it's recoverable
|
|
|
|
* for mirror based data extent.
|
|
|
|
*
|
|
|
|
* Return 1 for recoverable, and 0 for not recoverable
|
|
|
|
*/
|
|
|
|
static int check_data_mirror_recoverable(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 start, u64 len, u32 sectorsize,
|
|
|
|
unsigned long *corrupt_bitmaps[])
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int corrupted = 0;
|
|
|
|
int bit;
|
|
|
|
int num_copies = btrfs_num_copies(fs_info, start, len);
|
|
|
|
|
|
|
|
for (i = 0; i < num_copies; i++) {
|
|
|
|
for_each_set_bit(bit, corrupt_bitmaps[i], len / sectorsize) {
|
|
|
|
if (!has_good_mirror(corrupt_bitmaps, num_copies,
|
|
|
|
bit, NULL)) {
|
|
|
|
corrupted = 1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
return !corrupted;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to recover all corrupted sectors specified by @corrupt_bitmaps,
|
|
|
|
* by reading out good sector in other mirror.
|
|
|
|
*/
|
|
|
|
static int recover_data_mirror(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
u64 start, u64 len,
|
|
|
|
unsigned long *corrupt_bitmaps[])
|
|
|
|
{
|
|
|
|
char *buf;
|
|
|
|
u32 sectorsize = fs_info->sectorsize;
|
|
|
|
int ret = 0;
|
|
|
|
int bit;
|
|
|
|
int i;
|
|
|
|
int bad_mirror;
|
|
|
|
int num_copies;
|
|
|
|
|
|
|
|
/* Don't bother to recover unrecoverable extents */
|
|
|
|
if (!check_data_mirror_recoverable(fs_info, start, len,
|
|
|
|
sectorsize, corrupt_bitmaps))
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
buf = malloc(sectorsize);
|
|
|
|
if (!buf)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
num_copies = btrfs_num_copies(fs_info, start, len);
|
|
|
|
for (i = 0; i < num_copies; i++) {
|
2016-12-26 06:29:32 +00:00
|
|
|
for_each_set_bit(bit, corrupt_bitmaps[i], len / sectorsize) {
|
2016-12-26 06:29:31 +00:00
|
|
|
u64 cur = start + bit * sectorsize;
|
|
|
|
int good;
|
|
|
|
|
|
|
|
/* Find good mirror */
|
|
|
|
ret = has_good_mirror(corrupt_bitmaps, num_copies, bit,
|
|
|
|
&good);
|
|
|
|
if (!ret) {
|
|
|
|
error("failed to find good mirror for bytenr %llu",
|
|
|
|
cur);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/* Read out good mirror */
|
|
|
|
ret = read_data_from_disk(fs_info, buf, cur,
|
|
|
|
sectorsize, good);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to read good mirror from bytenr %llu mirror %d",
|
|
|
|
cur, good);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/* Write back to all other mirrors */
|
|
|
|
for (bad_mirror = 1; bad_mirror <= num_copies;
|
|
|
|
bad_mirror++) {
|
|
|
|
if (bad_mirror == good)
|
|
|
|
continue;
|
|
|
|
ret = write_data_to_disk(fs_info, buf, cur,
|
|
|
|
sectorsize, bad_mirror);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to recover mirror for bytenr %llu mirror %d",
|
|
|
|
cur, bad_mirror);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
free(buf);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-12-26 06:29:32 +00:00
|
|
|
|
|
|
|
/* For now, btrfs supports at most 2 copies of data */
#define BTRFS_MAX_COPIES	2
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check all copies of range @start, @len.
|
|
|
|
* Caller must ensure the range is covered by EXTENT_ITEM/METADATA_ITEM
|
|
|
|
* specified by leaf of @path.
|
|
|
|
* And @start, @len must be a subset of the EXTENT_ITEM/METADATA_ITEM.
|
|
|
|
*
|
|
|
|
* Return 0 if the range is all OK or recovered or recoverable.
|
|
|
|
* Return <0 if the range can't be recoverable.
|
|
|
|
*/
|
|
|
|
static int scrub_one_extent(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
struct btrfs_path *path, u64 start, u64 len,
|
|
|
|
int write)
|
|
|
|
{
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_extent_item *ei;
|
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
u32 sectorsize = fs_info->sectorsize;
|
|
|
|
unsigned long *corrupt_bitmaps[BTRFS_MAX_COPIES] = { NULL };
|
|
|
|
int slot = path->slots[0];
|
|
|
|
int num_copies;
|
|
|
|
int meta_corrupted = 0;
|
|
|
|
int meta_good_mirror = 0;
|
|
|
|
int data_bad_mirror = 0;
|
|
|
|
u64 extent_start;
|
|
|
|
u64 extent_len;
|
|
|
|
int metadata = 0;
|
|
|
|
int i;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
|
|
|
if (key.type != BTRFS_METADATA_ITEM_KEY &&
|
|
|
|
key.type != BTRFS_EXTENT_ITEM_KEY)
|
|
|
|
goto invalid_arg;
|
|
|
|
|
|
|
|
extent_start = key.objectid;
|
|
|
|
if (key.type == BTRFS_METADATA_ITEM_KEY) {
|
|
|
|
extent_len = fs_info->nodesize;
|
|
|
|
metadata = 1;
|
|
|
|
} else {
|
|
|
|
extent_len = key.offset;
|
|
|
|
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
|
|
|
|
if (btrfs_extent_flags(leaf, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK)
|
|
|
|
metadata = 1;
|
|
|
|
}
|
|
|
|
if (start >= extent_start + extent_len ||
|
|
|
|
start + len <= extent_start)
|
|
|
|
goto invalid_arg;
|
|
|
|
|
|
|
|
for (i = 0; i < BTRFS_MAX_COPIES; i++) {
|
|
|
|
corrupt_bitmaps[i] = malloc(
|
|
|
|
calculate_bitmap_len(len / sectorsize));
|
|
|
|
if (!corrupt_bitmaps[i])
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
num_copies = btrfs_num_copies(fs_info, start, len);
|
|
|
|
for (i = 1; i <= num_copies; i++) {
|
|
|
|
if (metadata) {
|
|
|
|
ret = check_tree_mirror(fs_info, scrub_ctx,
|
|
|
|
NULL, extent_start, i);
|
|
|
|
scrub_ctx->tree_extents_scrubbed++;
|
|
|
|
if (ret < 0)
|
|
|
|
meta_corrupted++;
|
|
|
|
else
|
|
|
|
meta_good_mirror = i;
|
|
|
|
} else {
|
|
|
|
ret = check_data_mirror(fs_info, scrub_ctx, NULL, start,
|
|
|
|
len, i, corrupt_bitmaps[i - 1]);
|
|
|
|
scrub_ctx->data_extents_scrubbed++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Metadata recover and report */
|
|
|
|
if (metadata) {
|
|
|
|
if (!meta_corrupted) {
|
|
|
|
goto out;
|
|
|
|
} else if (meta_corrupted && meta_corrupted < num_copies) {
|
|
|
|
if (write) {
|
|
|
|
ret = recover_tree_mirror(fs_info, scrub_ctx,
|
|
|
|
start, meta_good_mirror);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to recover tree block at bytenr %llu",
|
|
|
|
start);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
printf("extent %llu len %llu REPAIRED: has corrupted mirror, repaired\n",
|
|
|
|
start, len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
printf("extent %llu len %llu RECOVERABLE: has corrupted mirror, but is recoverable\n",
|
|
|
|
start, len);
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
error("extent %llu len %llu CORRUPTED: all mirror(s) corrupted, can't be recovered",
|
|
|
|
start, len);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Data recover and report */
|
|
|
|
for (i = 0; i < num_copies; i++) {
|
|
|
|
if (find_first_bit(corrupt_bitmaps[i], len / sectorsize) >=
|
|
|
|
len / sectorsize)
|
|
|
|
continue;
|
|
|
|
data_bad_mirror = i + 1;
|
|
|
|
}
|
|
|
|
/* All data sectors are good */
|
|
|
|
if (!data_bad_mirror) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (check_data_mirror_recoverable(fs_info, start, len,
|
|
|
|
sectorsize, corrupt_bitmaps)) {
|
|
|
|
if (write) {
|
|
|
|
ret = recover_data_mirror(fs_info, scrub_ctx, start,
|
|
|
|
len, corrupt_bitmaps);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to recover data extent at bytenr %llu len %llu",
|
|
|
|
start, len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
printf("extent %llu len %llu REPARIED: has corrupted mirror, repaired\n",
|
|
|
|
start, len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
printf("extent %llu len %llu RECOVERABLE: has corrupted mirror, recoverable\n",
|
|
|
|
start, len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
error("extent %llu len %llu CORRUPTED, all mirror(s) corrupted, can't be repaired",
|
|
|
|
start, len);
|
|
|
|
ret = -EIO;
|
|
|
|
out:
|
|
|
|
for (i = 0; i < BTRFS_MAX_COPIES; i++)
|
|
|
|
kfree(corrupt_bitmaps[i]);
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
invalid_arg:
|
|
|
|
error("invalid parameter for %s", __func__);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2016-12-26 06:29:33 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Scrub one full data stripe of RAID5/6.
|
|
|
|
* This means it will check any data/metadata extent in the data stripe
|
|
|
|
* spcified by @stripe and @stripe_len
|
|
|
|
*
|
|
|
|
* This function will only *CHECK* if the data stripe has any corruption.
|
|
|
|
* Won't repair at this function.
|
|
|
|
*
|
|
|
|
* Return 0 if the full stripe is OK.
|
|
|
|
* Return <0 if any error is found.
|
|
|
|
* Note: Missing csum is not counted as error (NODATACSUM is valid)
|
|
|
|
*/
|
|
|
|
static int scrub_one_data_stripe(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
struct scrub_stripe *stripe, u32 stripe_len)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_root *extent_root = fs_info->extent_root;
|
|
|
|
struct btrfs_key key;
|
|
|
|
u64 extent_start;
|
|
|
|
u64 extent_len;
|
|
|
|
u64 orig_csum_discards;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!is_data_stripe(stripe))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = stripe->logical + stripe_len;
|
|
|
|
key.offset = 0;
|
|
|
|
key.type = 0;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
while (1) {
|
|
|
|
struct btrfs_extent_item *ei;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
char *data;
|
|
|
|
int slot;
|
|
|
|
int metadata = 0;
|
|
|
|
u64 check_start;
|
|
|
|
u64 check_len;
|
|
|
|
|
|
|
|
ret = btrfs_previous_extent_item(extent_root, path, 0);
|
|
|
|
if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
eb = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
btrfs_item_key_to_cpu(eb, &key, slot);
|
|
|
|
extent_start = key.objectid;
|
|
|
|
ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
|
|
|
|
|
|
|
|
/* tree block scrub */
|
|
|
|
if (key.type == BTRFS_METADATA_ITEM_KEY ||
|
|
|
|
btrfs_extent_flags(eb, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
|
|
|
|
extent_len = extent_root->fs_info->nodesize;
|
|
|
|
metadata = 1;
|
|
|
|
} else {
|
|
|
|
extent_len = key.offset;
|
|
|
|
metadata = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Current extent is out of our range, loop comes to end */
|
|
|
|
if (extent_start + extent_len <= stripe->logical)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (metadata) {
|
|
|
|
/*
|
|
|
|
* Check crossing stripe first, which can't be scrubbed
|
|
|
|
*/
|
|
|
|
if (check_crossing_stripes(fs_info, extent_start,
|
|
|
|
extent_root->fs_info->nodesize)) {
|
|
|
|
error("tree block at %llu is crossing stripe boundary, unable to scrub",
|
|
|
|
extent_start);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
data = stripe->data + extent_start - stripe->logical;
|
|
|
|
ret = check_tree_mirror(fs_info, scrub_ctx,
|
|
|
|
data, extent_start, 0);
|
|
|
|
/* Any csum/verify error means the stripe is screwed */
|
|
|
|
if (ret < 0) {
|
|
|
|
stripe->csum_mismatch = 1;
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* Restrict the extent range to fit stripe range */
|
|
|
|
check_start = max(extent_start, stripe->logical);
|
|
|
|
check_len = min(extent_start + extent_len, stripe->logical +
|
|
|
|
stripe_len) - check_start;
|
|
|
|
|
|
|
|
/* Record original csum_discards to detect missing csum case */
|
|
|
|
orig_csum_discards = scrub_ctx->csum_discards;
|
|
|
|
|
|
|
|
data = stripe->data + check_start - stripe->logical;
|
|
|
|
ret = check_data_mirror(fs_info, scrub_ctx, data, check_start,
|
|
|
|
check_len, 0, NULL);
|
|
|
|
/* Csum mismatch, no need to continue anyway*/
|
|
|
|
if (ret < 0) {
|
|
|
|
stripe->csum_mismatch = 1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/* Check if there is any missing csum for data */
|
|
|
|
if (scrub_ctx->csum_discards != orig_csum_discards)
|
|
|
|
stripe->csum_missing = 1;
|
|
|
|
/*
|
|
|
|
* Only increase data_extents_scrubbed if we are scrubbing the
|
|
|
|
* tailing part of the data extent
|
|
|
|
*/
|
|
|
|
if (extent_start + extent_len <= stripe->logical + stripe_len)
|
|
|
|
scrub_ctx->data_extents_scrubbed++;
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-12-26 06:29:34 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify parities for RAID56
|
|
|
|
* Caller must fill @fstripe before calling this function
|
|
|
|
*
|
|
|
|
* Return 0 for parities matches.
|
|
|
|
* Return >0 for P or Q mismatch
|
|
|
|
* Return <0 for fatal error
|
|
|
|
*/
|
|
|
|
static int verify_parities(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
struct scrub_full_stripe *fstripe)
|
|
|
|
{
|
|
|
|
void **ptrs;
|
|
|
|
void *ondisk_p = NULL;
|
|
|
|
void *ondisk_q = NULL;
|
|
|
|
void *buf_p;
|
|
|
|
void *buf_q;
|
|
|
|
int nr_stripes = fstripe->nr_stripes;
|
|
|
|
int stripe_len = BTRFS_STRIPE_LEN;
|
|
|
|
int i;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
|
|
|
|
buf_p = malloc(fstripe->stripe_len);
|
|
|
|
buf_q = malloc(fstripe->stripe_len);
|
|
|
|
if (!ptrs || !buf_p || !buf_q) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < fstripe->nr_stripes; i++) {
|
|
|
|
struct scrub_stripe *stripe = &fstripe->stripes[i];
|
|
|
|
|
|
|
|
if (stripe->logical == BTRFS_RAID5_P_STRIPE) {
|
|
|
|
ondisk_p = stripe->data;
|
|
|
|
ptrs[i] = buf_p;
|
|
|
|
continue;
|
|
|
|
} else if (stripe->logical == BTRFS_RAID6_Q_STRIPE) {
|
|
|
|
ondisk_q = stripe->data;
|
|
|
|
ptrs[i] = buf_q;
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
ptrs[i] = stripe->data;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* RAID6 */
|
|
|
|
if (ondisk_q) {
|
|
|
|
raid6_gen_syndrome(nr_stripes, stripe_len, ptrs);
|
|
|
|
|
|
|
|
if (memcmp(ondisk_q, ptrs[nr_stripes - 1], stripe_len) != 0 ||
|
|
|
|
memcmp(ondisk_p, ptrs[nr_stripes - 2], stripe_len))
|
|
|
|
ret = 1;
|
|
|
|
} else {
|
|
|
|
ret = raid5_gen_result(nr_stripes, stripe_len, nr_stripes - 1,
|
|
|
|
ptrs);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (memcmp(ondisk_p, ptrs[nr_stripes - 1], stripe_len) != 0)
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
free(buf_p);
|
|
|
|
free(buf_q);
|
|
|
|
free(ptrs);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-12-26 06:29:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to recover data stripe from P or Q stripe
|
|
|
|
*
|
|
|
|
* Return >0 if it can't be require any more.
|
|
|
|
* Return 0 for successful repair or no need to repair at all
|
|
|
|
* Return <0 for fatal error
|
|
|
|
*/
|
|
|
|
static int recover_from_parities(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
struct scrub_full_stripe *fstripe)
|
|
|
|
{
|
|
|
|
void **ptrs;
|
|
|
|
int nr_stripes = fstripe->nr_stripes;
|
|
|
|
int stripe_len = BTRFS_STRIPE_LEN;
|
|
|
|
int max_tolerance;
|
|
|
|
int i;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* No need to recover */
|
|
|
|
if (!fstripe->nr_corrupted_stripes)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Already recovered once, no more chance */
|
|
|
|
if (fstripe->recovered)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID5)
|
|
|
|
max_tolerance = 1;
|
|
|
|
else
|
|
|
|
max_tolerance = 2;
|
|
|
|
|
|
|
|
/* Out of repair */
|
|
|
|
if (fstripe->nr_corrupted_stripes > max_tolerance)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
|
|
|
|
if (!ptrs)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* Construct ptrs */
|
|
|
|
for (i = 0; i < nr_stripes; i++)
|
|
|
|
ptrs[i] = fstripe->stripes[i].data;
|
|
|
|
|
|
|
|
ret = raid56_recov(nr_stripes, stripe_len, fstripe->bg_type,
|
|
|
|
fstripe->corrupted_index[0],
|
|
|
|
fstripe->corrupted_index[1], ptrs);
|
|
|
|
fstripe->recovered = 1;
|
|
|
|
free(ptrs);
|
|
|
|
return ret;
|
|
|
|
}
|