2016-12-26 06:29:29 +00:00
|
|
|
/*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public
|
|
|
|
* License v2 as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* General Public License for more details.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Main part to implement offline(unmounted) btrfs scrub
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <unistd.h>
|
|
|
|
#include "ctree.h"
|
|
|
|
#include "volumes.h"
|
|
|
|
#include "disk-io.h"
|
|
|
|
#include "utils.h"
|
2016-12-26 06:29:31 +00:00
|
|
|
#include "kernel-lib/bitops.h"
|
btrfs-progs: scrub: Introduce offline scrub function
Now, btrfs-progs has a kernel scrub equivalent.
A new option, --offline is added to "btrfs scrub start".
If --offline is given, btrfs scrub will just act like kernel scrub, to
check every copy of extent and do a report on corrupted data and if it's
recoverable.
The advantage compare to kernel scrub is:
1) No race
Unlike kernel scrub, which is done in parallel, offline scrub is done
by a single thread.
Although it may be slower than kernel one, it's safer and no false
alert.
2) Correctness
Kernel has a known bug (fix submitted) which will recovery RAID5/6
data but screw up P/Q, due to the hardness coding in kernel.
While in btrfs-progs, no page, (almost) no memory size limit, we're
can focus on the scrub, and make things easier.
New offline scrub can detect and report P/Q corruption with
recoverability report, while kernel will only report data stripe error.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Su <suy.fnst@cn.fujitsu.com>
Signed-off-by: Gu Jinxiang <gujx@cn.fujitsu.com>
2017-03-28 13:31:48 +00:00
|
|
|
#include "task-utils.h"
|
2016-12-26 06:29:34 +00:00
|
|
|
#include "kernel-lib/raid56.h"
|
2016-12-26 06:29:29 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For parity based profile (RAID56)
|
|
|
|
* Mirror/stripe based on won't need this. They are iterated by bytenr and
|
|
|
|
* mirror number.
|
|
|
|
*/
|
|
|
|
struct scrub_stripe {
|
|
|
|
/* For P/Q logical start will be BTRFS_RAID5/6_P/Q_STRIPE */
|
|
|
|
u64 logical;
|
|
|
|
|
|
|
|
u64 physical;
|
|
|
|
|
|
|
|
/* Device is missing */
|
|
|
|
unsigned int dev_missing:1;
|
|
|
|
|
|
|
|
/* Any tree/data csum mismatches */
|
|
|
|
unsigned int csum_mismatch:1;
|
|
|
|
|
|
|
|
/* Some data doesn't have csum (nodatasum) */
|
|
|
|
unsigned int csum_missing:1;
|
|
|
|
|
|
|
|
/* Device fd, to write correct data back to disc */
|
|
|
|
int fd;
|
|
|
|
|
|
|
|
char *data;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RAID56 full stripe (data stripes + P/Q)
|
|
|
|
*/
|
|
|
|
struct scrub_full_stripe {
|
|
|
|
u64 logical_start;
|
|
|
|
u64 logical_len;
|
|
|
|
u64 bg_type;
|
|
|
|
u32 nr_stripes;
|
|
|
|
u32 stripe_len;
|
|
|
|
|
|
|
|
/* Read error stripes */
|
|
|
|
u32 err_read_stripes;
|
|
|
|
|
|
|
|
/* Missing devices */
|
|
|
|
u32 err_missing_devs;
|
|
|
|
|
|
|
|
/* Csum error data stripes */
|
|
|
|
u32 err_csum_dstripes;
|
|
|
|
|
|
|
|
/* Missing csum data stripes */
|
|
|
|
u32 missing_csum_dstripes;
|
|
|
|
|
|
|
|
/* currupted stripe index */
|
|
|
|
int corrupted_index[2];
|
|
|
|
|
|
|
|
int nr_corrupted_stripes;
|
|
|
|
|
|
|
|
/* Already recovered once? */
|
|
|
|
unsigned int recovered:1;
|
|
|
|
|
|
|
|
struct scrub_stripe stripes[];
|
|
|
|
};
|
|
|
|
|
|
|
|
static void free_full_stripe(struct scrub_full_stripe *fstripe)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < fstripe->nr_stripes; i++)
|
|
|
|
free(fstripe->stripes[i].data);
|
|
|
|
free(fstripe);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct scrub_full_stripe *alloc_full_stripe(int nr_stripes,
|
|
|
|
u32 stripe_len)
|
|
|
|
{
|
|
|
|
struct scrub_full_stripe *ret;
|
|
|
|
int size = sizeof(*ret) + sizeof(unsigned long *) +
|
|
|
|
nr_stripes * sizeof(struct scrub_stripe);
|
|
|
|
int i;
|
|
|
|
|
|
|
|
ret = malloc(size);
|
|
|
|
if (!ret)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
memset(ret, 0, size);
|
|
|
|
ret->nr_stripes = nr_stripes;
|
|
|
|
ret->stripe_len = stripe_len;
|
|
|
|
ret->corrupted_index[0] = -1;
|
|
|
|
ret->corrupted_index[1] = -1;
|
|
|
|
|
|
|
|
/* Alloc data memory for each stripe */
|
|
|
|
for (i = 0; i < nr_stripes; i++) {
|
|
|
|
struct scrub_stripe *stripe = &ret->stripes[i];
|
|
|
|
|
|
|
|
stripe->data = malloc(stripe_len);
|
|
|
|
if (!stripe->data) {
|
|
|
|
free_full_stripe(ret);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
2016-12-26 06:29:30 +00:00
|
|
|
|
|
|
|
static inline int is_data_stripe(struct scrub_stripe *stripe)
|
|
|
|
{
|
|
|
|
u64 bytenr = stripe->logical;
|
|
|
|
|
|
|
|
if (bytenr == BTRFS_RAID5_P_STRIPE || bytenr == BTRFS_RAID6_Q_STRIPE)
|
|
|
|
return 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check one tree mirror given by @bytenr and @mirror, or @data.
|
|
|
|
* If @data is not given (NULL), the function will try to read out tree block
|
|
|
|
* using @bytenr and @mirror.
|
|
|
|
* If @data is given, use data directly, won't try to read from disk.
|
|
|
|
*
|
|
|
|
* The extra @data prameter is handy for RAID5/6 recovery code to verify
|
|
|
|
* the recovered data.
|
|
|
|
*
|
|
|
|
* Return 0 if everything is OK.
|
|
|
|
* Return <0 something goes wrong, and @scrub_ctx accounting will be updated
|
|
|
|
* if it's a data corruption.
|
|
|
|
*/
|
|
|
|
static int check_tree_mirror(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
char *data, u64 bytenr, int mirror)
|
|
|
|
{
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
u32 nodesize = fs_info->nodesize;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
|
|
|
|
/* Such error will be reported by check_tree_block() */
|
|
|
|
scrub_ctx->verify_errors++;
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
2017-09-12 12:35:57 +00:00
|
|
|
eb = btrfs_find_create_tree_block(fs_info, bytenr);
|
2016-12-26 06:29:30 +00:00
|
|
|
if (!eb)
|
|
|
|
return -ENOMEM;
|
|
|
|
if (data) {
|
|
|
|
memcpy(eb->data, data, nodesize);
|
|
|
|
} else {
|
|
|
|
ret = read_whole_eb(fs_info, eb, mirror);
|
|
|
|
if (ret) {
|
|
|
|
scrub_ctx->read_errors++;
|
|
|
|
error("failed to read tree block %llu mirror %d",
|
|
|
|
bytenr, mirror);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
scrub_ctx->tree_bytes_scrubbed += nodesize;
|
|
|
|
if (csum_tree_block(fs_info, eb, 1)) {
|
|
|
|
error("tree block %llu mirror %d checksum mismatch", bytenr,
|
|
|
|
mirror);
|
|
|
|
scrub_ctx->csum_errors++;
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = check_tree_block(fs_info, eb);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("tree block %llu mirror %d is invalid", bytenr, mirror);
|
|
|
|
scrub_ctx->verify_errors++;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
scrub_ctx->tree_extents_scrubbed++;
|
|
|
|
out:
|
|
|
|
free_extent_buffer(eb);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* read_extent_data() helper
|
|
|
|
*
|
|
|
|
* This function will handle short read and update @scrub_ctx when read
|
|
|
|
* error happens.
|
|
|
|
*/
|
|
|
|
static int read_extent_data_loop(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
char *buf, u64 start, u64 len, int mirror)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
u64 cur = 0;
|
|
|
|
|
|
|
|
while (cur < len) {
|
|
|
|
u64 read_len = len - cur;
|
|
|
|
|
|
|
|
ret = read_extent_data(fs_info, buf + cur,
|
|
|
|
start + cur, &read_len, mirror);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to read out data at bytenr %llu mirror %d",
|
|
|
|
start + cur, mirror);
|
|
|
|
scrub_ctx->read_errors++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
cur += read_len;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Recover all other (corrupted) mirrors for tree block.
|
|
|
|
*
|
|
|
|
* The method is quite simple, just read out the correct mirror specified by
|
|
|
|
* @good_mirror and write back correct data to all other blocks
|
|
|
|
*/
|
|
|
|
static int recover_tree_mirror(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
u64 start, int good_mirror)
|
|
|
|
{
|
|
|
|
char *buf;
|
|
|
|
u32 nodesize = fs_info->nodesize;
|
|
|
|
int i;
|
|
|
|
int num_copies;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
buf = malloc(nodesize);
|
|
|
|
if (!buf)
|
|
|
|
return -ENOMEM;
|
|
|
|
ret = read_extent_data_loop(fs_info, scrub_ctx, buf, start, nodesize,
|
|
|
|
good_mirror);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to read tree block at bytenr %llu mirror %d",
|
|
|
|
start, good_mirror);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
num_copies = btrfs_num_copies(fs_info, start, nodesize);
|
|
|
|
for (i = 0; i <= num_copies; i++) {
|
|
|
|
if (i == good_mirror)
|
|
|
|
continue;
|
|
|
|
ret = write_data_to_disk(fs_info, buf, start, nodesize, i);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to write tree block at bytenr %llu mirror %d",
|
|
|
|
start, i);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
free(buf);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-12-26 06:29:31 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check one data mirror given by @start @len and @mirror, or @data
|
|
|
|
* If @data is not given, try to read it from disk.
|
|
|
|
* This function will try to read out all the data then check sum.
|
|
|
|
*
|
|
|
|
* If @data is given, just use the data.
|
|
|
|
* This behavior is useful for RAID5/6 recovery code to verify recovered data.
|
|
|
|
*
|
|
|
|
* If @corrupt_bitmap is given, restore corrupted sector to that bitmap.
|
|
|
|
* This is useful for mirror based profiles to recover its data.
|
|
|
|
*
|
|
|
|
* Return 0 if everything is OK.
|
|
|
|
* Return <0 if something goes wrong, and @scrub_ctx accounting will be updated
|
|
|
|
* if it's a data corruption.
|
|
|
|
*/
|
|
|
|
static int check_data_mirror(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
char *data, u64 start, u64 len, int mirror,
|
|
|
|
unsigned long *corrupt_bitmap)
|
|
|
|
{
|
|
|
|
u32 sectorsize = fs_info->sectorsize;
|
|
|
|
u32 data_csum;
|
|
|
|
u32 *csums = NULL;
|
|
|
|
char *buf = NULL;
|
|
|
|
int ret = 0;
|
|
|
|
int err = 0;
|
|
|
|
int i;
|
|
|
|
unsigned long *csum_bitmap = NULL;
|
|
|
|
|
|
|
|
if (!data) {
|
|
|
|
buf = malloc(len);
|
|
|
|
if (!buf)
|
|
|
|
return -ENOMEM;
|
|
|
|
ret = read_extent_data_loop(fs_info, scrub_ctx, buf, start,
|
|
|
|
len, mirror);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
scrub_ctx->data_bytes_scrubbed += len;
|
|
|
|
} else {
|
|
|
|
buf = data;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Alloc and Check csums */
|
|
|
|
csums = malloc(len / sectorsize * sizeof(data_csum));
|
|
|
|
if (!csums) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
csum_bitmap = malloc(calculate_bitmap_len(len / sectorsize));
|
|
|
|
if (!csum_bitmap) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (corrupt_bitmap)
|
|
|
|
memset(corrupt_bitmap, 0,
|
|
|
|
calculate_bitmap_len(len / sectorsize));
|
|
|
|
ret = btrfs_read_data_csums(fs_info, start, len, csums, csum_bitmap);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
for (i = 0; i < len / sectorsize; i++) {
|
|
|
|
if (!test_bit(i, csum_bitmap)) {
|
|
|
|
scrub_ctx->csum_discards++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
data_csum = ~(u32)0;
|
|
|
|
data_csum = btrfs_csum_data(buf + i * sectorsize, data_csum,
|
|
|
|
sectorsize);
|
|
|
|
btrfs_csum_final(data_csum, (u8 *)&data_csum);
|
|
|
|
|
|
|
|
if (memcmp(&data_csum, (char *)csums + i * sizeof(data_csum),
|
|
|
|
sizeof(data_csum))) {
|
|
|
|
error("data at bytenr %llu mirror %d csum mismatch, have 0x%08x expect 0x%08x",
|
|
|
|
start + i * sectorsize, mirror, data_csum,
|
|
|
|
*(u32 *)((char *)csums + i * sizeof(data_csum)));
|
|
|
|
err = 1;
|
|
|
|
scrub_ctx->csum_errors++;
|
|
|
|
if (corrupt_bitmap)
|
|
|
|
set_bit(i, corrupt_bitmap);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
scrub_ctx->data_bytes_scrubbed += sectorsize;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
if (!data)
|
|
|
|
free(buf);
|
|
|
|
free(csums);
|
|
|
|
free(csum_bitmap);
|
|
|
|
|
|
|
|
if (!ret && err)
|
|
|
|
return -EIO;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Helper to check all mirrors for a good copy */
|
|
|
|
static int has_good_mirror(unsigned long *corrupt_bitmaps[], int num_copies,
|
|
|
|
int bit, int *good_mirror)
|
|
|
|
{
|
|
|
|
int found_good = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < num_copies; i++) {
|
|
|
|
if (!test_bit(bit, corrupt_bitmaps[i])) {
|
|
|
|
found_good = 1;
|
|
|
|
if (good_mirror)
|
|
|
|
*good_mirror = i + 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return found_good;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function to check @corrupt_bitmaps, to verify if it's recoverable
|
|
|
|
* for mirror based data extent.
|
|
|
|
*
|
|
|
|
* Return 1 for recoverable, and 0 for not recoverable
|
|
|
|
*/
|
|
|
|
static int check_data_mirror_recoverable(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 start, u64 len, u32 sectorsize,
|
|
|
|
unsigned long *corrupt_bitmaps[])
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int corrupted = 0;
|
|
|
|
int bit;
|
|
|
|
int num_copies = btrfs_num_copies(fs_info, start, len);
|
|
|
|
|
|
|
|
for (i = 0; i < num_copies; i++) {
|
|
|
|
for_each_set_bit(bit, corrupt_bitmaps[i], len / sectorsize) {
|
|
|
|
if (!has_good_mirror(corrupt_bitmaps, num_copies,
|
|
|
|
bit, NULL)) {
|
|
|
|
corrupted = 1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
return !corrupted;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to recover all corrupted sectors specified by @corrupt_bitmaps,
|
|
|
|
* by reading out good sector in other mirror.
|
|
|
|
*/
|
|
|
|
static int recover_data_mirror(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
u64 start, u64 len,
|
|
|
|
unsigned long *corrupt_bitmaps[])
|
|
|
|
{
|
|
|
|
char *buf;
|
|
|
|
u32 sectorsize = fs_info->sectorsize;
|
|
|
|
int ret = 0;
|
|
|
|
int bit;
|
|
|
|
int i;
|
|
|
|
int bad_mirror;
|
|
|
|
int num_copies;
|
|
|
|
|
|
|
|
/* Don't bother to recover unrecoverable extents */
|
|
|
|
if (!check_data_mirror_recoverable(fs_info, start, len,
|
|
|
|
sectorsize, corrupt_bitmaps))
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
buf = malloc(sectorsize);
|
|
|
|
if (!buf)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
num_copies = btrfs_num_copies(fs_info, start, len);
|
|
|
|
for (i = 0; i < num_copies; i++) {
|
2016-12-26 06:29:32 +00:00
|
|
|
for_each_set_bit(bit, corrupt_bitmaps[i], len / sectorsize) {
|
2016-12-26 06:29:31 +00:00
|
|
|
u64 cur = start + bit * sectorsize;
|
|
|
|
int good;
|
|
|
|
|
|
|
|
/* Find good mirror */
|
|
|
|
ret = has_good_mirror(corrupt_bitmaps, num_copies, bit,
|
|
|
|
&good);
|
|
|
|
if (!ret) {
|
|
|
|
error("failed to find good mirror for bytenr %llu",
|
|
|
|
cur);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/* Read out good mirror */
|
|
|
|
ret = read_data_from_disk(fs_info, buf, cur,
|
|
|
|
sectorsize, good);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to read good mirror from bytenr %llu mirror %d",
|
|
|
|
cur, good);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/* Write back to all other mirrors */
|
|
|
|
for (bad_mirror = 1; bad_mirror <= num_copies;
|
|
|
|
bad_mirror++) {
|
|
|
|
if (bad_mirror == good)
|
|
|
|
continue;
|
|
|
|
ret = write_data_to_disk(fs_info, buf, cur,
|
|
|
|
sectorsize, bad_mirror);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to recover mirror for bytenr %llu mirror %d",
|
|
|
|
cur, bad_mirror);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
free(buf);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-12-26 06:29:32 +00:00
|
|
|
|
|
|
|
/* Btrfs only supports up to 2 copies of data, yet */
|
|
|
|
#define BTRFS_MAX_COPIES 2
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check all copies of range @start, @len.
|
|
|
|
* Caller must ensure the range is covered by EXTENT_ITEM/METADATA_ITEM
|
|
|
|
* specified by leaf of @path.
|
|
|
|
* And @start, @len must be a subset of the EXTENT_ITEM/METADATA_ITEM.
|
|
|
|
*
|
|
|
|
* Return 0 if the range is all OK or recovered or recoverable.
|
|
|
|
* Return <0 if the range can't be recoverable.
|
|
|
|
*/
|
|
|
|
static int scrub_one_extent(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
struct btrfs_path *path, u64 start, u64 len,
|
|
|
|
int write)
|
|
|
|
{
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_extent_item *ei;
|
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
u32 sectorsize = fs_info->sectorsize;
|
|
|
|
unsigned long *corrupt_bitmaps[BTRFS_MAX_COPIES] = { NULL };
|
|
|
|
int slot = path->slots[0];
|
|
|
|
int num_copies;
|
|
|
|
int meta_corrupted = 0;
|
|
|
|
int meta_good_mirror = 0;
|
|
|
|
int data_bad_mirror = 0;
|
|
|
|
u64 extent_start;
|
|
|
|
u64 extent_len;
|
|
|
|
int metadata = 0;
|
|
|
|
int i;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
|
|
|
if (key.type != BTRFS_METADATA_ITEM_KEY &&
|
|
|
|
key.type != BTRFS_EXTENT_ITEM_KEY)
|
|
|
|
goto invalid_arg;
|
|
|
|
|
|
|
|
extent_start = key.objectid;
|
|
|
|
if (key.type == BTRFS_METADATA_ITEM_KEY) {
|
|
|
|
extent_len = fs_info->nodesize;
|
|
|
|
metadata = 1;
|
|
|
|
} else {
|
|
|
|
extent_len = key.offset;
|
|
|
|
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
|
|
|
|
if (btrfs_extent_flags(leaf, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK)
|
|
|
|
metadata = 1;
|
|
|
|
}
|
|
|
|
if (start >= extent_start + extent_len ||
|
|
|
|
start + len <= extent_start)
|
|
|
|
goto invalid_arg;
|
|
|
|
|
|
|
|
for (i = 0; i < BTRFS_MAX_COPIES; i++) {
|
|
|
|
corrupt_bitmaps[i] = malloc(
|
|
|
|
calculate_bitmap_len(len / sectorsize));
|
|
|
|
if (!corrupt_bitmaps[i])
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
num_copies = btrfs_num_copies(fs_info, start, len);
|
|
|
|
for (i = 1; i <= num_copies; i++) {
|
|
|
|
if (metadata) {
|
|
|
|
ret = check_tree_mirror(fs_info, scrub_ctx,
|
|
|
|
NULL, extent_start, i);
|
|
|
|
scrub_ctx->tree_extents_scrubbed++;
|
|
|
|
if (ret < 0)
|
|
|
|
meta_corrupted++;
|
|
|
|
else
|
|
|
|
meta_good_mirror = i;
|
|
|
|
} else {
|
|
|
|
ret = check_data_mirror(fs_info, scrub_ctx, NULL, start,
|
|
|
|
len, i, corrupt_bitmaps[i - 1]);
|
|
|
|
scrub_ctx->data_extents_scrubbed++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Metadata recover and report */
|
|
|
|
if (metadata) {
|
|
|
|
if (!meta_corrupted) {
|
|
|
|
goto out;
|
|
|
|
} else if (meta_corrupted && meta_corrupted < num_copies) {
|
|
|
|
if (write) {
|
|
|
|
ret = recover_tree_mirror(fs_info, scrub_ctx,
|
|
|
|
start, meta_good_mirror);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to recover tree block at bytenr %llu",
|
|
|
|
start);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
printf("extent %llu len %llu REPAIRED: has corrupted mirror, repaired\n",
|
|
|
|
start, len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
printf("extent %llu len %llu RECOVERABLE: has corrupted mirror, but is recoverable\n",
|
|
|
|
start, len);
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
error("extent %llu len %llu CORRUPTED: all mirror(s) corrupted, can't be recovered",
|
|
|
|
start, len);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Data recover and report */
|
|
|
|
for (i = 0; i < num_copies; i++) {
|
|
|
|
if (find_first_bit(corrupt_bitmaps[i], len / sectorsize) >=
|
|
|
|
len / sectorsize)
|
|
|
|
continue;
|
|
|
|
data_bad_mirror = i + 1;
|
|
|
|
}
|
|
|
|
/* All data sectors are good */
|
|
|
|
if (!data_bad_mirror) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (check_data_mirror_recoverable(fs_info, start, len,
|
|
|
|
sectorsize, corrupt_bitmaps)) {
|
|
|
|
if (write) {
|
|
|
|
ret = recover_data_mirror(fs_info, scrub_ctx, start,
|
|
|
|
len, corrupt_bitmaps);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("failed to recover data extent at bytenr %llu len %llu",
|
|
|
|
start, len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
printf("extent %llu len %llu REPARIED: has corrupted mirror, repaired\n",
|
|
|
|
start, len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
printf("extent %llu len %llu RECOVERABLE: has corrupted mirror, recoverable\n",
|
|
|
|
start, len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
error("extent %llu len %llu CORRUPTED, all mirror(s) corrupted, can't be repaired",
|
|
|
|
start, len);
|
|
|
|
ret = -EIO;
|
|
|
|
out:
|
|
|
|
for (i = 0; i < BTRFS_MAX_COPIES; i++)
|
|
|
|
kfree(corrupt_bitmaps[i]);
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
invalid_arg:
|
|
|
|
error("invalid parameter for %s", __func__);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2016-12-26 06:29:33 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Scrub one full data stripe of RAID5/6.
|
|
|
|
* This means it will check any data/metadata extent in the data stripe
|
|
|
|
* spcified by @stripe and @stripe_len
|
|
|
|
*
|
|
|
|
* This function will only *CHECK* if the data stripe has any corruption.
|
|
|
|
* Won't repair at this function.
|
|
|
|
*
|
|
|
|
* Return 0 if the full stripe is OK.
|
|
|
|
* Return <0 if any error is found.
|
|
|
|
* Note: Missing csum is not counted as error (NODATACSUM is valid)
|
|
|
|
*/
|
|
|
|
static int scrub_one_data_stripe(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
struct scrub_stripe *stripe, u32 stripe_len)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_root *extent_root = fs_info->extent_root;
|
|
|
|
struct btrfs_key key;
|
|
|
|
u64 extent_start;
|
|
|
|
u64 extent_len;
|
|
|
|
u64 orig_csum_discards;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!is_data_stripe(stripe))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = stripe->logical + stripe_len;
|
|
|
|
key.offset = 0;
|
|
|
|
key.type = 0;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
while (1) {
|
|
|
|
struct btrfs_extent_item *ei;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
char *data;
|
|
|
|
int slot;
|
|
|
|
int metadata = 0;
|
|
|
|
u64 check_start;
|
|
|
|
u64 check_len;
|
|
|
|
|
|
|
|
ret = btrfs_previous_extent_item(extent_root, path, 0);
|
|
|
|
if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
eb = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
btrfs_item_key_to_cpu(eb, &key, slot);
|
|
|
|
extent_start = key.objectid;
|
|
|
|
ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
|
|
|
|
|
|
|
|
/* tree block scrub */
|
|
|
|
if (key.type == BTRFS_METADATA_ITEM_KEY ||
|
|
|
|
btrfs_extent_flags(eb, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
|
|
|
|
extent_len = extent_root->fs_info->nodesize;
|
|
|
|
metadata = 1;
|
|
|
|
} else {
|
|
|
|
extent_len = key.offset;
|
|
|
|
metadata = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Current extent is out of our range, loop comes to end */
|
|
|
|
if (extent_start + extent_len <= stripe->logical)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (metadata) {
|
|
|
|
/*
|
|
|
|
* Check crossing stripe first, which can't be scrubbed
|
|
|
|
*/
|
|
|
|
if (check_crossing_stripes(fs_info, extent_start,
|
|
|
|
extent_root->fs_info->nodesize)) {
|
|
|
|
error("tree block at %llu is crossing stripe boundary, unable to scrub",
|
|
|
|
extent_start);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
data = stripe->data + extent_start - stripe->logical;
|
|
|
|
ret = check_tree_mirror(fs_info, scrub_ctx,
|
|
|
|
data, extent_start, 0);
|
|
|
|
/* Any csum/verify error means the stripe is screwed */
|
|
|
|
if (ret < 0) {
|
|
|
|
stripe->csum_mismatch = 1;
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* Restrict the extent range to fit stripe range */
|
|
|
|
check_start = max(extent_start, stripe->logical);
|
|
|
|
check_len = min(extent_start + extent_len, stripe->logical +
|
|
|
|
stripe_len) - check_start;
|
|
|
|
|
|
|
|
/* Record original csum_discards to detect missing csum case */
|
|
|
|
orig_csum_discards = scrub_ctx->csum_discards;
|
|
|
|
|
|
|
|
data = stripe->data + check_start - stripe->logical;
|
|
|
|
ret = check_data_mirror(fs_info, scrub_ctx, data, check_start,
|
|
|
|
check_len, 0, NULL);
|
|
|
|
/* Csum mismatch, no need to continue anyway*/
|
|
|
|
if (ret < 0) {
|
|
|
|
stripe->csum_mismatch = 1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/* Check if there is any missing csum for data */
|
|
|
|
if (scrub_ctx->csum_discards != orig_csum_discards)
|
|
|
|
stripe->csum_missing = 1;
|
|
|
|
/*
|
|
|
|
* Only increase data_extents_scrubbed if we are scrubbing the
|
|
|
|
* tailing part of the data extent
|
|
|
|
*/
|
|
|
|
if (extent_start + extent_len <= stripe->logical + stripe_len)
|
|
|
|
scrub_ctx->data_extents_scrubbed++;
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-12-26 06:29:34 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify parities for RAID56
|
|
|
|
* Caller must fill @fstripe before calling this function
|
|
|
|
*
|
|
|
|
* Return 0 for parities matches.
|
|
|
|
* Return >0 for P or Q mismatch
|
|
|
|
* Return <0 for fatal error
|
|
|
|
*/
|
|
|
|
static int verify_parities(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
struct scrub_full_stripe *fstripe)
|
|
|
|
{
|
|
|
|
void **ptrs;
|
|
|
|
void *ondisk_p = NULL;
|
|
|
|
void *ondisk_q = NULL;
|
|
|
|
void *buf_p;
|
|
|
|
void *buf_q;
|
|
|
|
int nr_stripes = fstripe->nr_stripes;
|
|
|
|
int stripe_len = BTRFS_STRIPE_LEN;
|
|
|
|
int i;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
|
|
|
|
buf_p = malloc(fstripe->stripe_len);
|
|
|
|
buf_q = malloc(fstripe->stripe_len);
|
|
|
|
if (!ptrs || !buf_p || !buf_q) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < fstripe->nr_stripes; i++) {
|
|
|
|
struct scrub_stripe *stripe = &fstripe->stripes[i];
|
|
|
|
|
|
|
|
if (stripe->logical == BTRFS_RAID5_P_STRIPE) {
|
|
|
|
ondisk_p = stripe->data;
|
|
|
|
ptrs[i] = buf_p;
|
|
|
|
continue;
|
|
|
|
} else if (stripe->logical == BTRFS_RAID6_Q_STRIPE) {
|
|
|
|
ondisk_q = stripe->data;
|
|
|
|
ptrs[i] = buf_q;
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
ptrs[i] = stripe->data;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* RAID6 */
|
|
|
|
if (ondisk_q) {
|
|
|
|
raid6_gen_syndrome(nr_stripes, stripe_len, ptrs);
|
|
|
|
|
|
|
|
if (memcmp(ondisk_q, ptrs[nr_stripes - 1], stripe_len) != 0 ||
|
|
|
|
memcmp(ondisk_p, ptrs[nr_stripes - 2], stripe_len))
|
|
|
|
ret = 1;
|
|
|
|
} else {
|
|
|
|
ret = raid5_gen_result(nr_stripes, stripe_len, nr_stripes - 1,
|
|
|
|
ptrs);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (memcmp(ondisk_p, ptrs[nr_stripes - 1], stripe_len) != 0)
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
free(buf_p);
|
|
|
|
free(buf_q);
|
|
|
|
free(ptrs);
|
|
|
|
return ret;
|
|
|
|
}
|
2016-12-26 06:29:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to recover data stripe from P or Q stripe
|
|
|
|
*
|
|
|
|
* Return >0 if it can't be require any more.
|
|
|
|
* Return 0 for successful repair or no need to repair at all
|
|
|
|
* Return <0 for fatal error
|
|
|
|
*/
|
|
|
|
static int recover_from_parities(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
struct scrub_full_stripe *fstripe)
|
|
|
|
{
|
|
|
|
void **ptrs;
|
|
|
|
int nr_stripes = fstripe->nr_stripes;
|
|
|
|
int stripe_len = BTRFS_STRIPE_LEN;
|
|
|
|
int max_tolerance;
|
|
|
|
int i;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* No need to recover */
|
|
|
|
if (!fstripe->nr_corrupted_stripes)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Already recovered once, no more chance */
|
|
|
|
if (fstripe->recovered)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID5)
|
|
|
|
max_tolerance = 1;
|
|
|
|
else
|
|
|
|
max_tolerance = 2;
|
|
|
|
|
|
|
|
/* Out of repair */
|
|
|
|
if (fstripe->nr_corrupted_stripes > max_tolerance)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
|
|
|
|
if (!ptrs)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* Construct ptrs */
|
|
|
|
for (i = 0; i < nr_stripes; i++)
|
|
|
|
ptrs[i] = fstripe->stripes[i].data;
|
|
|
|
|
|
|
|
ret = raid56_recov(nr_stripes, stripe_len, fstripe->bg_type,
|
|
|
|
fstripe->corrupted_index[0],
|
|
|
|
fstripe->corrupted_index[1], ptrs);
|
|
|
|
fstripe->recovered = 1;
|
|
|
|
free(ptrs);
|
|
|
|
return ret;
|
|
|
|
}
|
2017-05-24 08:43:00 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper to write a full stripe to disk
|
|
|
|
* P/Q will be re-calculated.
|
|
|
|
*/
|
|
|
|
static int write_full_stripe(struct scrub_full_stripe *fstripe)
|
|
|
|
{
|
|
|
|
void **ptrs;
|
|
|
|
int nr_stripes = fstripe->nr_stripes;
|
|
|
|
int stripe_len = BTRFS_STRIPE_LEN;
|
|
|
|
int i;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
|
|
|
|
if (!ptrs)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
for (i = 0; i < fstripe->nr_stripes; i++)
|
|
|
|
ptrs[i] = fstripe->stripes[i].data;
|
|
|
|
|
|
|
|
if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID6) {
|
|
|
|
raid6_gen_syndrome(nr_stripes, stripe_len, ptrs);
|
|
|
|
} else {
|
|
|
|
ret = raid5_gen_result(nr_stripes, stripe_len, nr_stripes - 1,
|
|
|
|
ptrs);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < fstripe->nr_stripes; i++) {
|
|
|
|
struct scrub_stripe *stripe = &fstripe->stripes[i];
|
|
|
|
|
|
|
|
ret = pwrite(stripe->fd, stripe->data, fstripe->stripe_len,
|
|
|
|
stripe->physical);
|
|
|
|
if (ret != fstripe->stripe_len) {
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
free(ptrs);
|
|
|
|
return ret;
|
btrfs-progs: scrub: Introduce a function to scrub one full stripe
Introduce a new function, scrub_one_full_stripe(), to check a full
stripe.
It handles the full stripe scrub in the following steps:
0) Check if we need to check full stripe
If full stripe contains no extent, why waste our CPU and IO?
1) Read out full stripe
Then we know how many devices are missing or have read error.
If out of repair, then exit
If have missing device or have read error, try recover here.
2) Check data stripe against csum
We add data stripe with csum error as corrupted stripe, just like
dev missing or read error.
Then recheck if csum mismatch is still below tolerance.
Finally we check the full stripe using 2 factors only:
A) If the full stripe go through recover ever
B) If the full stripe has csum error
Combine factor A and B we get:
1) A && B: Recovered, csum mismatch
Screwed up totally
2) A && !B: Recovered, csum match
Recoverable, data corrupted but P/Q is good to recover
3) !A && B: Not recovered, csum mismatch
Try to recover corrupted data stripes
If recovered csum match, then recoverable
Else, screwed up
4) !A && !B: Not recovered, no csum mismatch
Best case, just check if P/Q matches.
If P/Q matches, everything is good
Else, just P/Q is screwed up, still recoverable.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2016-12-26 06:29:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return 0 if we still have chance to recover
|
|
|
|
* Return <0 if we have no more chance
|
|
|
|
*/
|
|
|
|
static int report_recoverablity(struct scrub_full_stripe *fstripe)
|
|
|
|
{
|
|
|
|
int max_tolerance;
|
|
|
|
u64 start = fstripe->logical_start;
|
|
|
|
|
|
|
|
if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID5)
|
|
|
|
max_tolerance = 1;
|
|
|
|
else
|
|
|
|
max_tolerance = 2;
|
|
|
|
|
|
|
|
if (fstripe->nr_corrupted_stripes > max_tolerance) {
|
|
|
|
error(
|
|
|
|
"full stripe %llu CORRUPTED: too many read error or corrupted devices",
|
|
|
|
start);
|
|
|
|
error(
|
|
|
|
"full stripe %llu: tolerance: %d, missing: %d, read error: %d, csum error: %d",
|
|
|
|
start, max_tolerance, fstripe->err_read_stripes,
|
|
|
|
fstripe->err_missing_devs, fstripe->err_csum_dstripes);
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void clear_corrupted_stripe_record(struct scrub_full_stripe *fstripe)
|
|
|
|
{
|
|
|
|
fstripe->corrupted_index[0] = -1;
|
|
|
|
fstripe->corrupted_index[1] = -1;
|
|
|
|
fstripe->nr_corrupted_stripes = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void record_corrupted_stripe(struct scrub_full_stripe *fstripe,
|
|
|
|
int index)
|
|
|
|
{
|
|
|
|
int i = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < 2; i++) {
|
|
|
|
if (fstripe->corrupted_index[i] == -1) {
|
|
|
|
fstripe->corrupted_index[i] = index;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
fstripe->nr_corrupted_stripes++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Scrub one full stripe.
|
|
|
|
*
|
|
|
|
* If everything matches, that's good.
|
|
|
|
* If data stripe corrupted badly, no mean to recovery, it will report it.
|
|
|
|
* If data stripe corrupted, try recovery first and recheck csum, to
|
|
|
|
* determine if it's recoverable or screwed up.
|
|
|
|
*/
|
|
|
|
static int scrub_one_full_stripe(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
u64 start, u64 *next_ret, int write)
|
|
|
|
{
|
|
|
|
struct scrub_full_stripe *fstripe;
|
|
|
|
struct btrfs_map_block *map_block = NULL;
|
|
|
|
u32 stripe_len = BTRFS_STRIPE_LEN;
|
|
|
|
u64 bg_type;
|
|
|
|
u64 len;
|
|
|
|
int i;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!next_ret) {
|
|
|
|
error("invalid argument for %s", __func__);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = __btrfs_map_block_v2(fs_info, WRITE, start, stripe_len,
|
|
|
|
&map_block);
|
|
|
|
if (ret < 0) {
|
|
|
|
/* Let caller to skip the whole block group */
|
|
|
|
*next_ret = (u64)-1;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
start = map_block->start;
|
|
|
|
len = map_block->length;
|
|
|
|
*next_ret = start + len;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Step 0: Check if we need to scrub the full stripe
|
|
|
|
*
|
|
|
|
* If no extent lies in the full stripe, not need to check
|
|
|
|
*/
|
|
|
|
ret = btrfs_check_extent_exists(fs_info, start, len);
|
|
|
|
if (ret < 0) {
|
|
|
|
free(map_block);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
/* No extents in range, no need to check */
|
|
|
|
if (ret == 0) {
|
|
|
|
free(map_block);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bg_type = map_block->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
|
|
|
|
if (bg_type != BTRFS_BLOCK_GROUP_RAID5 &&
|
|
|
|
bg_type != BTRFS_BLOCK_GROUP_RAID6) {
|
|
|
|
free(map_block);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
fstripe = alloc_full_stripe(map_block->num_stripes,
|
|
|
|
map_block->stripe_len);
|
|
|
|
if (!fstripe)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
fstripe->logical_start = map_block->start;
|
|
|
|
fstripe->nr_stripes = map_block->num_stripes;
|
|
|
|
fstripe->stripe_len = stripe_len;
|
|
|
|
fstripe->bg_type = bg_type;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Step 1: Read out the whole full stripe
|
|
|
|
*
|
|
|
|
* Then we have the chance to exit early if too many devices are
|
|
|
|
* missing.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < map_block->num_stripes; i++) {
|
|
|
|
struct scrub_stripe *s_stripe = &fstripe->stripes[i];
|
|
|
|
struct btrfs_map_stripe *m_stripe = &map_block->stripes[i];
|
|
|
|
|
|
|
|
s_stripe->logical = m_stripe->logical;
|
|
|
|
s_stripe->fd = m_stripe->dev->fd;
|
|
|
|
s_stripe->physical = m_stripe->physical;
|
|
|
|
|
|
|
|
if (m_stripe->dev->fd == -1) {
|
|
|
|
s_stripe->dev_missing = 1;
|
|
|
|
record_corrupted_stripe(fstripe, i);
|
|
|
|
fstripe->err_missing_devs++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = pread(m_stripe->dev->fd, s_stripe->data, stripe_len,
|
|
|
|
m_stripe->physical);
|
|
|
|
if (ret < stripe_len) {
|
|
|
|
record_corrupted_stripe(fstripe, i);
|
|
|
|
fstripe->err_read_stripes++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = report_recoverablity(fstripe);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = recover_from_parities(fs_info, scrub_ctx, fstripe);
|
|
|
|
if (ret < 0) {
|
|
|
|
error("full stripe %llu CORRUPTED: failed to recover: %s\n",
|
|
|
|
fstripe->logical_start, strerror(-ret));
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear corrupted stripes report, since they are recovered,
|
|
|
|
* and later checker need to record csum mismatch stripes reusing
|
|
|
|
* these members
|
|
|
|
*/
|
|
|
|
clear_corrupted_stripe_record(fstripe);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Step 2: Check each data stripes against csum
|
|
|
|
*/
|
|
|
|
for (i = 0; i < map_block->num_stripes; i++) {
|
|
|
|
struct scrub_stripe *stripe = &fstripe->stripes[i];
|
|
|
|
|
|
|
|
if (!is_data_stripe(stripe))
|
|
|
|
continue;
|
|
|
|
ret = scrub_one_data_stripe(fs_info, scrub_ctx, stripe,
|
|
|
|
stripe_len);
|
|
|
|
if (ret < 0) {
|
|
|
|
fstripe->err_csum_dstripes++;
|
|
|
|
record_corrupted_stripe(fstripe, i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = report_recoverablity(fstripe);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Recovered before, but no csum error
|
|
|
|
*/
|
|
|
|
if (fstripe->err_csum_dstripes == 0 && fstripe->recovered) {
|
|
|
|
error(
|
|
|
|
"full stripe %llu RECOVERABLE: P/Q is good for recovery",
|
|
|
|
start);
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* No csum error, not recovered before.
|
|
|
|
*
|
|
|
|
* Only need to check if P/Q matches.
|
|
|
|
*/
|
|
|
|
if (fstripe->err_csum_dstripes == 0 && !fstripe->recovered) {
|
|
|
|
ret = verify_parities(fs_info, scrub_ctx, fstripe);
|
|
|
|
if (ret < 0) {
|
|
|
|
error(
|
|
|
|
"full stripe %llu CORRUPTED: failed to check P/Q: %s",
|
|
|
|
start, strerror(-ret));
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (ret > 0) {
|
|
|
|
if (write) {
|
|
|
|
ret = write_full_stripe(fstripe);
|
|
|
|
if (ret < 0)
|
|
|
|
error("failed to write full stripe %llu: %s",
|
|
|
|
start, strerror(-ret));
|
|
|
|
else
|
|
|
|
printf("full stripe %llu REPARIED: only P/Q mismatches, repaired\n",
|
|
|
|
start);
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
printf("full stripe %llu RECOVERABLE: only P/Q is corrupted\n",
|
|
|
|
start);
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
goto out;
|
|
|
|
}
|
2017-05-24 08:43:00 +00:00
|
|
|
|
btrfs-progs: scrub: Introduce a function to scrub one full stripe
Introduce a new function, scrub_one_full_stripe(), to check a full
stripe.
It handles the full stripe scrub in the following steps:
0) Check if we need to check full stripe
If full stripe contains no extent, why waste our CPU and IO?
1) Read out full stripe
Then we know how many devices are missing or have read error.
If out of repair, then exit
If have missing device or have read error, try recover here.
2) Check data stripe against csum
We add data stripe with csum error as corrupted stripe, just like
dev missing or read error.
Then recheck if csum mismatch is still below tolerance.
Finally we check the full stripe using 2 factors only:
A) If the full stripe go through recover ever
B) If the full stripe has csum error
Combine factor A and B we get:
1) A && B: Recovered, csum mismatch
Screwed up totally
2) A && !B: Recovered, csum match
Recoverable, data corrupted but P/Q is good to recover
3) !A && B: Not recovered, csum mismatch
Try to recover corrupted data stripes
If recovered csum match, then recoverable
Else, screwed up
4) !A && !B: Not recovered, no csum mismatch
Best case, just check if P/Q matches.
If P/Q matches, everything is good
Else, just P/Q is screwed up, still recoverable.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2016-12-26 06:29:37 +00:00
|
|
|
/*
|
|
|
|
* Still csum error after recovery
|
|
|
|
*
|
|
|
|
* No mean to fix further, screwed up already.
|
|
|
|
*/
|
|
|
|
if (fstripe->err_csum_dstripes && fstripe->recovered) {
|
|
|
|
error(
|
|
|
|
"full stripe %llu CORRUPTED: csum still mismatch after recovery",
|
|
|
|
start);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Csum mismatch, but we still has chance to recover. */
|
|
|
|
ret = recover_from_parities(fs_info, scrub_ctx, fstripe);
|
|
|
|
if (ret < 0) {
|
|
|
|
error(
|
|
|
|
"full stripe %llu CORRUPTED: failed to recover: %s\n",
|
|
|
|
fstripe->logical_start, strerror(-ret));
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* After recovery, recheck data stripe csum */
|
|
|
|
for (i = 0; i < 2; i++) {
|
|
|
|
int index = fstripe->corrupted_index[i];
|
|
|
|
struct scrub_stripe *stripe;
|
|
|
|
|
|
|
|
if (i == -1)
|
|
|
|
continue;
|
|
|
|
stripe = &fstripe->stripes[index];
|
|
|
|
ret = scrub_one_data_stripe(fs_info, scrub_ctx, stripe,
|
|
|
|
stripe_len);
|
|
|
|
if (ret < 0) {
|
|
|
|
error(
|
|
|
|
"full stripe %llu CORRUPTED: csum still mismatch after recovery",
|
|
|
|
start);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (write) {
|
|
|
|
ret = write_full_stripe(fstripe);
|
|
|
|
if (ret < 0)
|
|
|
|
error("failed to write full stripe %llu: %s",
|
|
|
|
start, strerror(-ret));
|
|
|
|
else
|
|
|
|
printf("full stripe %llu REPARIED: corrupted data with good P/Q, repaired\n",
|
|
|
|
start);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
printf(
|
|
|
|
"full stripe %llu RECOVERABLE: Data stripes corrupted, but P/Q is good\n",
|
|
|
|
start);
|
|
|
|
|
|
|
|
out:
|
|
|
|
free_full_stripe(fstripe);
|
|
|
|
free(map_block);
|
|
|
|
return ret;
|
2017-05-24 08:43:00 +00:00
|
|
|
}
|
2016-12-26 06:29:38 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Scrub one block group.
|
|
|
|
*
|
|
|
|
* This function will handle all profiles current btrfs supports.
|
|
|
|
* Return 0 for scrubbing the block group. Found error will be recorded into
|
|
|
|
* scrub_ctx.
|
|
|
|
* Return <0 for fatal error preventing scrubing the block group.
|
|
|
|
*/
|
|
|
|
static int scrub_one_block_group(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_scrub_progress *scrub_ctx,
|
|
|
|
struct btrfs_block_group_cache *bg_cache,
|
|
|
|
int write)
|
|
|
|
{
|
|
|
|
struct btrfs_root *extent_root = fs_info->extent_root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
u64 bg_start = bg_cache->key.objectid;
|
|
|
|
u64 bg_len = bg_cache->key.offset;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (bg_cache->flags &
|
|
|
|
(BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
|
|
|
|
u64 cur = bg_start;
|
|
|
|
u64 next;
|
|
|
|
|
|
|
|
while (cur < bg_start + bg_len) {
|
|
|
|
ret = scrub_one_full_stripe(fs_info, scrub_ctx, cur,
|
|
|
|
&next, write);
|
|
|
|
/* Ignore any non-fatal error */
|
|
|
|
if (ret < 0 && ret != -EIO) {
|
|
|
|
error("fatal error happens checking one full stripe at bytenr: %llu: %s",
|
|
|
|
cur, strerror(-ret));
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
cur = next;
|
|
|
|
}
|
|
|
|
/* Ignore any -EIO error, such error will be reported at last */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
/* None parity based profile, check extent by extent */
|
|
|
|
key.objectid = bg_start;
|
|
|
|
key.type = 0;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
while (1) {
|
|
|
|
struct extent_buffer *eb = path->nodes[0];
|
|
|
|
int slot = path->slots[0];
|
|
|
|
u64 extent_start;
|
|
|
|
u64 extent_len;
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(eb, &key, slot);
|
|
|
|
if (key.objectid >= bg_start + bg_len)
|
|
|
|
break;
|
|
|
|
if (key.type != BTRFS_EXTENT_ITEM_KEY &&
|
|
|
|
key.type != BTRFS_METADATA_ITEM_KEY)
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
extent_start = key.objectid;
|
|
|
|
if (key.type == BTRFS_METADATA_ITEM_KEY)
|
|
|
|
extent_len = extent_root->fs_info->nodesize;
|
|
|
|
else
|
|
|
|
extent_len = key.offset;
|
|
|
|
|
|
|
|
ret = scrub_one_extent(fs_info, scrub_ctx, path, extent_start,
|
|
|
|
extent_len, write);
|
|
|
|
if (ret < 0 && ret != -EIO) {
|
|
|
|
error("fatal error checking extent bytenr %llu len %llu: %s",
|
|
|
|
extent_start, extent_len, strerror(-ret));
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
next:
|
|
|
|
ret = btrfs_next_extent_item(extent_root, path, bg_start +
|
|
|
|
bg_len);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
btrfs-progs: scrub: Introduce offline scrub function
Now, btrfs-progs has a kernel scrub equivalent.
A new option, --offline is added to "btrfs scrub start".
If --offline is given, btrfs scrub will just act like kernel scrub, to
check every copy of extent and do a report on corrupted data and if it's
recoverable.
The advantage compare to kernel scrub is:
1) No race
Unlike kernel scrub, which is done in parallel, offline scrub is done
by a single thread.
Although it may be slower than kernel one, it's safer and no false
alert.
2) Correctness
Kernel has a known bug (fix submitted) which will recovery RAID5/6
data but screw up P/Q, due to the hardness coding in kernel.
While in btrfs-progs, no page, (almost) no memory size limit, we're
can focus on the scrub, and make things easier.
New offline scrub can detect and report P/Q corruption with
recoverability report, while kernel will only report data stripe error.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Su <suy.fnst@cn.fujitsu.com>
Signed-off-by: Gu Jinxiang <gujx@cn.fujitsu.com>
2017-03-28 13:31:48 +00:00
|
|
|
|
|
|
|
int btrfs_scrub(struct btrfs_fs_info *fs_info, struct task_context *task,
|
|
|
|
int write)
|
|
|
|
{
|
|
|
|
u64 bg_nr = 0;
|
|
|
|
struct btrfs_block_group_cache *bg_cache;
|
|
|
|
struct btrfs_scrub_progress scrub_ctx = {0};
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
ASSERT(fs_info);
|
|
|
|
|
|
|
|
bg_cache = btrfs_lookup_first_block_group(fs_info, 0);
|
|
|
|
if (!bg_cache) {
|
|
|
|
error("no block group is found");
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
++bg_nr;
|
|
|
|
|
|
|
|
if (task) {
|
|
|
|
/* get block group numbers for progress */
|
|
|
|
while (1) {
|
|
|
|
u64 bg_offset = bg_cache->key.objectid +
|
|
|
|
bg_cache->key.offset;
|
|
|
|
bg_cache = btrfs_lookup_first_block_group(fs_info,
|
|
|
|
bg_offset);
|
|
|
|
if (!bg_cache)
|
|
|
|
break;
|
|
|
|
++bg_nr;
|
|
|
|
}
|
|
|
|
task->all = bg_nr;
|
|
|
|
task->cur = 1;
|
|
|
|
task_start(task->info);
|
|
|
|
|
|
|
|
bg_cache = btrfs_lookup_first_block_group(fs_info, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
ret = scrub_one_block_group(fs_info, &scrub_ctx, bg_cache,
|
|
|
|
write);
|
|
|
|
if (ret < 0 && ret != -EIO)
|
|
|
|
break;
|
|
|
|
if (task)
|
|
|
|
task->cur++;
|
|
|
|
|
|
|
|
bg_cache = btrfs_lookup_first_block_group(fs_info,
|
|
|
|
bg_cache->key.objectid + bg_cache->key.offset);
|
|
|
|
if (!bg_cache)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (task)
|
|
|
|
task_stop(task->info);
|
|
|
|
|
|
|
|
printf("Scrub result:\n");
|
|
|
|
printf("Tree bytes scrubbed: %llu\n", scrub_ctx.tree_bytes_scrubbed);
|
|
|
|
printf("Tree extents scrubbed: %llu\n", scrub_ctx.tree_extents_scrubbed);
|
|
|
|
printf("Data bytes scrubbed: %llu\n", scrub_ctx.data_bytes_scrubbed);
|
|
|
|
printf("Data extents scrubbed: %llu\n", scrub_ctx.data_extents_scrubbed);
|
|
|
|
printf("Data bytes without csum: %llu\n", scrub_ctx.csum_discards *
|
|
|
|
fs_info->sectorsize);
|
|
|
|
printf("Read error: %llu\n", scrub_ctx.read_errors);
|
|
|
|
printf("Verify error: %llu\n", scrub_ctx.verify_errors);
|
|
|
|
printf("Csum error: %llu\n", scrub_ctx.csum_errors);
|
|
|
|
if (scrub_ctx.csum_errors || scrub_ctx.read_errors ||
|
|
|
|
scrub_ctx.uncorrectable_errors || scrub_ctx.verify_errors)
|
|
|
|
ret = 1;
|
|
|
|
else
|
|
|
|
ret = 0;
|
|
|
|
return ret;
|
|
|
|
}
|