btrfs-progs: properly handle degraded raid56 reads
[BUG] For a degraded RAID5, btrfs check will fail to even read the chunk root: # mkfs.btrfs -f -m raid5 -d raid5 $dev1 $dev2 $dev3 # wipefs -fa $dev1 # btrfs check $dev2 Opening filesystem to check... warning, device 1 is missing bad tree block 22036480, bytenr mismatch, want=22036480, have=0 ERROR: cannot read chunk root ERROR: cannot open file system [CAUSE] Although read_tree_block() function from btrfs-progs is properly iterating the mirrors (mirror 1 is reading from the disk directly, mirror 2 will be rebuild from parity), the raid56 recovery path is not handling the read error correctly. The existing code will try to read the full stripe, but any read failure (including missing device) will immediately cause an error: for (i = 0; i < num_stripes; i++) { ret = btrfs_pread(multi->stripes[i].dev->fd, pointers[i], BTRFS_STRIPE_LEN, multi->stripes[i].physical, fs_info->zoned); if (ret < BTRFS_STRIPE_LEN) { ret = -EIO; goto out; } } [FIX] To make failed_a/failed_b calculation much easier, and properly handle too many missing devices, here this patch will introduce a new bitmap based solution. The new @failed_stripe_bitmap will represent all the failed stripes. So the initial read will mark all the missing devices in the @failed_stripe_bitmap, and later operations will all operate on that bitmap. Only before we call raid56_recov(), we convert the bitmap to the old failed_a/failed_b interface and continue. Now btrfs check can handle above case properly: # btrfs check $dev2 Opening filesystem to check... warning, device 1 is missing Checking filesystem on /dev/test/scratch2 UUID: 8b2e1cb4-f35b-4856-9b11-262d39d8458b [1/7] checking root items [2/7] checking extents [3/7] checking free space tree [4/7] checking fs roots [5/7] checking only csums items (without verifying data) [6/7] checking root refs [7/7] checking quota groups skipped (not enabled on this FS) found 147456 bytes used, no error found total csum bytes: 0 total tree bytes: 147456 total fs tree bytes: 32768 total extent tree bytes: 16384 btree space waste bytes: 139871 file data blocks allocated: 0 referenced 0 Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
parent
dd7c458cb3
commit
2aa4085bf7
|
@ -0,0 +1,45 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
|
||||
/*
|
||||
* A user-space bitmap wrapper to provide a subset of kernel bitmap operations.
|
||||
*
|
||||
* Most functions are not a direct copy of the kernel version, but should be
|
||||
* good enough for single thread usage.
|
||||
*/
|
||||
|
||||
#ifndef _BTRFS_PROGS_LINUX_BITMAP_H_
|
||||
#define _BTRFS_PROGS_LINUX_BITMAP_H_
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "kerncompat.h"
|
||||
#include "kernel-lib/bitops.h"
|
||||
|
||||
static inline unsigned long *bitmap_zalloc(unsigned int nbits)
|
||||
{
|
||||
return calloc(BITS_TO_LONGS(nbits), BITS_PER_LONG);
|
||||
}
|
||||
|
||||
static inline void bitmap_free(unsigned long *bitmap)
|
||||
{
|
||||
free(bitmap);
|
||||
}
|
||||
|
||||
#define BITMAP_LAST_WORK_MASK(nbits) (~0ULL >> (-(nbits) & (BITS_PER_LONG - 1)))
|
||||
|
||||
static inline unsigned int bitmap_weight(const unsigned long *bitmap, unsigned int nbits)
|
||||
{
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
/* Handle the aligned part first. */
|
||||
for (i = 0; i < nbits / BITS_PER_LONG; i++)
|
||||
ret += hweight_long(bitmap[i]);
|
||||
|
||||
/* The remaining unaligned part. */
|
||||
if (nbits % BITS_PER_LONG)
|
||||
ret += bitmap[i] & BITMAP_LAST_WORD_MASK(nbits);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -27,6 +27,7 @@
|
|||
#include "kernel-shared/extent_io.h"
|
||||
#include "kernel-lib/list.h"
|
||||
#include "kernel-lib/raid56.h"
|
||||
#include "kernel-lib/bitmap.h"
|
||||
#include "kernel-shared/ctree.h"
|
||||
#include "kernel-shared/volumes.h"
|
||||
#include "kernel-shared/disk-io.h"
|
||||
|
@ -791,9 +792,11 @@ static int read_raid56(struct btrfs_fs_info *fs_info, void *buf, u64 logical,
|
|||
u64 len, int mirror, struct btrfs_multi_bio *multi,
|
||||
u64 *raid_map)
|
||||
{
|
||||
const int tolerance = (multi->type & BTRFS_RAID_RAID6 ? 2 : 1);
|
||||
const int num_stripes = multi->num_stripes;
|
||||
const u64 full_stripe_start = raid_map[0];
|
||||
void **pointers = NULL;
|
||||
unsigned long *failed_stripe_bitmap = NULL;
|
||||
int failed_a = -1;
|
||||
int failed_b = -1;
|
||||
int i;
|
||||
|
@ -820,6 +823,12 @@ static int read_raid56(struct btrfs_fs_info *fs_info, void *buf, u64 logical,
|
|||
}
|
||||
}
|
||||
|
||||
failed_stripe_bitmap = bitmap_zalloc(num_stripes);
|
||||
if (!failed_stripe_bitmap) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Read the full stripe.
|
||||
*
|
||||
|
@ -830,10 +839,8 @@ static int read_raid56(struct btrfs_fs_info *fs_info, void *buf, u64 logical,
|
|||
ret = btrfs_pread(multi->stripes[i].dev->fd, pointers[i],
|
||||
BTRFS_STRIPE_LEN, multi->stripes[i].physical,
|
||||
fs_info->zoned);
|
||||
if (ret < BTRFS_STRIPE_LEN) {
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
if (ret < BTRFS_STRIPE_LEN)
|
||||
set_bit(i, failed_stripe_bitmap);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -842,29 +849,30 @@ static int read_raid56(struct btrfs_fs_info *fs_info, void *buf, u64 logical,
|
|||
* Since we're reading using mirror_num > 1 already, it means the data
|
||||
* stripe where @logical lies in is definitely corrupted.
|
||||
*/
|
||||
failed_a = (logical - full_stripe_start) / BTRFS_STRIPE_LEN;
|
||||
set_bit((logical - full_stripe_start) / BTRFS_STRIPE_LEN, failed_stripe_bitmap);
|
||||
|
||||
/*
|
||||
* For RAID6, we don't have good way to exhaust all the combinations,
|
||||
* so here we can only go through the map to see if we have missing devices.
|
||||
*
|
||||
* If we only have one failed stripe (marked by above set_bit()), then
|
||||
* we have no better idea, fallback to use P corruption.
|
||||
*/
|
||||
if (multi->type & BTRFS_BLOCK_GROUP_RAID6) {
|
||||
for (i = 0; i < num_stripes; i++) {
|
||||
/* Skip failed_a, as it's already marked failed */
|
||||
if (i == failed_a)
|
||||
continue;
|
||||
/* Missing dev */
|
||||
if (multi->stripes[i].dev->fd == -1) {
|
||||
failed_b = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* No missing device, we have no better idea, default to P
|
||||
* corruption
|
||||
*/
|
||||
if (failed_b < 0)
|
||||
failed_b = num_stripes - 2;
|
||||
if (multi->type & BTRFS_BLOCK_GROUP_RAID6 &&
|
||||
bitmap_weight(failed_stripe_bitmap, num_stripes) < 2)
|
||||
set_bit(num_stripes - 2, failed_stripe_bitmap);
|
||||
|
||||
/* Damaged beyond repair already. */
|
||||
if (bitmap_weight(failed_stripe_bitmap, num_stripes) > tolerance) {
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for_each_set_bit(i, failed_stripe_bitmap, num_stripes) {
|
||||
if (failed_a < 0)
|
||||
failed_a = i;
|
||||
else if (failed_b < 0)
|
||||
failed_b = i;
|
||||
}
|
||||
|
||||
/* Rebuild the full stripe */
|
||||
|
@ -877,6 +885,7 @@ static int read_raid56(struct btrfs_fs_info *fs_info, void *buf, u64 logical,
|
|||
BTRFS_STRIPE_LEN, len);
|
||||
ret = 0;
|
||||
out:
|
||||
free(failed_stripe_bitmap);
|
||||
for (i = 0; i < num_stripes; i++)
|
||||
free(pointers[i]);
|
||||
free(pointers);
|
||||
|
|
Loading…
Reference in New Issue