/* btrfs-progs: kernel-shared/extent_io.c */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include "kerncompat.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>
#include "kernel-lib/list.h"
#include "kernel-lib/raid56.h"
#include "kernel-lib/bitmap.h"
#include "kernel-shared/accessors.h"
#include "kernel-shared/extent-io-tree.h"
#include "kernel-shared/extent_io.h"
#include "kernel-shared/ctree.h"
#include "kernel-shared/volumes.h"
#include "kernel-shared/disk-io.h"
#include "kernel-shared/messages.h"
#include "kernel-shared/uapi/btrfs.h"
#include "kernel-shared/uapi/btrfs_tree.h"
#include "common/messages.h"
#include "common/utils.h"
#include "common/device-utils.h"
#include "common/internal.h"
static void free_extent_buffer_final(struct extent_buffer *eb);
/*
 * Set up the extent buffer cache: start with an empty LRU and cap the
 * cache at a quarter of total system memory.
 */
void extent_buffer_init_cache(struct btrfs_fs_info *fs_info)
{
	INIT_LIST_HEAD(&fs_info->lru);
	fs_info->cache_size = 0;
	fs_info->max_cache_size = total_memory() / 4;
}
/*
 * Tear down the extent buffer cache: release every buffer still on the
 * LRU, report any that leaked (nonzero refcount) to stderr, then free the
 * cache tree itself.
 */
void extent_buffer_free_cache(struct btrfs_fs_info *fs_info)
{
	while (!list_empty(&fs_info->lru)) {
		struct extent_buffer *eb;

		eb = list_entry(fs_info->lru.next, struct extent_buffer, lru);
		if (!eb->refs) {
			free_extent_buffer_final(eb);
			continue;
		}
		fprintf(stderr,
			"extent buffer leak: start %llu len %u\n",
			(unsigned long long)eb->start, eb->len);
		/*
		 * Reset extent buffer refs to 1, so the
		 * free_extent_buffer_nocache() can free it for sure.
		 */
		eb->refs = 1;
		free_extent_buffer_nocache(eb);
	}

	free_extent_cache_tree(&fs_info->extent_cache);
	fs_info->cache_size = 0;
}
/*
 * extent_buffer_bitmap_set - set an area of a bitmap
 * @eb: the extent buffer
 * @start: offset of the bitmap item in the extent buffer
 * @pos: bit number of the first bit
 * @len: number of bits to set
 */
void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
			      unsigned long pos, unsigned long len)
{
	const unsigned int last = pos + len;
	int nbits = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
	u8 mask = BITMAP_FIRST_BYTE_MASK(pos);
	u8 *byte = (u8 *)eb->data + start + BIT_BYTE(pos);

	/* First (possibly partial) byte, then all whole bytes */
	for (; len >= nbits; byte++) {
		*byte |= mask;
		len -= nbits;
		nbits = BITS_PER_BYTE;
		mask = ~0;
	}
	/* Trailing partial byte, if any */
	if (len)
		*byte |= mask & BITMAP_LAST_BYTE_MASK(last);
}
/*
 * extent_buffer_bitmap_clear - clear an area of a bitmap
 * @eb: the extent buffer
 * @start: offset of the bitmap item in the extent buffer
 * @pos: bit number of the first bit
 * @len: number of bits to clear
 */
void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
				unsigned long pos, unsigned long len)
{
	const unsigned int last = pos + len;
	int nbits = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
	u8 mask = BITMAP_FIRST_BYTE_MASK(pos);
	u8 *byte = (u8 *)eb->data + start + BIT_BYTE(pos);

	/* First (possibly partial) byte, then all whole bytes */
	for (; len >= nbits; byte++) {
		*byte &= ~mask;
		len -= nbits;
		nbits = BITS_PER_BYTE;
		mask = ~0;
	}
	/* Trailing partial byte, if any */
	if (len)
		*byte &= ~(mask & BITMAP_LAST_BYTE_MASK(last));
}
/*
 * Allocate an extent buffer for [@bytenr, @bytenr + @blocksize) with a
 * single reference held.  The data area is zeroed (matching kernel
 * behavior, and keeping tools like valgrind quiet).  Returns NULL on
 * allocation failure.
 */
static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *info,
						   u64 bytenr, u32 blocksize)
{
	struct extent_buffer *eb;

	/* Data area is allocated inline, right after the eb header */
	eb = calloc(1, sizeof(struct extent_buffer) + blocksize);
	if (!eb)
		return NULL;

	eb->fs_info = info;
	eb->start = bytenr;
	eb->len = blocksize;
	eb->refs = 1;
	eb->flags = 0;
	eb->cache_node.start = bytenr;
	eb->cache_node.size = blocksize;
	INIT_LIST_HEAD(&eb->recow);
	INIT_LIST_HEAD(&eb->lru);
	memset_extent_buffer(eb, 0, 0, blocksize);

	return eb;
}
/*
 * Duplicate @src into a standalone extent buffer.
 *
 * The copy is marked EXTENT_BUFFER_DUMMY so it never enters the eb cache.
 * Returns NULL on allocation failure.
 */
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
{
	struct extent_buffer *clone;

	clone = __alloc_extent_buffer(src->fs_info, src->start, src->len);
	if (!clone)
		return NULL;

	copy_extent_buffer_full(clone, src);
	clone->flags |= EXTENT_BUFFER_DUMMY;
	return clone;
}
static void free_extent_buffer_final(struct extent_buffer *eb)
{
BUG_ON(eb->refs);
list_del_init(&eb->lru);
if (!(eb->flags & EXTENT_BUFFER_DUMMY)) {
remove_cache_extent(&eb->fs_info->extent_cache, &eb->cache_node);
BUG_ON(eb->fs_info->cache_size < eb->len);
eb->fs_info->cache_size -= eb->len;
}
kfree(eb);
}
static void free_extent_buffer_internal(struct extent_buffer *eb, bool free_now)
{
if (!eb || IS_ERR(eb))
return;
eb->refs--;
BUG_ON(eb->refs < 0);
if (eb->refs == 0) {
if (eb->flags & EXTENT_BUFFER_DIRTY) {
btrfs-progs: only warn if there are leaked extent buffers after transaction abort Another BUG_ON() during fuzz/003: ====== RUN MAYFAIL btrfs check --init-csum-tree tests/fuzz-tests/images/bko-161821.raw.restored [1/7] checking root items Fixed 0 roots. [2/7] checking extents parent transid verify failed on 4198400 wanted 14 found 1114126 parent transid verify failed on 4198400 wanted 14 found 1114126 Ignoring transid failure owner ref check failed [4198400 4096] repair deleting extent record: key [4198400,169,0] adding new tree backref on start 4198400 len 4096 parent 0 root 5 Repaired extent references for 4198400 ref mismatch on [4222976 4096] extent item 1, found 0 backref 4222976 root 7 not referenced back 0x5617f8ecf780 incorrect global backref count on 4222976 found 1 wanted 0 backpointer mismatch on [4222976 4096] owner ref check failed [4222976 4096] repair deleting extent record: key [4222976,169,0] Repaired extent references for 4222976 [3/7] checking free space cache [4/7] checking fs roots parent transid verify failed on 4198400 wanted 14 found 1114126 Ignoring transid failure Wrong generation of child node/leaf, wanted: 1114126, have: 14 root 5 missing its root dir, recreating parent transid verify failed on 4198400 wanted 14 found 1114126 Ignoring transid failure ERROR: child eb corrupted: parent bytenr=4222976 item=0 parent level=1 child level=2 ERROR: errors found in fs roots extent buffer leak: start 4222976 len 4096 extent_io.c:611: free_extent_buffer_internal: BUG_ON `eb->flags & EXTENT_DIRTY` triggered, value 1 failed (ignored, ret=134): btrfs check --init-csum-tree tests/fuzz-tests/images/bko-161821.raw.restored mayfail: returned code 134 (SIGABRT), not ignored test failed for case 003-multi-check-unmounted Since we're shifting to use btrfs_abort_transaction() in btrfs-progs, it will be more and more common to see dirty leaked eb. Instead of BUG_ON(), we only need to report it as a warning. 
Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-03 05:50:19 +00:00
warning(
"dirty eb leak (aborted trans): start %llu len %u",
eb->start, eb->len);
}
list_del_init(&eb->recow);
if (eb->flags & EXTENT_BUFFER_DUMMY || free_now)
free_extent_buffer_final(eb);
}
}
/* Drop a reference; the last ref leaves the eb cached with refs == 0. */
void free_extent_buffer(struct extent_buffer *eb)
{
	free_extent_buffer_internal(eb, false);
}
/* Drop a reference and free the eb immediately once it hits zero. */
void free_extent_buffer_nocache(struct extent_buffer *eb)
{
	free_extent_buffer_internal(eb, true);
}
/* Same as free_extent_buffer_nocache(): caller knows the eb is stale. */
void free_extent_buffer_stale(struct extent_buffer *eb)
{
	free_extent_buffer_internal(eb, true);
}
/*
 * Look up a cached extent buffer covering exactly
 * [@bytenr, @bytenr + nodesize).
 *
 * On a hit the eb is moved to the LRU tail and an extra reference is
 * taken; returns NULL when no exact match is cached.
 */
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
					 u64 bytenr)
{
	struct extent_buffer *eb;
	struct cache_extent *ce;

	ce = lookup_cache_extent(&fs_info->extent_cache, bytenr,
				 fs_info->nodesize);
	if (!ce || ce->start != bytenr || ce->size != fs_info->nodesize)
		return NULL;

	eb = container_of(ce, struct extent_buffer, cache_node);
	list_move_tail(&eb->lru, &fs_info->lru);
	eb->refs++;
	return eb;
}
/*
 * Find the first cached extent buffer at or after @start.
 *
 * On a hit the eb is moved to the LRU tail and an extra reference is
 * taken; returns NULL when nothing is cached past @start.
 */
struct extent_buffer *find_first_extent_buffer(struct btrfs_fs_info *fs_info,
					       u64 start)
{
	struct extent_buffer *eb;
	struct cache_extent *ce;

	ce = search_cache_extent(&fs_info->extent_cache, start);
	if (!ce)
		return NULL;

	eb = container_of(ce, struct extent_buffer, cache_node);
	list_move_tail(&eb->lru, &fs_info->lru);
	eb->refs++;
	return eb;
}
/*
 * Shrink the eb cache: walk the LRU from the least recently used end,
 * freeing unreferenced buffers until usage falls to 90% of the limit.
 */
static void trim_extent_buffer_cache(struct btrfs_fs_info *fs_info)
{
	struct extent_buffer *eb;
	struct extent_buffer *next;

	list_for_each_entry_safe(eb, next, &fs_info->lru, lru) {
		if (!eb->refs)
			free_extent_buffer_final(eb);
		/* Stop once we are back under 90% of the cap */
		if (fs_info->cache_size <= ((fs_info->max_cache_size * 9) / 10))
			break;
	}
}
/*
 * Return an extent buffer for [@bytenr, @bytenr + @blocksize).
 *
 * A cache hit with matching start and size takes an extra reference and
 * bumps the eb to the LRU tail.  Otherwise a new zeroed eb is allocated
 * and inserted into the cache; a cached eb that merely overlaps the range
 * has our cached reference dropped first.  Returns NULL on allocation or
 * cache-insertion failure.
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 bytenr, u32 blocksize)
{
	struct extent_buffer *eb;
	struct cache_extent *cache;

	cache = lookup_cache_extent(&fs_info->extent_cache, bytenr, blocksize);
	if (cache && cache->start == bytenr &&
	    cache->size == blocksize) {
		/* Exact match: reuse the cached eb */
		eb = container_of(cache, struct extent_buffer, cache_node);
		list_move_tail(&eb->lru, &fs_info->lru);
		eb->refs++;
	} else {
		int ret;

		if (cache) {
			/*
			 * Overlapping but mismatched eb: drop the cache's
			 * reference so it can be reclaimed.
			 */
			eb = container_of(cache, struct extent_buffer,
					  cache_node);
			free_extent_buffer(eb);
		}
		eb = __alloc_extent_buffer(fs_info, bytenr, blocksize);
		if (!eb)
			return NULL;
		ret = insert_cache_extent(&fs_info->extent_cache, &eb->cache_node);
		if (ret) {
			/* Not in the cache tree yet, plain kfree is enough */
			kfree(eb);
			return NULL;
		}
		list_add_tail(&eb->lru, &fs_info->lru);
		fs_info->cache_size += blocksize;
		/* Opportunistically trim when over the memory cap */
		if (fs_info->cache_size >= fs_info->max_cache_size)
			trim_extent_buffer_cache(fs_info);
	}
	return eb;
}
btrfs-progs: disk-io: Verify the bytenr passed in is mapped for read_tree_block() [BUG] For a fuzzed image, `btrfs check` will segfault at open_ctree() stage: $ btrfs check --mode=lowmem issue_207.raw Opening filesystem to check... extent_io.c:665: free_extent_buffer_internal: BUG_ON `eb->refs < 0` triggered, value 1 btrfs(+0x6bf67)[0x56431d278f67] btrfs(+0x6c16e)[0x56431d27916e] btrfs(alloc_extent_buffer+0x45)[0x56431d279db5] btrfs(read_tree_block+0x59)[0x56431d2848f9] btrfs(btrfs_setup_all_roots+0x29c)[0x56431d28535c] btrfs(+0x78903)[0x56431d285903] btrfs(open_ctree_fs_info+0x90)[0x56431d285b60] btrfs(+0x45a01)[0x56431d252a01] btrfs(main+0x94)[0x56431d2220c4] /usr/lib/libc.so.6(__libc_start_main+0xf3)[0x7f6e28519153] btrfs(_start+0x2e)[0x56431d22235e] [CAUSE] The fuzzed image has a strange log root bytenr: log_root 61440 log_root_transid 0 In fact, the log_root seems to be fuzzed, as its transid is 0, which is invalid. Note that range [61440, 77824) covers the physical offset of the primary super block. The bug is caused by the following sequence: 1. cache for tree block [64K, 68K) is created by open_ctree() __open_ctree_fd() |- btrfs_setup_chunk_tree_and_device_map() |- btrfs_read_sys_array() |- sb = btrfs_find_create_tree_block() |- free_extent_buffer(sb) This created an extent buffer [64K, 68K) in fs_info->extent_cache, then reduce the refcount of that eb back to 0, but not freed yet. 2. Try to read that corrupted log root __open_ctree_fd() |- btrfs_setup_chunk_tree_and_device_map() |- btrfs_setup_all_roots() |- find_and_setup_log_root() |- read_tree_block() |- btrfs_find_create_tree_block() |- alloc_extent_buffer() The final alloc_extent_buffer() will try to free that cached eb [64K, 68K), since it doesn't match with current search. And since that cached eb is already released (refcount == 0), the extra free_extent_buffer() will cause above BUG_ON(). 
[FIX] Here we fix it through a more comprehensive method, instead of simply verifying log_root_transid, here we just don't pollute eb cache when reading sys chunk array. So that we won't have an eb cache [64K, 68K), and will error out at logical mapping phase. Issue: #207 Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-12-18 01:19:39 +00:00
/*
* Allocate a dummy extent buffer which won't be inserted into extent buffer
* cache.
*
* This mostly allows super block read write using existing eb infrastructure
* without pulluting the eb cache.
*
* This is especially important to avoid injecting eb->start == SZ_64K, as
* fuzzed image could have invalid tree bytenr covers super block range,
* and cause ref count underflow.
*/
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 bytenr, u32 blocksize)
{
struct extent_buffer *ret;
ret = __alloc_extent_buffer(fs_info, bytenr, blocksize);
if (!ret)
return NULL;
ret->flags |= EXTENT_BUFFER_DUMMY;
return ret;
}
/*
 * Rebuild the data at @logical from the other stripes of a RAID56 full
 * stripe, copying @len bytes into @buf.
 *
 * Called for read-repair only (@mirror > 1): the data stripe covering
 * @logical is assumed corrupted and is rebuilt from the remaining data
 * stripes plus P (and Q for RAID6).  Stripes that fail to read (including
 * missing devices) are tracked in a bitmap; recovery is possible while the
 * number of failed stripes is within the redundancy (1 for RAID5, 2 for
 * RAID6).
 *
 * Returns 0 on success, -EIO when too many stripes are lost, -ENOMEM on
 * allocation failure.
 */
static int read_raid56(struct btrfs_fs_info *fs_info, void *buf, u64 logical,
		       u64 len, int mirror, struct btrfs_multi_bio *multi,
		       u64 *raid_map)
{
	/*
	 * RAID6 tolerates two lost stripes, RAID5 one.  Note @multi->type is
	 * a block group flags field, so it must be tested against
	 * BTRFS_BLOCK_GROUP_RAID6 (a bit flag), not the enum btrfs_raid_types
	 * index BTRFS_RAID_RAID6 (matches the check further below).
	 */
	const int tolerance = (multi->type & BTRFS_BLOCK_GROUP_RAID6 ? 2 : 1);
	const int num_stripes = multi->num_stripes;
	const u64 full_stripe_start = raid_map[0];
	void **pointers = NULL;
	unsigned long *failed_stripe_bitmap = NULL;
	int failed_a = -1;
	int failed_b = -1;
	int i;
	int ret;

	/* Only read repair should go this path */
	ASSERT(mirror > 1);
	ASSERT(raid_map);
	/* The read length should be inside one stripe */
	ASSERT(len <= BTRFS_STRIPE_LEN);

	pointers = calloc(num_stripes, sizeof(void *));
	if (!pointers)
		return -ENOMEM;
	/* Allocate memory for the full stripe */
	for (i = 0; i < num_stripes; i++) {
		pointers[i] = kmalloc(BTRFS_STRIPE_LEN, GFP_KERNEL);
		if (!pointers[i]) {
			ret = -ENOMEM;
			goto out;
		}
	}

	failed_stripe_bitmap = bitmap_zalloc(num_stripes);
	if (!failed_stripe_bitmap) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Read the full stripe.
	 *
	 * The stripes in @multi are not rotated, thus can be used to read
	 * from disk directly.  A short read (including one from a missing
	 * device) only marks that stripe failed instead of erroring out,
	 * so degraded reads can still be repaired below.
	 */
	for (i = 0; i < num_stripes; i++) {
		ret = btrfs_pread(multi->stripes[i].dev->fd, pointers[i],
				  BTRFS_STRIPE_LEN, multi->stripes[i].physical,
				  fs_info->zoned);
		if (ret < BTRFS_STRIPE_LEN)
			set_bit(i, failed_stripe_bitmap);
	}

	/*
	 * Get the failed index.
	 *
	 * Since we're reading using mirror_num > 1 already, it means the data
	 * stripe where @logical lies in is definitely corrupted.
	 */
	set_bit((logical - full_stripe_start) / BTRFS_STRIPE_LEN,
		failed_stripe_bitmap);

	/*
	 * For RAID6, we don't have a good way to exhaust all the combinations,
	 * so here we can only go through the map to see if we have missing
	 * devices.
	 *
	 * If we only have one failed stripe (marked by above set_bit()), then
	 * we have no better idea, fallback to use P corruption.
	 */
	if (multi->type & BTRFS_BLOCK_GROUP_RAID6 &&
	    bitmap_weight(failed_stripe_bitmap, num_stripes) < 2)
		set_bit(num_stripes - 2, failed_stripe_bitmap);

	/* Damaged beyond repair already. */
	if (bitmap_weight(failed_stripe_bitmap, num_stripes) > tolerance) {
		ret = -EIO;
		goto out;
	}

	/* Convert the bitmap to the failed_a/failed_b recovery interface */
	for_each_set_bit(i, failed_stripe_bitmap, num_stripes) {
		if (failed_a < 0)
			failed_a = i;
		else if (failed_b < 0)
			failed_b = i;
	}

	/* Rebuild the full stripe */
	ret = raid56_recov(num_stripes, BTRFS_STRIPE_LEN, multi->type,
			   failed_a, failed_b, pointers);
	ASSERT(ret == 0);

	/* Now copy the data back to the original buf */
	memcpy(buf, pointers[failed_a] + (logical - full_stripe_start) %
	       BTRFS_STRIPE_LEN, len);
	ret = 0;
out:
	kfree(failed_stripe_bitmap);
	for (i = 0; i < num_stripes; i++)
		kfree(pointers[i]);
	kfree(pointers);
	return ret;
}
/*
 * Read data at logical bytenr @logical into @buf.
 *
 * @info:    filesystem info used for block mapping
 * @buf:     destination buffer, must hold at least *@len bytes
 * @logical: logical bytenr to read from
 * @len:     in: bytes wanted; out: bytes actually read (may be clamped to
 *           the length of the mapped extent)
 * @mirror:  copy number to read; for RAID56 profiles a mirror > 1 means
 *           rebuilding the data from P/Q parity
 *
 * Returns 0 on success, a negative errno value on failure.
 */
int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 logical,
			u64 *len, int mirror)
{
	struct btrfs_multi_bio *multi = NULL;
	u64 *raid_map = NULL;
	u64 cur_len = *len;
	struct btrfs_device *dev;
	int ret;

	ret = btrfs_map_block(info, READ, logical, &cur_len, &multi, mirror,
			      &raid_map);
	if (ret) {
		fprintf(stderr, "Couldn't map the block %llu\n", logical);
		return -EIO;
	}
	/* Never hand back more than the caller asked for. */
	cur_len = min(*len, cur_len);

	/* Degraded RAID56 read: rebuild the data from the parity stripes. */
	if (mirror > 1 && multi->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = read_raid56(info, buf, logical, cur_len, mirror, multi,
				  raid_map);
		kfree(multi);
		kfree(raid_map);
		*len = cur_len;
		return ret;
	}
	kfree(raid_map);

	/* Plain read: a single stripe on one device is enough. */
	dev = multi->stripes[0].dev;
	if (dev->fd <= 0) {
		kfree(multi);
		return -EIO;
	}

	ret = btrfs_pread(dev->fd, buf, cur_len, multi->stripes[0].physical,
			  info->zoned);
	kfree(multi);
	if (ret < 0) {
		fprintf(stderr, "Error reading %llu, %d\n", logical,
			ret);
		return ret;
	}
	if (ret != cur_len) {
		fprintf(stderr,
			"Short read for %llu, read %d, read_len %llu\n",
			logical, ret, cur_len);
		return -EIO;
	}

	*len = cur_len;
	return 0;
}
/*
 * Write the data in @buf to logical bytenr @offset.
 *
 * @info:   filesystem info used for block mapping
 * @buf:    source buffer holding @bytes bytes
 * @offset: logical bytenr to write to
 * @bytes:  number of bytes to write
 *
 * The data is written to all mirrors, and for RAID56 chunks the P/Q parity
 * stripes are recomputed and written as well (via
 * write_raid56_with_parity()).
 *
 * Returns 0 on success, a negative errno value on failure.
 */
int write_data_to_disk(struct btrfs_fs_info *info, const void *buf, u64 offset,
		       u64 bytes)
{
	struct btrfs_multi_bio *multi = NULL;
	struct btrfs_device *device;
	u64 bytes_left = bytes;
	u64 this_len;
	u64 total_write = 0;
	u64 *raid_map = NULL;
	u64 dev_bytenr;
	int dev_nr;
	int ret = 0;

	while (bytes_left > 0) {
		this_len = bytes_left;
		dev_nr = 0;

		ret = btrfs_map_block(info, WRITE, offset, &this_len, &multi,
				      0, &raid_map);
		if (ret) {
			fprintf(stderr, "Couldn't map the block %llu\n",
				offset);
			return -EIO;
		}

		if (raid_map) {
			/*
			 * RAID56 chunk: stage the data in a temporary extent
			 * buffer and let the parity helper write all stripes
			 * including P/Q.
			 */
			struct extent_buffer *eb;
			u64 stripe_len = this_len;

			this_len = min(this_len, bytes_left);
			this_len = min(this_len, (u64)info->nodesize);

			eb = kmalloc(sizeof(struct extent_buffer) + this_len, GFP_KERNEL);
			if (!eb) {
				error_msg(ERROR_MSG_MEMORY, "extent buffer");
				ret = -ENOMEM;
				goto out;
			}

			memset(eb, 0, sizeof(struct extent_buffer) + this_len);
			eb->start = offset;
			eb->len = this_len;

			memcpy(eb->data, buf + total_write, this_len);
			ret = write_raid56_with_parity(info, eb, multi,
						       stripe_len, raid_map);
			BUG_ON(ret < 0);

			kfree(eb);
			kfree(raid_map);
			raid_map = NULL;
		} else while (dev_nr < multi->num_stripes) {
			/* Non-RAID56: write the same data to every stripe. */
			device = multi->stripes[dev_nr].dev;
			if (device->fd <= 0) {
				kfree(multi);
				return -EIO;
			}
			dev_bytenr = multi->stripes[dev_nr].physical;
			this_len = min(this_len, bytes_left);
			dev_nr++;
			device->total_ios++;

			ret = btrfs_pwrite(device->fd, buf + total_write,
					   this_len, dev_bytenr, info->zoned);
			if (ret != this_len) {
				if (ret < 0) {
					/*
					 * NOTE(review): the message prints
					 * errno where a device id might be
					 * expected -- kept as-is for message
					 * compatibility.
					 */
					fprintf(stderr, "Error writing to "
						"device %d\n", errno);
					ret = -errno;
					kfree(multi);
					return ret;
				} else {
					fprintf(stderr, "Short write\n");
					kfree(multi);
					return -EIO;
				}
			}
		}

		BUG_ON(bytes_left < this_len);

		bytes_left -= this_len;
		offset += this_len;
		total_write += this_len;

		kfree(multi);
		multi = NULL;
	}
	return 0;

out:
	kfree(raid_map);
	/* Fix: @multi was leaked on the -ENOMEM path above. */
	kfree(multi);
	return ret;
}
/*
 * Mark @eb dirty: tag its byte range in the fs-wide dirty_buffers tree and
 * take an extra reference so the buffer survives until it is written back.
 * A buffer that is already dirty is left untouched.  Always returns 0.
 */
int set_extent_buffer_dirty(struct extent_buffer *eb)
{
	struct extent_io_tree *tree = &eb->fs_info->dirty_buffers;

	if (eb->flags & EXTENT_BUFFER_DIRTY)
		return 0;

	eb->flags |= EXTENT_BUFFER_DIRTY;
	set_extent_dirty(tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
	extent_buffer_get(eb);
	return 0;
}
/*
 * Clear the dirty state of @eb: remove its byte range from the fs-wide
 * dirty_buffers tree and drop the reference taken by
 * set_extent_buffer_dirty().  A clean buffer is left untouched.
 * @trans is unused here.  Always returns 0.
 */
int btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
			     struct extent_buffer *eb)
{
	struct extent_io_tree *tree = &eb->fs_info->dirty_buffers;

	if (!(eb->flags & EXTENT_BUFFER_DIRTY))
		return 0;

	eb->flags &= ~EXTENT_BUFFER_DIRTY;
	clear_extent_dirty(tree, eb->start, eb->start + eb->len - 1, NULL);
	free_extent_buffer(eb);
	return 0;
}
/*
 * Compare @len bytes at offset @start inside @eb against @ptrv.
 * Returns the memcmp() result (<0, 0, >0).
 */
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
			 unsigned long start, unsigned long len)
{
	const void *kaddr = eb->data + start;

	return memcmp(kaddr, ptrv, len);
}
/*
 * Copy @len bytes from offset @start inside @eb into @dst.
 * The caller must ensure @dst has room for @len bytes.
 */
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
			unsigned long start, unsigned long len)
{
	const void *kaddr = eb->data + start;

	memcpy(dst, kaddr, len);
}
/* Write the filesystem UUID from @src into @eb's header fsid field. */
void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src)
{
	unsigned long offset = btrfs_header_fsid();

	write_extent_buffer(eb, src, offset, BTRFS_FSID_SIZE);
}
/* Write the chunk tree UUID from @src into @eb's header. */
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
					 const void *src)
{
	unsigned long offset = btrfs_header_chunk_tree_uuid(eb);

	write_extent_buffer(eb, src, offset, BTRFS_FSID_SIZE);
}
/*
 * Copy @len bytes from @src into @eb at offset @start.
 * The const on @eb covers only the structure, not the backing data, hence
 * the cast.
 */
void write_extent_buffer(const struct extent_buffer *eb, const void *src,
			 unsigned long start, unsigned long len)
{
	void *kaddr = (void *)eb->data + start;

	memcpy(kaddr, src, len);
}
/* Copy the entire content of @src into @dst (both offsets 0). */
void copy_extent_buffer_full(const struct extent_buffer *dst,
			     const struct extent_buffer *src)
{
	copy_extent_buffer(dst, src, 0, 0, src->len);
}
/*
 * Copy @len bytes from @src at @src_offset into @dst at @dst_offset.
 * @dst and @src are distinct buffers, so plain memcpy() is safe.
 */
void copy_extent_buffer(const struct extent_buffer *dst,
			const struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len)
{
	void *to = (void *)dst->data + dst_offset;
	const void *from = src->data + src_offset;

	memcpy(to, from, len);
}
/*
 * Copy @len bytes within @dst, from @src_offset to @dst_offset.
 * Uses memcpy(), so the two ranges must not overlap; use
 * memmove_extent_buffer() when they may.
 */
void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset,
			  unsigned long src_offset, unsigned long len)
{
	void *to = (void *)dst->data + dst_offset;
	const void *from = dst->data + src_offset;

	memcpy(to, from, len);
}
/*
 * Move @len bytes within @dst, from @src_offset to @dst_offset.
 * Uses memmove(), so the ranges may overlap.
 */
void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len)
{
	void *to = (void *)dst->data + dst_offset;
	const void *from = dst->data + src_offset;

	memmove(to, from, len);
}
/* Fill @len bytes of @eb at offset @start with the byte value @c. */
void memset_extent_buffer(const struct extent_buffer *eb, char c,
			  unsigned long start, unsigned long len)
{
	void *kaddr = (void *)eb->data + start;

	memset(kaddr, c, len);
}
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
return le_test_bit(nr, (u8 *)eb->data + start);
}
/*
* btrfs_readahead_node_child - readahead a node's child block
* @node: parent node we're reading from
* @slot: slot in the parent node for the child we want to read
*
* A helper for readahead_tree_block, we simply read the bytenr pointed at the
* slot in the node provided.
*/
void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
{
readahead_tree_block(node->fs_info, btrfs_node_blockptr(node, slot),
btrfs_node_ptr_generation(node, slot));
}