btrfs-progs/kernel-shared/extent_io.c

1031 lines
24 KiB
C
Raw Normal View History

/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdbool.h>
#include "kerncompat.h"
#include "kernel-shared/extent_io.h"
#include "kernel-lib/list.h"
#include "kernel-shared/ctree.h"
#include "kernel-shared/volumes.h"
#include "common/utils.h"
#include "common/internal.h"
void extent_io_tree_init(struct extent_io_tree *tree)
{
cache_tree_init(&tree->state);
cache_tree_init(&tree->cache);
INIT_LIST_HEAD(&tree->lru);
tree->cache_size = 0;
tree->max_cache_size = (u64)total_memory() / 4;
}
/*
 * Initialise @tree exactly like extent_io_tree_init(), then replace the
 * default cache limit with the caller-supplied @max_cache_size.
 */
void extent_io_tree_init_cache_max(struct extent_io_tree *tree,
				   u64 max_cache_size)
{
	extent_io_tree_init(tree);

	/* Override the limit that extent_io_tree_init() just picked. */
	tree->max_cache_size = max_cache_size;
}
/*
 * Allocate an extent_state holding a single reference and no bits.
 *
 * Only refs, state, xprivate and cache_node.objectid are initialised here;
 * the range fields are filled in later by whoever inserts the state.
 * Returns NULL when malloc() fails.
 */
static struct extent_state *alloc_extent_state(void)
{
	struct extent_state *es = malloc(sizeof(struct extent_state));

	if (es) {
		es->xprivate = 0;
		es->state = 0;
		es->refs = 1;
		es->cache_node.objectid = 0;
	}
	return es;
}
/*
 * Drop one reference on @state and release its memory once the last
 * reference is gone.  A negative count trips BUG_ON().
 */
static void btrfs_free_extent_state(struct extent_state *state)
{
	state->refs -= 1;
	BUG_ON(state->refs < 0);
	if (state->refs != 0)
		return;
	free(state);
}
/*
 * Callback for cache_tree_free_extents(): map the cache node back to its
 * enclosing extent_state and drop a reference on it.
 */
static void free_extent_state_func(struct cache_extent *cache)
{
	btrfs_free_extent_state(container_of(cache, struct extent_state,
					     cache_node));
}
static void free_extent_buffer_final(struct extent_buffer *eb);
void extent_io_tree_cleanup(struct extent_io_tree *tree)
{
struct extent_buffer *eb;
while(!list_empty(&tree->lru)) {
eb = list_entry(tree->lru.next, struct extent_buffer, lru);
if (eb->refs) {
fprintf(stderr,
"extent buffer leak: start %llu len %u\n",
(unsigned long long)eb->start, eb->len);
free_extent_buffer_nocache(eb);
} else {
free_extent_buffer_final(eb);
}
}
cache_tree_free_extents(&tree->state, free_extent_state_func);
}
static inline void update_extent_state(struct extent_state *state)
{
state->cache_node.start = state->start;
state->cache_node.size = state->end + 1 - state->start;
}
/*
 * Try to fold @state into its direct neighbours in the tree.
 *
 * A neighbour is merged when it is byte-adjacent and carries exactly the
 * same bits.  States with any EXTENT_IOBITS set are never merged.
 *
 * When the previous neighbour matches, @state grows backwards and the
 * neighbour is removed and released.  When the next neighbour matches, the
 * neighbour grows backwards and @state itself is removed and released, so
 * callers must not touch @state after this returns.
 *
 * Always returns 0.
 */
static int merge_state(struct extent_io_tree *tree,
		       struct extent_state *state)
{
	struct cache_extent *neighbour;
	struct extent_state *adj;

	if (state->state & EXTENT_IOBITS)
		return 0;

	/* Left side: absorb the predecessor into @state. */
	neighbour = prev_cache_extent(&state->cache_node);
	if (neighbour) {
		adj = container_of(neighbour, struct extent_state, cache_node);
		if (adj->state == state->state &&
		    adj->end == state->start - 1) {
			state->start = adj->start;
			update_extent_state(state);
			remove_cache_extent(&tree->state, &adj->cache_node);
			btrfs_free_extent_state(adj);
		}
	}

	/* Right side: let the successor absorb @state. */
	neighbour = next_cache_extent(&state->cache_node);
	if (neighbour) {
		adj = container_of(neighbour, struct extent_state, cache_node);
		if (adj->state == state->state &&
		    adj->start == state->end + 1) {
			adj->start = state->start;
			update_extent_state(adj);
			remove_cache_extent(&tree->state, &state->cache_node);
			btrfs_free_extent_state(state);
		}
	}

	return 0;
}
/*
 * Insert @state into @tree covering the inclusive range [start, end].
 *
 * @bits are OR-ed into the state before insertion.  A range with
 * end < start or a failed tree insertion trips BUG_ON().  After insertion
 * the state is offered to merge_state(), so it may no longer exist when
 * this returns.
 *
 * Always returns 0.
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int bits)
{
	int ret;

	BUG_ON(end < start);

	state->end = end;
	state->start = start;
	state->state |= bits;
	update_extent_state(state);

	ret = insert_cache_extent(&tree->state, &state->cache_node);
	BUG_ON(ret);

	merge_state(tree, state);
	return 0;
}
/*
 * Split @orig in two at offset @split.
 *
 * After the call the tree holds:
 *   prealloc: [orig->start, split - 1]   (newly inserted, same bits as orig)
 *   orig:     [split, orig->end]         (shrunk in place)
 *
 * A failed insertion of @prealloc trips BUG_ON().  Always returns 0.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	int ret;

	/* Front piece: inherits the bits and the range below @split. */
	prealloc->state = orig->state;
	prealloc->end = split - 1;
	prealloc->start = orig->start;
	update_extent_state(prealloc);

	/* Back piece: @orig now begins at @split. */
	orig->start = split;
	update_extent_state(orig);

	ret = insert_cache_extent(&tree->state, &prealloc->cache_node);
	BUG_ON(ret);
	return 0;
}
/*
 * Clear @bits on a single state.
 *
 * A state left with no bits at all is removed from the tree and released;
 * otherwise it is offered to merge_state().  Either way the state may be
 * gone afterwards.
 *
 * Returns the subset of @bits that was set before clearing.
 */
static int clear_state_bit(struct extent_io_tree *tree,
			   struct extent_state *state, int bits)
{
	const int had = state->state & bits;

	state->state &= ~bits;
	if (state->state != 0) {
		merge_state(tree, state);
		return had;
	}

	remove_cache_extent(&tree->state, &state->cache_node);
	btrfs_free_extent_state(state);
	return had;
}
/*
* extent_buffer_bitmap_set - set an area of a bitmap
* @eb: the extent buffer
* @start: offset of the bitmap item in the extent buffer
* @pos: bit number of the first bit
* @len: number of bits to set
*/
void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len)
{
u8 *p = (u8 *)eb->data + start + BIT_BYTE(pos);
const unsigned int size = pos + len;
int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
while (len >= bits_to_set) {
*p |= mask_to_set;
len -= bits_to_set;
bits_to_set = BITS_PER_BYTE;
mask_to_set = ~0;
p++;
}
if (len) {
mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
*p |= mask_to_set;
}
}
/*
* extent_buffer_bitmap_clear - clear an area of a bitmap
* @eb: the extent buffer
* @start: offset of the bitmap item in the extent buffer
* @pos: bit number of the first bit
* @len: number of bits to clear
*/
void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len)
{
u8 *p = (u8 *)eb->data + start + BIT_BYTE(pos);
const unsigned int size = pos + len;
int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
while (len >= bits_to_clear) {
*p &= ~mask_to_clear;
len -= bits_to_clear;
bits_to_clear = BITS_PER_BYTE;
mask_to_clear = ~0;
p++;
}
if (len) {
mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
*p &= ~mask_to_clear;
}
}
/*
 * Clear 'bits' on the inclusive range [start, end] of the tree.
 *
 * States that straddle a range boundary are split first so that only the
 * part inside the range loses the bits.  The function loops (via the
 * 'again' / 'search_again' labels), advancing 'start' past each state it
 * has handled, until the range is exhausted or no further state is found.
 *
 * Returns the OR of all requested bits that were actually set on the
 * states it cleared (0 if none were), or -ENOMEM when an extent_state
 * cannot be allocated for a possible split.
 */
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct cache_extent *node;
u64 last_end;
int err;
int set = 0;
again:
/* Keep one spare state at hand: any pass may need it for a split. */
if (!prealloc) {
prealloc = alloc_extent_state();
if (!prealloc)
return -ENOMEM;
}
/*
 * this search will find the extents that end after
 * our range starts
 */
node = search_cache_extent(&tree->state, start);
if (!node)
goto out;
state = container_of(node, struct extent_state, cache_node);
if (state->start > end)
goto out;
/* Remember the end now: 'state' may be freed by clear_state_bit(). */
last_end = state->end;
/*
 * | ---- desired range ---- |
 * | state | or
 * | ------------- state -------------- |
 *
 * We need to split the extent we found, and may flip
 * bits on second half.
 *
 * If the extent we found extends past our range, we
 * just split and search again. It'll get split again
 * the next time though.
 *
 * If the extent we found is inside our range, we clear
 * the desired bit on it.
 */
if (state->start < start) {
/* After the split 'prealloc' is the part before 'start'; 'state' begins at 'start'. */
err = split_state(tree, state, prealloc, start);
BUG_ON(err == -EEXIST);
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
set |= clear_state_bit(tree, state, bits);
/* The state reached the end of the u64 space: nothing can follow it. */
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
} else {
start = state->start;
}
goto search_again;
}
/*
 * | ---- desired range ---- |
 * | state |
 * We need to split the extent, and clear the bit
 * on the first half
 */
if (state->start <= end && state->end > end) {
/* After the split 'prealloc' covers the part up to 'end'; clear the bits there. */
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
set |= clear_state_bit(tree, prealloc, bits);
prealloc = NULL;
goto out;
}
/* The state lies entirely inside the range: clear it and move on. */
start = state->end + 1;
set |= clear_state_bit(tree, state, bits);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
goto search_again;
out:
/* Release the spare state if no split consumed it. */
if (prealloc)
btrfs_free_extent_state(prealloc);
return set;
search_again:
if (start > end)
goto out;
goto again;
}
/*
 * Set 'bits' on the inclusive range [start, end] of the tree.
 *
 * States that straddle a range boundary are split so that only the part
 * inside the range gains the bits; holes inside the range are filled with
 * newly inserted states.  States that end up adjacent with identical bits
 * are merged by merge_state().  The function loops (via the 'again' /
 * 'search_again' labels), advancing 'start' until the range is covered.
 *
 * Returns 0 on success, or -ENOMEM when an extent_state cannot be
 * allocated.
 */
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct cache_extent *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

again:
	/* Keep one spare state at hand: any pass may insert or split. */
	if (!prealloc) {
		prealloc = alloc_extent_state();
		if (!prealloc)
			return -ENOMEM;
	}

	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = search_cache_extent(&tree->state, start);
	if (!node) {
		/* Nothing at or after 'start': one new state covers the rest. */
		err = insert_state(tree, prealloc, start, end, bits);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		goto out;
	}

	state = container_of(node, struct extent_state, cache_node);
	/* Remember the range now: merge_state() may free 'state'. */
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		state->state |= bits;
		merge_state(tree, state);
		/* The state reached the end of the u64 space: we are done. */
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		/*
		 * After the split 'prealloc' is the part before 'start' and
		 * 'state' begins at 'start'.
		 */
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state->state |= bits;
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		} else {
			start = state->start;
		}
		goto search_again;
	}

	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;
		err = insert_state(tree, prealloc, start, this_end,
				   bits);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		start = this_end + 1;
		goto search_again;
	}

	/*
	 * | ---- desired range ---- |
	 * | ---------- state ---------- |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	err = split_state(tree, state, prealloc, end + 1);
	BUG_ON(err == -EEXIST);

	/*
	 * split_state() made 'prealloc' the first half [start, end] and left
	 * 'state' covering [end + 1, ...], which lies outside the requested
	 * range.  The bits therefore belong on 'prealloc', matching what
	 * clear_extent_bits() does in the same situation.
	 */
	prealloc->state |= bits;
	merge_state(tree, prealloc);
	prealloc = NULL;

out:
	/* Release the spare state if nothing consumed it. */
	if (prealloc)
		btrfs_free_extent_state(prealloc);
	return err;

search_again:
	if (start > end)
		goto out;
	goto again;
}
/* Mark the inclusive range [start, end] with EXTENT_DIRTY. */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end)
{
	const int dirty_bit = EXTENT_DIRTY;

	return set_extent_bits(tree, start, end, dirty_bit);
}
/* Remove EXTENT_DIRTY from the inclusive range [start, end]. */
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end)
{
	const int dirty_bit = EXTENT_DIRTY;

	return clear_extent_bits(tree, start, end, dirty_bit);
}
/*
 * Find the first state, starting the search at @start, that ends at or
 * after @start and has at least one of @bits set.
 *
 * On a hit the state's range is stored in @start_ret / @end_ret and 0 is
 * returned; otherwise the outputs are left untouched and 1 is returned.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, int bits)
{
	struct cache_extent *cur;

	for (cur = search_cache_extent(&tree->state, start); cur;
	     cur = next_cache_extent(cur)) {
		struct extent_state *es;

		es = container_of(cur, struct extent_state, cache_node);
		if (es->end < start || !(es->state & bits))
			continue;

		*start_ret = es->start;
		*end_ret = es->end;
		return 0;
	}

	return 1;
}
/*
 * Check whether @bits are present in the inclusive range [start, end].
 *
 * With @filled set, returns 1 only if states carrying one of @bits cover
 * the range without a gap; any hole or state lacking the bits yields 0.
 * With @filled clear, returns 1 as soon as any state in the range has one
 * of @bits set.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int filled)
{
	struct cache_extent *cur = search_cache_extent(&tree->state, start);
	int found = 0;

	while (cur && start <= end) {
		struct extent_state *es;

		es = container_of(cur, struct extent_state, cache_node);

		/* A gap before this state breaks a "filled" range. */
		if (filled && es->start > start)
			return 0;
		if (es->start > end)
			return found;

		if (es->state & bits) {
			if (!filled)
				return 1;
			found = 1;
		} else if (filled) {
			return 0;
		}

		start = es->end + 1;
		if (start > end)
			return found;

		cur = next_cache_extent(cur);
		/* Ran out of states before the range was covered. */
		if (!cur && filled)
			return 0;
	}

	return found;
}
/*
 * Store @private in the state that begins exactly at @start.
 *
 * Returns 0 on success, or -ENOENT when the search finds no state or the
 * state found does not start at @start.
 */
int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
	struct cache_extent *hit = search_cache_extent(&tree->state, start);
	struct extent_state *es;

	if (!hit)
		return -ENOENT;

	es = container_of(hit, struct extent_state, cache_node);
	if (es->start != start)
		return -ENOENT;

	es->xprivate = private;
	return 0;
}
/*
 * Fetch the private value of the state that begins exactly at @start.
 *
 * Returns 0 and fills *@private on success, or -ENOENT (leaving *@private
 * untouched) when the search finds no state or the state found does not
 * start at @start.
 */
int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
{
	struct cache_extent *hit = search_cache_extent(&tree->state, start);
	struct extent_state *es;

	if (!hit)
		return -ENOENT;

	es = container_of(hit, struct extent_state, cache_node);
	if (es->start != start)
		return -ENOENT;

	*private = es->xprivate;
	return 0;
}
/*
 * Allocate an extent buffer for [bytenr, bytenr + blocksize) with the data
 * area placed directly behind the structure.
 *
 * The buffer starts with one reference, no flags, no backing fd (-1) and
 * an unset device offset ((u64)-1).  It is NOT inserted into any cache
 * tree or LRU list; callers do that when needed.
 *
 * Returns NULL when the allocation fails.
 */
static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *info,
						   u64 bytenr, u32 blocksize)
{
	struct extent_buffer *eb;

	eb = calloc(1, sizeof(struct extent_buffer) + blocksize);
	if (!eb)
		return NULL;

	eb->start = bytenr;
	eb->len = blocksize;
	eb->refs = 1;
	eb->flags = 0;
	eb->fd = -1;
	eb->dev_bytenr = (u64)-1;
	eb->cache_node.start = bytenr;
	eb->cache_node.size = blocksize;
	eb->fs_info = info;
	INIT_LIST_HEAD(&eb->recow);
	INIT_LIST_HEAD(&eb->lru);

	/*
	 * Explicitly zero the data area.  calloc() above already hands back
	 * zeroed memory; this call is kept from an upstream change made after
	 * valgrind reported uninitialised bytes from a new buffer reaching
	 * pwrite().
	 */
	memset_extent_buffer(eb, 0, 0, blocksize);
	return eb;
}
/*
 * Create a private copy of @src: same start and length, same data.
 *
 * The copy is flagged EXTENT_BUFFER_DUMMY.  Returns NULL when the
 * allocation fails.
 */
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
{
	struct extent_buffer *copy;

	copy = __alloc_extent_buffer(src->fs_info, src->start, src->len);
	if (copy) {
		copy_extent_buffer(copy, src, 0, 0, src->len);
		copy->flags |= EXTENT_BUFFER_DUMMY;
	}
	return copy;
}
/*
 * Release the memory of an extent buffer whose reference count is zero
 * (anything else trips BUG_ON()).
 *
 * The buffer is unlinked from the LRU list.  Unless it is flagged
 * EXTENT_BUFFER_DUMMY it is also removed from the fs-wide cache tree and
 * its length is subtracted from the cache size accounting.
 */
static void free_extent_buffer_final(struct extent_buffer *eb)
{
	const bool cached = !(eb->flags & EXTENT_BUFFER_DUMMY);

	BUG_ON(eb->refs);
	list_del_init(&eb->lru);

	if (cached) {
		struct extent_io_tree *tree = &eb->fs_info->extent_cache;

		remove_cache_extent(&tree->cache, &eb->cache_node);
		BUG_ON(tree->cache_size < eb->len);
		tree->cache_size -= eb->len;
	}

	free(eb);
}
static void free_extent_buffer_internal(struct extent_buffer *eb, bool free_now)
{
if (!eb || IS_ERR(eb))
return;
eb->refs--;
BUG_ON(eb->refs < 0);
if (eb->refs == 0) {
btrfs-progs: only warn if there are leaked extent buffers after transaction abort Another BUG_ON() during fuzz/003: ====== RUN MAYFAIL btrfs check --init-csum-tree tests/fuzz-tests/images/bko-161821.raw.restored [1/7] checking root items Fixed 0 roots. [2/7] checking extents parent transid verify failed on 4198400 wanted 14 found 1114126 parent transid verify failed on 4198400 wanted 14 found 1114126 Ignoring transid failure owner ref check failed [4198400 4096] repair deleting extent record: key [4198400,169,0] adding new tree backref on start 4198400 len 4096 parent 0 root 5 Repaired extent references for 4198400 ref mismatch on [4222976 4096] extent item 1, found 0 backref 4222976 root 7 not referenced back 0x5617f8ecf780 incorrect global backref count on 4222976 found 1 wanted 0 backpointer mismatch on [4222976 4096] owner ref check failed [4222976 4096] repair deleting extent record: key [4222976,169,0] Repaired extent references for 4222976 [3/7] checking free space cache [4/7] checking fs roots parent transid verify failed on 4198400 wanted 14 found 1114126 Ignoring transid failure Wrong generation of child node/leaf, wanted: 1114126, have: 14 root 5 missing its root dir, recreating parent transid verify failed on 4198400 wanted 14 found 1114126 Ignoring transid failure ERROR: child eb corrupted: parent bytenr=4222976 item=0 parent level=1 child level=2 ERROR: errors found in fs roots extent buffer leak: start 4222976 len 4096 extent_io.c:611: free_extent_buffer_internal: BUG_ON `eb->flags & EXTENT_DIRTY` triggered, value 1 failed (ignored, ret=134): btrfs check --init-csum-tree tests/fuzz-tests/images/bko-161821.raw.restored mayfail: returned code 134 (SIGABRT), not ignored test failed for case 003-multi-check-unmounted Since we're shifting to use btrfs_abort_transaction() in btrfs-progs, it will be more and more common to see dirty leaked eb. Instead of BUG_ON(), we only need to report it as a warning. 
Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-03 05:50:19 +00:00
if (eb->flags & EXTENT_DIRTY) {
warning(
"dirty eb leak (aborted trans): start %llu len %u",
eb->start, eb->len);
}
list_del_init(&eb->recow);
if (eb->flags & EXTENT_BUFFER_DUMMY || free_now)
free_extent_buffer_final(eb);
}
}
/*
 * Drop one reference on @eb without forcing an immediate free: once the
 * count hits zero only EXTENT_BUFFER_DUMMY buffers are released right away.
 */
void free_extent_buffer(struct extent_buffer *eb)
{
	free_extent_buffer_internal(eb, false);
}
/*
 * Drop one reference on @eb and release its memory as soon as the count
 * hits zero.
 */
void free_extent_buffer_nocache(struct extent_buffer *eb)
{
	free_extent_buffer_internal(eb, true);
}
/*
 * Look up the cached extent buffer that covers exactly
 * [bytenr, bytenr + blocksize).
 *
 * On a hit the buffer gains a reference and moves to the tail of the LRU
 * list.  Returns NULL when nothing matches exactly.
 */
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
					 u64 bytenr, u32 blocksize)
{
	struct cache_extent *hit;
	struct extent_buffer *eb;

	hit = lookup_cache_extent(&tree->cache, bytenr, blocksize);
	if (!hit || hit->start != bytenr || hit->size != blocksize)
		return NULL;

	eb = container_of(hit, struct extent_buffer, cache_node);
	eb->refs++;
	list_move_tail(&eb->lru, &tree->lru);
	return eb;
}
/*
 * Return the first cached extent buffer that search_cache_extent() finds
 * for @start.
 *
 * On a hit the buffer gains a reference and moves to the tail of the LRU
 * list.  Returns NULL when the search finds nothing.
 */
struct extent_buffer *find_first_extent_buffer(struct extent_io_tree *tree,
					       u64 start)
{
	struct cache_extent *hit = search_cache_extent(&tree->cache, start);
	struct extent_buffer *eb;

	if (!hit)
		return NULL;

	eb = container_of(hit, struct extent_buffer, cache_node);
	eb->refs++;
	list_move_tail(&eb->lru, &tree->lru);
	return eb;
}
static void trim_extent_buffer_cache(struct extent_io_tree *tree)
{
struct extent_buffer *eb, *tmp;
list_for_each_entry_safe(eb, tmp, &tree->lru, lru) {
if (eb->refs == 0)
free_extent_buffer_final(eb);
if (tree->cache_size <= ((tree->max_cache_size * 9) / 10))
break;
}
}
/*
 * Get the cached extent buffer for [bytenr, bytenr + blocksize), creating
 * and caching a new one when no exact match exists.
 *
 * An exact hit gains a reference and moves to the LRU tail.  If the lookup
 * returns an entry with a different range, one reference is dropped on
 * that entry before the new buffer is allocated.  A new buffer is inserted
 * into the cache tree and LRU list, and the cache is trimmed once its size
 * reaches the configured maximum.
 *
 * Returns NULL when allocation or insertion into the cache tree fails.
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 bytenr, u32 blocksize)
{
	struct extent_io_tree *tree = &fs_info->extent_cache;
	struct cache_extent *cache;
	struct extent_buffer *eb;

	cache = lookup_cache_extent(&tree->cache, bytenr, blocksize);
	if (cache) {
		eb = container_of(cache, struct extent_buffer, cache_node);
		if (cache->start == bytenr && cache->size == blocksize) {
			list_move_tail(&eb->lru, &tree->lru);
			eb->refs++;
			return eb;
		}
		/* Entry with a different range: drop a reference on it. */
		free_extent_buffer(eb);
	}

	eb = __alloc_extent_buffer(fs_info, bytenr, blocksize);
	if (!eb)
		return NULL;

	if (insert_cache_extent(&tree->cache, &eb->cache_node)) {
		free(eb);
		return NULL;
	}

	list_add_tail(&eb->lru, &tree->lru);
	tree->cache_size += blocksize;
	if (tree->cache_size >= tree->max_cache_size)
		trim_extent_buffer_cache(tree);

	return eb;
}
btrfs-progs: disk-io: Verify the bytenr passed in is mapped for read_tree_block() [BUG] For a fuzzed image, `btrfs check` will segfault at open_ctree() stage: $ btrfs check --mode=lowmem issue_207.raw Opening filesystem to check... extent_io.c:665: free_extent_buffer_internal: BUG_ON `eb->refs < 0` triggered, value 1 btrfs(+0x6bf67)[0x56431d278f67] btrfs(+0x6c16e)[0x56431d27916e] btrfs(alloc_extent_buffer+0x45)[0x56431d279db5] btrfs(read_tree_block+0x59)[0x56431d2848f9] btrfs(btrfs_setup_all_roots+0x29c)[0x56431d28535c] btrfs(+0x78903)[0x56431d285903] btrfs(open_ctree_fs_info+0x90)[0x56431d285b60] btrfs(+0x45a01)[0x56431d252a01] btrfs(main+0x94)[0x56431d2220c4] /usr/lib/libc.so.6(__libc_start_main+0xf3)[0x7f6e28519153] btrfs(_start+0x2e)[0x56431d22235e] [CAUSE] The fuzzed image has a strange log root bytenr: log_root 61440 log_root_transid 0 In fact, the log_root seems to be fuzzed, as its transid is 0, which is invalid. Note that range [61440, 77824) covers the physical offset of the primary super block. The bug is caused by the following sequence: 1. cache for tree block [64K, 68K) is created by open_ctree() __open_ctree_fd() |- btrfs_setup_chunk_tree_and_device_map() |- btrfs_read_sys_array() |- sb = btrfs_find_create_tree_block() |- free_extent_buffer(sb) This created an extent buffer [64K, 68K) in fs_info->extent_cache, then reduce the refcount of that eb back to 0, but not freed yet. 2. Try to read that corrupted log root __open_ctree_fd() |- btrfs_setup_chunk_tree_and_device_map() |- btrfs_setup_all_roots() |- find_and_setup_log_root() |- read_tree_block() |- btrfs_find_create_tree_block() |- alloc_extent_buffer() The final alloc_extent_buffer() will try to free that cached eb [64K, 68K), since it doesn't match with current search. And since that cached eb is already released (refcount == 0), the extra free_extent_buffer() will cause above BUG_ON(). 
[FIX] Here we fix it through a more comprehensive method, instead of simply verifying log_root_transid, here we just don't pollute eb cache when reading sys chunk array. So that we won't have an eb cache [64K, 68K), and will error out at logical mapping phase. Issue: #207 Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-12-18 01:19:39 +00:00
/*
* Allocate a dummy extent buffer which won't be inserted into extent buffer
* cache.
*
* This mostly allows super block read write using existing eb infrastructure
* without pulluting the eb cache.
*
* This is especially important to avoid injecting eb->start == SZ_64K, as
* fuzzed image could have invalid tree bytenr covers super block range,
* and cause ref count underflow.
*/
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 bytenr, u32 blocksize)
{
struct extent_buffer *ret;
ret = __alloc_extent_buffer(fs_info, bytenr, blocksize);
if (!ret)
return NULL;
ret->flags |= EXTENT_BUFFER_DUMMY;
return ret;
}
/*
 * Read @len bytes from the buffer's fd at eb->dev_bytenr into the buffer's
 * data area at @offset.
 *
 * Returns 0 on success, -errno when pread() fails, or -EIO on a short read.
 */
int read_extent_from_disk(struct extent_buffer *eb,
			  unsigned long offset, unsigned long len)
{
	int ret = pread(eb->fd, eb->data + offset, len, eb->dev_bytenr);

	if (ret < 0)
		return -errno;
	if (ret != len)
		return -EIO;
	return 0;
}
/*
 * Write the whole data area of @eb (eb->len bytes) to the buffer's fd at
 * eb->dev_bytenr.
 *
 * Returns 0 on success, -errno when pwrite() fails (matching
 * read_extent_from_disk(), instead of leaking the raw -1), or -EIO on a
 * short write.
 */
int write_extent_to_disk(struct extent_buffer *eb)
{
	int ret;

	ret = pwrite(eb->fd, eb->data, eb->len, eb->dev_bytenr);
	if (ret < 0) {
		ret = -errno;
		goto out;
	}
	if (ret != eb->len) {
		ret = -EIO;
		goto out;
	}
	ret = 0;
out:
	return ret;
}
/*
 * Read @bytes bytes starting at logical address @offset into @buf.
 *
 * The range is processed in pieces: each piece is mapped with
 * btrfs_map_block() for the given @mirror and read from the first stripe
 * of the mapping.
 *
 * Returns 0 on success, -EIO when mapping fails, the device has no usable
 * fd or a read comes up short, and -errno when pread() itself fails.
 */
int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 offset,
			u64 bytes, int mirror)
{
	struct btrfs_multi_bio *multi = NULL;
	struct btrfs_device *device;
	u64 bytes_left = bytes;
	u64 read_len;
	u64 total_read = 0;
	int ret;

	while (bytes_left) {
		read_len = bytes_left;
		ret = btrfs_map_block(info, READ, offset, &read_len, &multi,
				      mirror, NULL);
		if (ret) {
			fprintf(stderr, "Couldn't map the block %Lu\n",
				offset);
			return -EIO;
		}
		device = multi->stripes[0].dev;

		/* The mapping may be longer than what is still wanted. */
		read_len = min(bytes_left, read_len);
		if (device->fd <= 0) {
			kfree(multi);
			return -EIO;
		}

		ret = pread(device->fd, buf + total_read, read_len,
			    multi->stripes[0].physical);
		/* Capture errno before any other call can overwrite it. */
		if (ret < 0)
			ret = -errno;
		kfree(multi);
		multi = NULL;

		if (ret < 0) {
			fprintf(stderr, "Error reading %Lu, %d\n", offset,
				ret);
			return ret;
		}
		if (ret != read_len) {
			fprintf(stderr, "Short read for %Lu, read %d, "
				"read_len %Lu\n", offset, ret, read_len);
			return -EIO;
		}

		bytes_left -= read_len;
		offset += read_len;
		total_read += read_len;
	}
	return 0;
}
/*
 * Write @bytes bytes from @buf to logical address @offset.
 *
 * The range is processed in pieces, each mapped with btrfs_map_block().
 * When the mapping comes with a raid_map, the piece (capped at nodesize)
 * is copied into a temporary extent buffer and handed to
 * write_raid56_with_parity(); otherwise the piece is written to every
 * stripe of the mapping.
 *
 * Returns 0 on success, -EIO when mapping fails, a device has no usable fd
 * or a write comes up short, -ENOMEM when the temporary buffer cannot be
 * allocated, and -errno (negative, like the other error returns) when
 * pwrite() fails.
 */
int write_data_to_disk(struct btrfs_fs_info *info, void *buf, u64 offset,
		       u64 bytes, int mirror)
{
	struct btrfs_multi_bio *multi = NULL;
	struct btrfs_device *device;
	u64 bytes_left = bytes;
	u64 this_len;
	u64 total_write = 0;
	u64 *raid_map = NULL;
	u64 dev_bytenr;
	int dev_nr;
	int ret = 0;

	while (bytes_left > 0) {
		this_len = bytes_left;
		dev_nr = 0;

		ret = btrfs_map_block(info, WRITE, offset, &this_len, &multi,
				      mirror, &raid_map);
		if (ret) {
			fprintf(stderr, "Couldn't map the block %Lu\n",
				offset);
			return -EIO;
		}

		if (raid_map) {
			struct extent_buffer *eb;
			u64 stripe_len = this_len;

			this_len = min(this_len, bytes_left);
			this_len = min(this_len, (u64)info->nodesize);

			/* Zeroed temporary eb carrying just this piece. */
			eb = calloc(1, sizeof(struct extent_buffer) + this_len);
			if (!eb) {
				fprintf(stderr, "cannot allocate memory for eb\n");
				ret = -ENOMEM;
				goto out;
			}

			eb->start = offset;
			eb->len = this_len;

			memcpy(eb->data, buf + total_write, this_len);
			ret = write_raid56_with_parity(info, eb, multi,
						       stripe_len, raid_map);
			BUG_ON(ret);

			free(eb);
			kfree(raid_map);
			raid_map = NULL;
		} else {
			while (dev_nr < multi->num_stripes) {
				device = multi->stripes[dev_nr].dev;
				if (device->fd <= 0) {
					ret = -EIO;
					goto out;
				}

				dev_bytenr = multi->stripes[dev_nr].physical;
				this_len = min(this_len, bytes_left);
				dev_nr++;

				ret = pwrite(device->fd, buf + total_write,
					     this_len, dev_bytenr);
				if (ret != this_len) {
					if (ret < 0) {
						/* Save errno before fprintf() can change it. */
						int saved_errno = errno;

						fprintf(stderr, "Error writing to "
							"device %d\n", saved_errno);
						ret = -saved_errno;
					} else {
						fprintf(stderr, "Short write\n");
						ret = -EIO;
					}
					goto out;
				}
			}
		}

		BUG_ON(bytes_left < this_len);

		bytes_left -= this_len;
		offset += this_len;
		total_write += this_len;

		kfree(multi);
		multi = NULL;
	}
	return 0;

out:
	/* Error exit: release whatever the current mapping still owns. */
	kfree(multi);
	kfree(raid_map);
	return ret;
}
/*
 * Mark @eb dirty.  On the clean -> dirty transition the buffer's range is
 * marked EXTENT_DIRTY in the fs-wide extent cache tree and extent_buffer_get()
 * is called on the buffer.  Already-dirty buffers are left alone.
 *
 * Always returns 0.
 */
int set_extent_buffer_dirty(struct extent_buffer *eb)
{
	struct extent_io_tree *tree = &eb->fs_info->extent_cache;

	if (eb->flags & EXTENT_DIRTY)
		return 0;

	eb->flags |= EXTENT_DIRTY;
	set_extent_dirty(tree, eb->start, eb->start + eb->len - 1);
	extent_buffer_get(eb);
	return 0;
}
/*
 * Mark @eb clean.  On the dirty -> clean transition EXTENT_DIRTY is cleared
 * for the buffer's range in the fs-wide extent cache tree and one
 * reference on the buffer is dropped.  Clean buffers are left alone.
 *
 * Always returns 0.
 */
int clear_extent_buffer_dirty(struct extent_buffer *eb)
{
	struct extent_io_tree *tree = &eb->fs_info->extent_cache;

	if (!(eb->flags & EXTENT_DIRTY))
		return 0;

	eb->flags &= ~EXTENT_DIRTY;
	clear_extent_dirty(tree, eb->start, eb->start + eb->len - 1);
	free_extent_buffer(eb);
	return 0;
}
/*
 * Compare @len bytes of the buffer's data at offset @start against @ptrv.
 * Returns the memcmp() result.  No bounds checking is done.
 */
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
			 unsigned long start, unsigned long len)
{
	const void *region = eb->data + start;

	return memcmp(region, ptrv, len);
}
/*
 * Copy @len bytes out of the buffer's data, starting at offset @start,
 * into @dst.  No bounds checking is done.
 */
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
			unsigned long start, unsigned long len)
{
	const void *from = eb->data + start;

	memcpy(dst, from, len);
}
/*
 * Copy @len bytes from @src into the buffer's data at offset @start.
 * No bounds checking is done.
 */
void write_extent_buffer(struct extent_buffer *eb, const void *src,
			 unsigned long start, unsigned long len)
{
	void *to = eb->data + start;

	memcpy(to, src, len);
}
/*
 * Copy @len bytes from @src (at @src_offset) into @dst (at @dst_offset).
 * Uses memcpy(), so the two regions must not overlap.
 */
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len)
{
	void *to = dst->data + dst_offset;
	const void *from = src->data + src_offset;

	memcpy(to, from, len);
}
/*
 * Move @len bytes within one buffer from @src_offset to @dst_offset.
 * Uses memmove(), so overlapping regions are handled.
 */
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len)
{
	void *to = dst->data + dst_offset;
	const void *from = dst->data + src_offset;

	memmove(to, from, len);
}
/*
 * Fill @len bytes of the buffer's data, starting at offset @start, with
 * the byte @c.  No bounds checking is done.
 */
void memset_extent_buffer(struct extent_buffer *eb, char c,
			  unsigned long start, unsigned long len)
{
	void *region = eb->data + start;

	memset(region, c, len);
}
int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
return le_test_bit(nr, (u8 *)eb->data + start);
}