/* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License v2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ /* * Main part to implement offline(unmounted) btrfs scrub */ #include #include "ctree.h" #include "volumes.h" #include "disk-io.h" #include "utils.h" #include "kernel-lib/bitops.h" #include "task-utils.h" #include "kernel-lib/raid56.h" /* * For parity based profile (RAID56) * Mirror/stripe based on won't need this. They are iterated by bytenr and * mirror number. */ struct scrub_stripe { /* For P/Q logical start will be BTRFS_RAID5/6_P/Q_STRIPE */ u64 logical; u64 physical; /* Device is missing */ unsigned int dev_missing:1; /* Any tree/data csum mismatches */ unsigned int csum_mismatch:1; /* Some data doesn't have csum (nodatasum) */ unsigned int csum_missing:1; /* Device fd, to write correct data back to disc */ int fd; char *data; }; /* * RAID56 full stripe (data stripes + P/Q) */ struct scrub_full_stripe { u64 logical_start; u64 logical_len; u64 bg_type; u32 nr_stripes; u32 stripe_len; /* Read error stripes */ u32 err_read_stripes; /* Missing devices */ u32 err_missing_devs; /* Csum error data stripes */ u32 err_csum_dstripes; /* Missing csum data stripes */ u32 missing_csum_dstripes; /* currupted stripe index */ int corrupted_index[2]; int nr_corrupted_stripes; /* Already recovered once? 
*/ unsigned int recovered:1; struct scrub_stripe stripes[]; }; static void free_full_stripe(struct scrub_full_stripe *fstripe) { int i; for (i = 0; i < fstripe->nr_stripes; i++) free(fstripe->stripes[i].data); free(fstripe); } static struct scrub_full_stripe *alloc_full_stripe(int nr_stripes, u32 stripe_len) { struct scrub_full_stripe *ret; int size = sizeof(*ret) + sizeof(unsigned long *) + nr_stripes * sizeof(struct scrub_stripe); int i; ret = malloc(size); if (!ret) return NULL; memset(ret, 0, size); ret->nr_stripes = nr_stripes; ret->stripe_len = stripe_len; ret->corrupted_index[0] = -1; ret->corrupted_index[1] = -1; /* Alloc data memory for each stripe */ for (i = 0; i < nr_stripes; i++) { struct scrub_stripe *stripe = &ret->stripes[i]; stripe->data = malloc(stripe_len); if (!stripe->data) { free_full_stripe(ret); return NULL; } } return ret; } static inline int is_data_stripe(struct scrub_stripe *stripe) { u64 bytenr = stripe->logical; if (bytenr == BTRFS_RAID5_P_STRIPE || bytenr == BTRFS_RAID6_Q_STRIPE) return 0; return 1; } /* * Check one tree mirror given by @bytenr and @mirror, or @data. * If @data is not given (NULL), the function will try to read out tree block * using @bytenr and @mirror. * If @data is given, use data directly, won't try to read from disk. * * The extra @data prameter is handy for RAID5/6 recovery code to verify * the recovered data. * * Return 0 if everything is OK. * Return <0 something goes wrong, and @scrub_ctx accounting will be updated * if it's a data corruption. 
*/ static int check_tree_mirror(struct btrfs_fs_info *fs_info, struct btrfs_scrub_progress *scrub_ctx, char *data, u64 bytenr, int mirror) { struct extent_buffer *eb; u32 nodesize = fs_info->nodesize; int ret; if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) { /* Such error will be reported by check_tree_block() */ scrub_ctx->verify_errors++; return -EIO; } eb = btrfs_find_create_tree_block(fs_info, bytenr); if (!eb) return -ENOMEM; if (data) { memcpy(eb->data, data, nodesize); } else { ret = read_whole_eb(fs_info, eb, mirror); if (ret) { scrub_ctx->read_errors++; error("failed to read tree block %llu mirror %d", bytenr, mirror); goto out; } } scrub_ctx->tree_bytes_scrubbed += nodesize; if (csum_tree_block(fs_info, eb, 1)) { error("tree block %llu mirror %d checksum mismatch", bytenr, mirror); scrub_ctx->csum_errors++; ret = -EIO; goto out; } ret = check_tree_block(fs_info, eb); if (ret < 0) { error("tree block %llu mirror %d is invalid", bytenr, mirror); scrub_ctx->verify_errors++; goto out; } scrub_ctx->tree_extents_scrubbed++; out: free_extent_buffer(eb); return ret; } /* * read_extent_data() helper * * This function will handle short read and update @scrub_ctx when read * error happens. */ static int read_extent_data_loop(struct btrfs_fs_info *fs_info, struct btrfs_scrub_progress *scrub_ctx, char *buf, u64 start, u64 len, int mirror) { int ret = 0; u64 cur = 0; while (cur < len) { u64 read_len = len - cur; ret = read_extent_data(fs_info, buf + cur, start + cur, &read_len, mirror); if (ret < 0) { error("failed to read out data at bytenr %llu mirror %d", start + cur, mirror); scrub_ctx->read_errors++; break; } cur += read_len; } return ret; } /* * Recover all other (corrupted) mirrors for tree block. 
*
 * The method is quite simple, just read out the correct mirror specified by
 * @good_mirror and write back correct data to all other mirrors.
 *
 * @start:       logical bytenr of the tree block
 * @good_mirror: mirror number (starting from 1) holding the good copy
 *
 * Return 0 if all other mirrors are rewritten.
 * Return <0 if memory allocation, reading the good mirror or writing any
 * other mirror failed.
 */
static int recover_tree_mirror(struct btrfs_fs_info *fs_info,
			       struct btrfs_scrub_progress *scrub_ctx,
			       u64 start, int good_mirror)
{
	char *buf;
	u32 nodesize = fs_info->nodesize;
	int i;
	int num_copies;
	int ret;

	buf = malloc(nodesize);
	if (!buf)
		return -ENOMEM;
	ret = read_extent_data_loop(fs_info, scrub_ctx, buf, start, nodesize,
				    good_mirror);
	if (ret < 0) {
		error("failed to read tree block at bytenr %llu mirror %d",
		      start, good_mirror);
		goto out;
	}

	num_copies = btrfs_num_copies(fs_info, start, nodesize);
	/*
	 * Mirror numbers start from 1, like in all other mirror iterations of
	 * this file; 0 is not a specific mirror and must not be written to.
	 */
	for (i = 1; i <= num_copies; i++) {
		if (i == good_mirror)
			continue;
		ret = write_data_to_disk(fs_info, buf, start, nodesize, i);
		if (ret < 0) {
			error("failed to write tree block at bytenr %llu mirror %d",
			      start, i);
			goto out;
		}
	}
	ret = 0;
out:
	free(buf);
	return ret;
}

/*
 * Check one data mirror given by @start @len and @mirror, or @data
 * If @data is not given, try to read it from disk.
 * This function will try to read out all the data then check sum.
 *
 * If @data is given, just use the data.
 * This behavior is useful for RAID5/6 recovery code to verify recovered data.
 *
 * If @corrupt_bitmap is given, record corrupted sectors into that bitmap.
 * This is useful for mirror based profiles to recover its data.
 *
 * Return 0 if everything is OK.
 * Return <0 if something goes wrong, and @scrub_ctx accounting will be updated
 * if it's a data corruption.
*/ static int check_data_mirror(struct btrfs_fs_info *fs_info, struct btrfs_scrub_progress *scrub_ctx, char *data, u64 start, u64 len, int mirror, unsigned long *corrupt_bitmap) { u32 sectorsize = fs_info->sectorsize; u32 data_csum; u32 *csums = NULL; char *buf = NULL; int ret = 0; int err = 0; int i; unsigned long *csum_bitmap = NULL; if (!data) { buf = malloc(len); if (!buf) return -ENOMEM; ret = read_extent_data_loop(fs_info, scrub_ctx, buf, start, len, mirror); if (ret < 0) goto out; scrub_ctx->data_bytes_scrubbed += len; } else { buf = data; } /* Alloc and Check csums */ csums = malloc(len / sectorsize * sizeof(data_csum)); if (!csums) { ret = -ENOMEM; goto out; } csum_bitmap = malloc(calculate_bitmap_len(len / sectorsize)); if (!csum_bitmap) { ret = -ENOMEM; goto out; } if (corrupt_bitmap) memset(corrupt_bitmap, 0, calculate_bitmap_len(len / sectorsize)); ret = btrfs_read_data_csums(fs_info, start, len, csums, csum_bitmap); if (ret < 0) goto out; for (i = 0; i < len / sectorsize; i++) { if (!test_bit(i, csum_bitmap)) { scrub_ctx->csum_discards++; continue; } data_csum = ~(u32)0; data_csum = btrfs_csum_data(buf + i * sectorsize, data_csum, sectorsize); btrfs_csum_final(data_csum, (u8 *)&data_csum); if (memcmp(&data_csum, (char *)csums + i * sizeof(data_csum), sizeof(data_csum))) { error("data at bytenr %llu mirror %d csum mismatch, have 0x%08x expect 0x%08x", start + i * sectorsize, mirror, data_csum, *(u32 *)((char *)csums + i * sizeof(data_csum))); err = 1; scrub_ctx->csum_errors++; if (corrupt_bitmap) set_bit(i, corrupt_bitmap); continue; } scrub_ctx->data_bytes_scrubbed += sectorsize; } out: if (!data) free(buf); free(csums); free(csum_bitmap); if (!ret && err) return -EIO; return ret; } /* Helper to check all mirrors for a good copy */ static int has_good_mirror(unsigned long *corrupt_bitmaps[], int num_copies, int bit, int *good_mirror) { int found_good = 0; int i; for (i = 0; i < num_copies; i++) { if (!test_bit(bit, corrupt_bitmaps[i])) { found_good = 
1; if (good_mirror) *good_mirror = i + 1; break; } } return found_good; } /* * Helper function to check @corrupt_bitmaps, to verify if it's recoverable * for mirror based data extent. * * Return 1 for recoverable, and 0 for not recoverable */ static int check_data_mirror_recoverable(struct btrfs_fs_info *fs_info, u64 start, u64 len, u32 sectorsize, unsigned long *corrupt_bitmaps[]) { int i; int corrupted = 0; int bit; int num_copies = btrfs_num_copies(fs_info, start, len); for (i = 0; i < num_copies; i++) { for_each_set_bit(bit, corrupt_bitmaps[i], len / sectorsize) { if (!has_good_mirror(corrupt_bitmaps, num_copies, bit, NULL)) { corrupted = 1; goto out; } } } out: return !corrupted; } /* * Try to recover all corrupted sectors specified by @corrupt_bitmaps, * by reading out good sector in other mirror. */ static int recover_data_mirror(struct btrfs_fs_info *fs_info, struct btrfs_scrub_progress *scrub_ctx, u64 start, u64 len, unsigned long *corrupt_bitmaps[]) { char *buf; u32 sectorsize = fs_info->sectorsize; int ret = 0; int bit; int i; int bad_mirror; int num_copies; /* Don't bother to recover unrecoverable extents */ if (!check_data_mirror_recoverable(fs_info, start, len, sectorsize, corrupt_bitmaps)) return -EIO; buf = malloc(sectorsize); if (!buf) return -ENOMEM; num_copies = btrfs_num_copies(fs_info, start, len); for (i = 0; i < num_copies; i++) { for_each_set_bit(bit, corrupt_bitmaps[i], len / sectorsize) { u64 cur = start + bit * sectorsize; int good; /* Find good mirror */ ret = has_good_mirror(corrupt_bitmaps, num_copies, bit, &good); if (!ret) { error("failed to find good mirror for bytenr %llu", cur); ret = -EIO; goto out; } /* Read out good mirror */ ret = read_data_from_disk(fs_info, buf, cur, sectorsize, good); if (ret < 0) { error("failed to read good mirror from bytenr %llu mirror %d", cur, good); goto out; } /* Write back to all other mirrors */ for (bad_mirror = 1; bad_mirror <= num_copies; bad_mirror++) { if (bad_mirror == good) continue; ret = 
write_data_to_disk(fs_info, buf, cur, sectorsize, bad_mirror); if (ret < 0) { error("failed to recover mirror for bytenr %llu mirror %d", cur, bad_mirror); goto out; } } } } out: free(buf); return ret; } /* Btrfs only supports up to 2 copies of data, yet */ #define BTRFS_MAX_COPIES 2 /* * Check all copies of range @start, @len. * Caller must ensure the range is covered by EXTENT_ITEM/METADATA_ITEM * specified by leaf of @path. * And @start, @len must be a subset of the EXTENT_ITEM/METADATA_ITEM. * * Return 0 if the range is all OK or recovered or recoverable. * Return <0 if the range can't be recoverable. */ static int scrub_one_extent(struct btrfs_fs_info *fs_info, struct btrfs_scrub_progress *scrub_ctx, struct btrfs_path *path, u64 start, u64 len, int write) { struct btrfs_key key; struct btrfs_extent_item *ei; struct extent_buffer *leaf = path->nodes[0]; u32 sectorsize = fs_info->sectorsize; unsigned long *corrupt_bitmaps[BTRFS_MAX_COPIES] = { NULL }; int slot = path->slots[0]; int num_copies; int meta_corrupted = 0; int meta_good_mirror = 0; int data_bad_mirror = 0; u64 extent_start; u64 extent_len; int metadata = 0; int i; int ret = 0; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.type != BTRFS_METADATA_ITEM_KEY && key.type != BTRFS_EXTENT_ITEM_KEY) goto invalid_arg; extent_start = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY) { extent_len = fs_info->nodesize; metadata = 1; } else { extent_len = key.offset; ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); if (btrfs_extent_flags(leaf, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK) metadata = 1; } if (start >= extent_start + extent_len || start + len <= extent_start) goto invalid_arg; for (i = 0; i < BTRFS_MAX_COPIES; i++) { corrupt_bitmaps[i] = malloc( calculate_bitmap_len(len / sectorsize)); if (!corrupt_bitmaps[i]) goto out; } num_copies = btrfs_num_copies(fs_info, start, len); for (i = 1; i <= num_copies; i++) { if (metadata) { ret = check_tree_mirror(fs_info, scrub_ctx, NULL, extent_start, i); 
scrub_ctx->tree_extents_scrubbed++; if (ret < 0) meta_corrupted++; else meta_good_mirror = i; } else { ret = check_data_mirror(fs_info, scrub_ctx, NULL, start, len, i, corrupt_bitmaps[i - 1]); scrub_ctx->data_extents_scrubbed++; } } /* Metadata recover and report */ if (metadata) { if (!meta_corrupted) { goto out; } else if (meta_corrupted && meta_corrupted < num_copies) { if (write) { ret = recover_tree_mirror(fs_info, scrub_ctx, start, meta_good_mirror); if (ret < 0) { error("failed to recover tree block at bytenr %llu", start); goto out; } printf("extent %llu len %llu REPAIRED: has corrupted mirror, repaired\n", start, len); goto out; } printf("extent %llu len %llu RECOVERABLE: has corrupted mirror, but is recoverable\n", start, len); goto out; } else { error("extent %llu len %llu CORRUPTED: all mirror(s) corrupted, can't be recovered", start, len); ret = -EIO; goto out; } } /* Data recover and report */ for (i = 0; i < num_copies; i++) { if (find_first_bit(corrupt_bitmaps[i], len / sectorsize) >= len / sectorsize) continue; data_bad_mirror = i + 1; } /* All data sectors are good */ if (!data_bad_mirror) { ret = 0; goto out; } if (check_data_mirror_recoverable(fs_info, start, len, sectorsize, corrupt_bitmaps)) { if (write) { ret = recover_data_mirror(fs_info, scrub_ctx, start, len, corrupt_bitmaps); if (ret < 0) { error("failed to recover data extent at bytenr %llu len %llu", start, len); goto out; } printf("extent %llu len %llu REPARIED: has corrupted mirror, repaired\n", start, len); goto out; } printf("extent %llu len %llu RECOVERABLE: has corrupted mirror, recoverable\n", start, len); goto out; } error("extent %llu len %llu CORRUPTED, all mirror(s) corrupted, can't be repaired", start, len); ret = -EIO; out: for (i = 0; i < BTRFS_MAX_COPIES; i++) kfree(corrupt_bitmaps[i]); return ret; invalid_arg: error("invalid parameter for %s", __func__); return -EINVAL; } /* * Scrub one full data stripe of RAID5/6. 
* This means it will check any data/metadata extent in the data stripe * spcified by @stripe and @stripe_len * * This function will only *CHECK* if the data stripe has any corruption. * Won't repair at this function. * * Return 0 if the full stripe is OK. * Return <0 if any error is found. * Note: Missing csum is not counted as error (NODATACSUM is valid) */ static int scrub_one_data_stripe(struct btrfs_fs_info *fs_info, struct btrfs_scrub_progress *scrub_ctx, struct scrub_stripe *stripe, u32 stripe_len) { struct btrfs_path *path; struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_key key; u64 extent_start; u64 extent_len; u64 orig_csum_discards; int ret; if (!is_data_stripe(stripe)) return -EINVAL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = stripe->logical + stripe_len; key.offset = 0; key.type = 0; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; while (1) { struct btrfs_extent_item *ei; struct extent_buffer *eb; char *data; int slot; int metadata = 0; u64 check_start; u64 check_len; ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret > 0) { ret = 0; goto out; } if (ret < 0) goto out; eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &key, slot); extent_start = key.objectid; ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); /* tree block scrub */ if (key.type == BTRFS_METADATA_ITEM_KEY || btrfs_extent_flags(eb, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK) { extent_len = extent_root->fs_info->nodesize; metadata = 1; } else { extent_len = key.offset; metadata = 0; } /* Current extent is out of our range, loop comes to end */ if (extent_start + extent_len <= stripe->logical) break; if (metadata) { /* * Check crossing stripe first, which can't be scrubbed */ if (check_crossing_stripes(fs_info, extent_start, extent_root->fs_info->nodesize)) { error("tree block at %llu is crossing stripe boundary, unable to scrub", extent_start); ret = -EIO; goto out; } data = 
stripe->data + extent_start - stripe->logical; ret = check_tree_mirror(fs_info, scrub_ctx, data, extent_start, 0); /* Any csum/verify error means the stripe is screwed */ if (ret < 0) { stripe->csum_mismatch = 1; ret = -EIO; goto out; } ret = 0; continue; } /* Restrict the extent range to fit stripe range */ check_start = max(extent_start, stripe->logical); check_len = min(extent_start + extent_len, stripe->logical + stripe_len) - check_start; /* Record original csum_discards to detect missing csum case */ orig_csum_discards = scrub_ctx->csum_discards; data = stripe->data + check_start - stripe->logical; ret = check_data_mirror(fs_info, scrub_ctx, data, check_start, check_len, 0, NULL); /* Csum mismatch, no need to continue anyway*/ if (ret < 0) { stripe->csum_mismatch = 1; goto out; } /* Check if there is any missing csum for data */ if (scrub_ctx->csum_discards != orig_csum_discards) stripe->csum_missing = 1; /* * Only increase data_extents_scrubbed if we are scrubbing the * tailing part of the data extent */ if (extent_start + extent_len <= stripe->logical + stripe_len) scrub_ctx->data_extents_scrubbed++; ret = 0; } out: btrfs_free_path(path); return ret; } /* * Verify parities for RAID56 * Caller must fill @fstripe before calling this function * * Return 0 for parities matches. 
* Return >0 for P or Q mismatch
 * Return <0 for fatal error
 */
static int verify_parities(struct btrfs_fs_info *fs_info,
			   struct btrfs_scrub_progress *scrub_ctx,
			   struct scrub_full_stripe *fstripe)
{
	void **ptrs;
	void *ondisk_p = NULL;
	void *ondisk_q = NULL;
	void *buf_p;
	void *buf_q;
	int nr_stripes = fstripe->nr_stripes;
	/*
	 * The scratch parity buffers below are allocated with the full
	 * stripe's own stripe length, so generate and compare with the very
	 * same length.
	 */
	u32 stripe_len = fstripe->stripe_len;
	int i;
	int ret = 0;

	ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
	buf_p = malloc(fstripe->stripe_len);
	buf_q = malloc(fstripe->stripe_len);
	if (!ptrs || !buf_p || !buf_q) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Data stripes use the on-disk content, while P/Q slots point to
	 * scratch buffers which will receive the re-calculated parities.
	 */
	for (i = 0; i < nr_stripes; i++) {
		struct scrub_stripe *stripe = &fstripe->stripes[i];

		if (stripe->logical == BTRFS_RAID5_P_STRIPE) {
			ondisk_p = stripe->data;
			ptrs[i] = buf_p;
		} else if (stripe->logical == BTRFS_RAID6_Q_STRIPE) {
			ondisk_q = stripe->data;
			ptrs[i] = buf_q;
		} else {
			ptrs[i] = stripe->data;
		}
	}

	/*
	 * Parity generation writes into the last slot(s) of @ptrs.
	 * If P/Q are not located there (or P is missing at all), we would
	 * overwrite a data stripe and compare against a NULL pointer, so
	 * refuse such layout.
	 */
	if (ondisk_q) {
		if (nr_stripes < 2 || ptrs[nr_stripes - 2] != buf_p ||
		    ptrs[nr_stripes - 1] != buf_q) {
			ret = -EINVAL;
			goto out;
		}
	} else if (nr_stripes < 1 || ptrs[nr_stripes - 1] != buf_p) {
		ret = -EINVAL;
		goto out;
	}

	/* RAID6 */
	if (ondisk_q) {
		raid6_gen_syndrome(nr_stripes, stripe_len, ptrs);

		if (memcmp(ondisk_q, ptrs[nr_stripes - 1], stripe_len) != 0 ||
		    memcmp(ondisk_p, ptrs[nr_stripes - 2], stripe_len))
			ret = 1;
	} else {
		ret = raid5_gen_result(nr_stripes, stripe_len, nr_stripes - 1,
					ptrs);
		if (ret < 0)
			goto out;
		if (memcmp(ondisk_p, ptrs[nr_stripes - 1], stripe_len) != 0)
			ret = 1;
	}
out:
	free(buf_p);
	free(buf_q);
	free(ptrs);
	return ret;
}

/*
 * Try to recover data stripe from P or Q stripe
 *
 * Return >0 if it can't be repaired any more.
* Return 0 for successful repair or no need to repair at all
 * Return <0 for fatal error
 */
static int recover_from_parities(struct btrfs_fs_info *fs_info,
				 struct btrfs_scrub_progress *scrub_ctx,
				 struct scrub_full_stripe *fstripe)
{
	const int total = fstripe->nr_stripes;
	const int bytes = BTRFS_STRIPE_LEN;
	void **bufs;
	int tolerance;
	int idx;
	int err;

	/* Nothing is recorded as corrupted, nothing to do */
	if (fstripe->nr_corrupted_stripes == 0)
		return 0;

	/* Only one recovery attempt per full stripe */
	if (fstripe->recovered)
		return 1;

	/* RAID5 tolerates one bad stripe, otherwise (RAID6) two */
	tolerance = (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID5) ? 1 : 2;
	if (fstripe->nr_corrupted_stripes > tolerance)
		return 1;

	bufs = malloc(sizeof(void *) * fstripe->nr_stripes);
	if (!bufs)
		return -ENOMEM;

	/* Hand the in-memory content of every stripe to the raid56 code */
	for (idx = 0; idx < total; idx++)
		bufs[idx] = fstripe->stripes[idx].data;

	err = raid56_recov(total, bytes, fstripe->bg_type,
			   fstripe->corrupted_index[0],
			   fstripe->corrupted_index[1], bufs);
	fstripe->recovered = 1;

	free(bufs);
	return err;
}

/*
 * Helper to write a full stripe to disk
 * P/Q will be re-calculated.
*/ static int write_full_stripe(struct scrub_full_stripe *fstripe) { void **ptrs; int nr_stripes = fstripe->nr_stripes; int stripe_len = BTRFS_STRIPE_LEN; int i; int ret = 0; ptrs = malloc(sizeof(void *) * fstripe->nr_stripes); if (!ptrs) return -ENOMEM; for (i = 0; i < fstripe->nr_stripes; i++) ptrs[i] = fstripe->stripes[i].data; if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID6) { raid6_gen_syndrome(nr_stripes, stripe_len, ptrs); } else { ret = raid5_gen_result(nr_stripes, stripe_len, nr_stripes - 1, ptrs); if (ret < 0) goto out; } for (i = 0; i < fstripe->nr_stripes; i++) { struct scrub_stripe *stripe = &fstripe->stripes[i]; ret = pwrite(stripe->fd, stripe->data, fstripe->stripe_len, stripe->physical); if (ret != fstripe->stripe_len) { ret = -EIO; goto out; } } out: free(ptrs); return ret; } /* * Return 0 if we still have chance to recover * Return <0 if we have no more chance */ static int report_recoverablity(struct scrub_full_stripe *fstripe) { int max_tolerance; u64 start = fstripe->logical_start; if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID5) max_tolerance = 1; else max_tolerance = 2; if (fstripe->nr_corrupted_stripes > max_tolerance) { error( "full stripe %llu CORRUPTED: too many read error or corrupted devices", start); error( "full stripe %llu: tolerance: %d, missing: %d, read error: %d, csum error: %d", start, max_tolerance, fstripe->err_read_stripes, fstripe->err_missing_devs, fstripe->err_csum_dstripes); return -EIO; } return 0; } static void clear_corrupted_stripe_record(struct scrub_full_stripe *fstripe) { fstripe->corrupted_index[0] = -1; fstripe->corrupted_index[1] = -1; fstripe->nr_corrupted_stripes = 0; } static void record_corrupted_stripe(struct scrub_full_stripe *fstripe, int index) { int i = 0; for (i = 0; i < 2; i++) { if (fstripe->corrupted_index[i] == -1) { fstripe->corrupted_index[i] = index; break; } } fstripe->nr_corrupted_stripes++; } /* * Scrub one full stripe. * * If everything matches, that's good. 
* If data stripe corrupted badly, no mean to recovery, it will report it. * If data stripe corrupted, try recovery first and recheck csum, to * determine if it's recoverable or screwed up. */ static int scrub_one_full_stripe(struct btrfs_fs_info *fs_info, struct btrfs_scrub_progress *scrub_ctx, u64 start, u64 *next_ret, int write) { struct scrub_full_stripe *fstripe; struct btrfs_map_block *map_block = NULL; u32 stripe_len = BTRFS_STRIPE_LEN; u64 bg_type; u64 len; int i; int ret; if (!next_ret) { error("invalid argument for %s", __func__); return -EINVAL; } ret = __btrfs_map_block_v2(fs_info, WRITE, start, stripe_len, &map_block); if (ret < 0) { /* Let caller to skip the whole block group */ *next_ret = (u64)-1; return ret; } start = map_block->start; len = map_block->length; *next_ret = start + len; /* * Step 0: Check if we need to scrub the full stripe * * If no extent lies in the full stripe, not need to check */ ret = btrfs_check_extent_exists(fs_info, start, len); if (ret < 0) { free(map_block); return ret; } /* No extents in range, no need to check */ if (ret == 0) { free(map_block); return 0; } bg_type = map_block->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; if (bg_type != BTRFS_BLOCK_GROUP_RAID5 && bg_type != BTRFS_BLOCK_GROUP_RAID6) { free(map_block); return -EINVAL; } fstripe = alloc_full_stripe(map_block->num_stripes, map_block->stripe_len); if (!fstripe) return -ENOMEM; fstripe->logical_start = map_block->start; fstripe->nr_stripes = map_block->num_stripes; fstripe->stripe_len = stripe_len; fstripe->bg_type = bg_type; /* * Step 1: Read out the whole full stripe * * Then we have the chance to exit early if too many devices are * missing. 
*/ for (i = 0; i < map_block->num_stripes; i++) { struct scrub_stripe *s_stripe = &fstripe->stripes[i]; struct btrfs_map_stripe *m_stripe = &map_block->stripes[i]; s_stripe->logical = m_stripe->logical; s_stripe->fd = m_stripe->dev->fd; s_stripe->physical = m_stripe->physical; if (m_stripe->dev->fd == -1) { s_stripe->dev_missing = 1; record_corrupted_stripe(fstripe, i); fstripe->err_missing_devs++; continue; } ret = pread(m_stripe->dev->fd, s_stripe->data, stripe_len, m_stripe->physical); if (ret < stripe_len) { record_corrupted_stripe(fstripe, i); fstripe->err_read_stripes++; continue; } } ret = report_recoverablity(fstripe); if (ret < 0) goto out; ret = recover_from_parities(fs_info, scrub_ctx, fstripe); if (ret < 0) { error("full stripe %llu CORRUPTED: failed to recover: %s\n", fstripe->logical_start, strerror(-ret)); goto out; } /* * Clear corrupted stripes report, since they are recovered, * and later checker need to record csum mismatch stripes reusing * these members */ clear_corrupted_stripe_record(fstripe); /* * Step 2: Check each data stripes against csum */ for (i = 0; i < map_block->num_stripes; i++) { struct scrub_stripe *stripe = &fstripe->stripes[i]; if (!is_data_stripe(stripe)) continue; ret = scrub_one_data_stripe(fs_info, scrub_ctx, stripe, stripe_len); if (ret < 0) { fstripe->err_csum_dstripes++; record_corrupted_stripe(fstripe, i); } } ret = report_recoverablity(fstripe); if (ret < 0) goto out; /* * Recovered before, but no csum error */ if (fstripe->err_csum_dstripes == 0 && fstripe->recovered) { error( "full stripe %llu RECOVERABLE: P/Q is good for recovery", start); ret = 0; goto out; } /* * No csum error, not recovered before. * * Only need to check if P/Q matches. 
*/ if (fstripe->err_csum_dstripes == 0 && !fstripe->recovered) { ret = verify_parities(fs_info, scrub_ctx, fstripe); if (ret < 0) { error( "full stripe %llu CORRUPTED: failed to check P/Q: %s", start, strerror(-ret)); goto out; } if (ret > 0) { if (write) { ret = write_full_stripe(fstripe); if (ret < 0) error("failed to write full stripe %llu: %s", start, strerror(-ret)); else printf("full stripe %llu REPARIED: only P/Q mismatches, repaired\n", start); goto out; } else { printf("full stripe %llu RECOVERABLE: only P/Q is corrupted\n", start); ret = 0; } } goto out; } /* * Still csum error after recovery * * No mean to fix further, screwed up already. */ if (fstripe->err_csum_dstripes && fstripe->recovered) { error( "full stripe %llu CORRUPTED: csum still mismatch after recovery", start); ret = -EIO; goto out; } /* Csum mismatch, but we still has chance to recover. */ ret = recover_from_parities(fs_info, scrub_ctx, fstripe); if (ret < 0) { error( "full stripe %llu CORRUPTED: failed to recover: %s\n", fstripe->logical_start, strerror(-ret)); goto out; } /* After recovery, recheck data stripe csum */ for (i = 0; i < 2; i++) { int index = fstripe->corrupted_index[i]; struct scrub_stripe *stripe; if (i == -1) continue; stripe = &fstripe->stripes[index]; ret = scrub_one_data_stripe(fs_info, scrub_ctx, stripe, stripe_len); if (ret < 0) { error( "full stripe %llu CORRUPTED: csum still mismatch after recovery", start); goto out; } } if (write) { ret = write_full_stripe(fstripe); if (ret < 0) error("failed to write full stripe %llu: %s", start, strerror(-ret)); else printf("full stripe %llu REPARIED: corrupted data with good P/Q, repaired\n", start); goto out; } printf( "full stripe %llu RECOVERABLE: Data stripes corrupted, but P/Q is good\n", start); out: free_full_stripe(fstripe); free(map_block); return ret; } /* * Scrub one block group. * * This function will handle all profiles current btrfs supports. * Return 0 for scrubbing the block group. 
Found error will be recorded into * scrub_ctx. * Return <0 for fatal error preventing scrubing the block group. */ static int scrub_one_block_group(struct btrfs_fs_info *fs_info, struct btrfs_scrub_progress *scrub_ctx, struct btrfs_block_group_cache *bg_cache, int write) { struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_path *path; struct btrfs_key key; u64 bg_start = bg_cache->key.objectid; u64 bg_len = bg_cache->key.offset; int ret; if (bg_cache->flags & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { u64 cur = bg_start; u64 next; while (cur < bg_start + bg_len) { ret = scrub_one_full_stripe(fs_info, scrub_ctx, cur, &next, write); /* Ignore any non-fatal error */ if (ret < 0 && ret != -EIO) { error("fatal error happens checking one full stripe at bytenr: %llu: %s", cur, strerror(-ret)); return ret; } cur = next; } /* Ignore any -EIO error, such error will be reported at last */ return 0; } /* None parity based profile, check extent by extent */ key.objectid = bg_start; key.type = 0; key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; while (1) { struct extent_buffer *eb = path->nodes[0]; int slot = path->slots[0]; u64 extent_start; u64 extent_len; btrfs_item_key_to_cpu(eb, &key, slot); if (key.objectid >= bg_start + bg_len) break; if (key.type != BTRFS_EXTENT_ITEM_KEY && key.type != BTRFS_METADATA_ITEM_KEY) goto next; extent_start = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY) extent_len = extent_root->fs_info->nodesize; else extent_len = key.offset; ret = scrub_one_extent(fs_info, scrub_ctx, path, extent_start, extent_len, write); if (ret < 0 && ret != -EIO) { error("fatal error checking extent bytenr %llu len %llu: %s", extent_start, extent_len, strerror(-ret)); goto out; } ret = 0; next: ret = btrfs_next_extent_item(extent_root, path, bg_start + bg_len); if (ret < 0) goto out; if (ret > 0) { ret = 0; break; } } out: 
btrfs_free_path(path); return ret; } int btrfs_scrub(struct btrfs_fs_info *fs_info, struct task_context *task, int write) { u64 bg_nr = 0; struct btrfs_block_group_cache *bg_cache; struct btrfs_scrub_progress scrub_ctx = {0}; int ret = 0; ASSERT(fs_info); bg_cache = btrfs_lookup_first_block_group(fs_info, 0); if (!bg_cache) { error("no block group is found"); return -ENOENT; } ++bg_nr; if (task) { /* get block group numbers for progress */ while (1) { u64 bg_offset = bg_cache->key.objectid + bg_cache->key.offset; bg_cache = btrfs_lookup_first_block_group(fs_info, bg_offset); if (!bg_cache) break; ++bg_nr; } task->all = bg_nr; task->cur = 1; task_start(task->info); bg_cache = btrfs_lookup_first_block_group(fs_info, 0); } while (1) { ret = scrub_one_block_group(fs_info, &scrub_ctx, bg_cache, write); if (ret < 0 && ret != -EIO) break; if (task) task->cur++; bg_cache = btrfs_lookup_first_block_group(fs_info, bg_cache->key.objectid + bg_cache->key.offset); if (!bg_cache) break; } if (task) task_stop(task->info); printf("Scrub result:\n"); printf("Tree bytes scrubbed: %llu\n", scrub_ctx.tree_bytes_scrubbed); printf("Tree extents scrubbed: %llu\n", scrub_ctx.tree_extents_scrubbed); printf("Data bytes scrubbed: %llu\n", scrub_ctx.data_bytes_scrubbed); printf("Data extents scrubbed: %llu\n", scrub_ctx.data_extents_scrubbed); printf("Data bytes without csum: %llu\n", scrub_ctx.csum_discards * fs_info->sectorsize); printf("Read error: %llu\n", scrub_ctx.read_errors); printf("Verify error: %llu\n", scrub_ctx.verify_errors); printf("Csum error: %llu\n", scrub_ctx.csum_errors); if (scrub_ctx.csum_errors || scrub_ctx.read_errors || scrub_ctx.uncorrectable_errors || scrub_ctx.verify_errors) ret = 1; else ret = 0; return ret; }