diff --git a/Makefile b/Makefile index 48949039..decf8ec0 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ root-tree.o dir-item.o file-item.o inode-item.o \ inode-map.o crc32c.o rbtree.o extent-cache.o extent_io.o \ volumes.o utils.o btrfs-list.o btrfslabel.o repair.o \ - send-stream.o send-utils.o qgroup.o + send-stream.o send-utils.o qgroup.o raid6.o cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \ cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \ cmds-quota.o cmds-qgroup.o diff --git a/btrfs-corrupt-block.c b/btrfs-corrupt-block.c index b57e7573..8176fad3 100644 --- a/btrfs-corrupt-block.c +++ b/btrfs-corrupt-block.c @@ -50,7 +50,8 @@ struct extent_buffer *debug_corrupt_block(struct btrfs_root *root, u64 bytenr, length = blocksize; while (1) { ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - eb->start, &length, &multi, mirror_num); + eb->start, &length, &multi, + mirror_num, NULL); BUG_ON(ret); device = multi->stripes[0].dev; eb->fd = device->fd; @@ -63,7 +64,7 @@ struct extent_buffer *debug_corrupt_block(struct btrfs_root *root, u64 bytenr, kfree(multi); if (!copy || mirror_num == copy) { - ret = read_extent_from_disk(eb); + ret = read_extent_from_disk(eb, 0, eb->len); printf("corrupting %llu copy %d\n", eb->start, mirror_num); memset(eb->data, 0, eb->len); diff --git a/btrfs-map-logical.c b/btrfs-map-logical.c index fa4fb3f5..b9635f77 100644 --- a/btrfs-map-logical.c +++ b/btrfs-map-logical.c @@ -55,7 +55,8 @@ struct extent_buffer *debug_read_block(struct btrfs_root *root, u64 bytenr, length = blocksize; while (1) { ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - eb->start, &length, &multi, mirror_num); + eb->start, &length, &multi, + mirror_num, NULL); BUG_ON(ret); device = multi->stripes[0].dev; eb->fd = device->fd; @@ -68,7 +69,7 @@ struct extent_buffer *debug_read_block(struct btrfs_root *root, u64 bytenr, kfree(multi); if 
(!copy || mirror_num == copy) - ret = read_extent_from_disk(eb); + ret = read_extent_from_disk(eb, 0, eb->len); num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); diff --git a/cmds-balance.c b/cmds-balance.c index 38a74269..c5552492 100644 --- a/cmds-balance.c +++ b/cmds-balance.c @@ -47,6 +47,10 @@ static int parse_one_profile(const char *profile, u64 *flags) *flags |= BTRFS_BLOCK_GROUP_RAID1; } else if (!strcmp(profile, "raid10")) { *flags |= BTRFS_BLOCK_GROUP_RAID10; + } else if (!strcmp(profile, "raid5")) { + *flags |= BTRFS_BLOCK_GROUP_RAID5; + } else if (!strcmp(profile, "raid6")) { + *flags |= BTRFS_BLOCK_GROUP_RAID6; } else if (!strcmp(profile, "dup")) { *flags |= BTRFS_BLOCK_GROUP_DUP; } else if (!strcmp(profile, "single")) { diff --git a/cmds-filesystem.c b/cmds-filesystem.c index 507239ad..5332f801 100644 --- a/cmds-filesystem.c +++ b/cmds-filesystem.c @@ -148,6 +148,12 @@ static int cmd_df(int argc, char **argv) } else if (flags & BTRFS_BLOCK_GROUP_RAID10) { snprintf(description+written, 9, "%s", ", RAID10"); written += 8; + } else if (flags & BTRFS_BLOCK_GROUP_RAID5) { + snprintf(description+written, 9, "%s", ", RAID5"); + written += 7; + } else if (flags & BTRFS_BLOCK_GROUP_RAID6) { + snprintf(description+written, 9, "%s", ", RAID6"); + written += 7; } total_bytes = pretty_sizes(sargs->spaces[i].total_bytes); diff --git a/convert.c b/convert.c index 1de2a441..2b3f42f2 100644 --- a/convert.c +++ b/convert.c @@ -2430,7 +2430,7 @@ static int may_rollback(struct btrfs_root *root) while (1) { ret = btrfs_map_block(&info->mapping_tree, WRITE, bytenr, - &length, &multi, 0); + &length, &multi, 0, NULL); if (ret) goto fail; diff --git a/ctree.h b/ctree.h index 06759896..e82f4c9f 100644 --- a/ctree.h +++ b/ctree.h @@ -437,6 +437,7 @@ struct btrfs_super_block { * code was pretty buggy. Lets not let them try anymore. 
*/ #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) +#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL @@ -446,6 +447,7 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) /* @@ -779,6 +781,8 @@ struct btrfs_csum_item { #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE /* used in struct btrfs_balance_args fields */ diff --git a/disk-io.c b/disk-io.c index f00fe4e7..8b369050 100644 --- a/disk-io.c +++ b/disk-io.c @@ -89,8 +89,8 @@ int csum_tree_block_size(struct extent_buffer *buf, u16 csum_size, if (verify) { if (memcmp_extent_buffer(buf, result, 0, csum_size)) { - printk("checksum verify failed on %llu wanted %X " - "found %X\n", (unsigned long long)buf->start, + printk("checksum verify failed on %llu found %X " + "wanted %X\n", (unsigned long long)buf->start, *((int *)result), *((char *)buf->data)); free(result); return 1; @@ -141,7 +141,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, length = blocksize; ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - bytenr, &length, &multi, 0); + bytenr, &length, &multi, 0, NULL); BUG_ON(ret); device = multi->stripes[0].dev; device->total_ios++; @@ -182,15 +182,52 @@ out: } +static int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, int mirror) +{ + unsigned long offset = 0; + struct btrfs_multi_bio *multi = NULL; + struct btrfs_device *device; + int ret = 0; + u64 read_len; + unsigned long bytes_left = eb->len; + + while (bytes_left) { + read_len = bytes_left; + ret = btrfs_map_block(&info->mapping_tree, READ, + 
eb->start + offset, &read_len, &multi, + mirror, NULL); + if (ret) { + printk("Couldn't map the block %Lu\n", eb->start + offset); + return -EIO; + } + device = multi->stripes[0].dev; + if (device->fd == 0) { + kfree(multi); /* release the mapping before bailing out so it is not leaked */ + return -EIO; + } + eb->fd = device->fd; + device->total_ios++; + eb->dev_bytenr = multi->stripes[0].physical; + kfree(multi); + + if (read_len > bytes_left) + read_len = bytes_left; + + ret = read_extent_from_disk(eb, offset, read_len); + if (ret) + return -EIO; + offset += read_len; + bytes_left -= read_len; + } + return 0; +} + struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid) { int ret; struct extent_buffer *eb; - u64 length; u64 best_transid = 0; - struct btrfs_multi_bio *multi = NULL; - struct btrfs_device *device; int mirror_num = 0; int good_mirror = 0; int num_copies; @@ -203,21 +240,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, if (btrfs_buffer_uptodate(eb, parent_transid)) return eb; - length = blocksize; while (1) { - ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - eb->start, &length, &multi, mirror_num); - if (ret) { - printk("Couldn't map the block %Lu\n", bytenr); - break; - } - device = multi->stripes[0].dev; - eb->fd = device->fd; - device->total_ios++; - eb->dev_bytenr = multi->stripes[0].physical; - kfree(multi); - ret = read_extent_from_disk(eb); - + ret = read_whole_eb(root->fs_info, eb, mirror_num); if (ret == 0 && check_tree_block(root, eb) == 0 && csum_tree_block(root, eb, 1) == 0 && verify_parent_transid(eb->tree, eb, parent_transid, ignore) @@ -253,12 +277,156 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, return NULL; } +static int rmw_eb(struct btrfs_fs_info *info, + struct extent_buffer *eb, struct extent_buffer *orig_eb) +{ + int ret; + unsigned long orig_off = 0; + unsigned long dest_off = 0; + unsigned long copy_len = eb->len; + + ret = read_whole_eb(info, eb, 0); + if (ret) + return ret; + + if
(eb->start + eb->len <= orig_eb->start || + eb->start >= orig_eb->start + orig_eb->len) + return 0; + /* + * | ----- orig_eb ------- | + * | ----- stripe ------- | + * | ----- orig_eb ------- | + * | ----- orig_eb ------- | + */ + if (eb->start > orig_eb->start) + orig_off = eb->start - orig_eb->start; + if (orig_eb->start > eb->start) + dest_off = orig_eb->start - eb->start; + + if (copy_len > orig_eb->len - orig_off) + copy_len = orig_eb->len - orig_off; + if (copy_len > eb->len - dest_off) + copy_len = eb->len - dest_off; + + memcpy(eb->data + dest_off, orig_eb->data + orig_off, copy_len); + return 0; +} + +static void split_eb_for_raid56(struct btrfs_fs_info *info, + struct extent_buffer *orig_eb, + struct extent_buffer **ebs, + u64 stripe_len, u64 *raid_map, + int num_stripes) +{ + struct extent_buffer *eb; + u64 start = orig_eb->start; + u64 this_eb_start; + int i; + int ret; + + for (i = 0; i < num_stripes; i++) { + if (raid_map[i] >= BTRFS_RAID5_P_STRIPE) + break; + + eb = malloc(sizeof(struct extent_buffer) + stripe_len); + if (!eb) + BUG(); + memset(eb, 0, sizeof(struct extent_buffer) + stripe_len); + + eb->start = raid_map[i]; + eb->len = stripe_len; + eb->refs = 1; + eb->flags = 0; + eb->fd = -1; + eb->dev_bytenr = (u64)-1; + + this_eb_start = raid_map[i]; + + if (start > this_eb_start || + start + orig_eb->len < this_eb_start + stripe_len) { + ret = rmw_eb(info, eb, orig_eb); + BUG_ON(ret); + } else { + memcpy(eb->data, orig_eb->data + eb->start - start, stripe_len); + } + ebs[i] = eb; + } +} + +static int write_raid56_with_parity(struct btrfs_fs_info *info, + struct extent_buffer *eb, + struct btrfs_multi_bio *multi, + u64 stripe_len, u64 *raid_map) +{ + struct extent_buffer *ebs[multi->num_stripes], *p_eb = NULL, *q_eb = NULL; + int i; + int j; + int ret; + int alloc_size = eb->len; + + if (stripe_len > alloc_size) + alloc_size = stripe_len; + + split_eb_for_raid56(info, eb, ebs, stripe_len, raid_map, + multi->num_stripes); + + for (i = 0; i < 
multi->num_stripes; i++) { + struct extent_buffer *new_eb; + if (raid_map[i] < BTRFS_RAID5_P_STRIPE) { + ebs[i]->dev_bytenr = multi->stripes[i].physical; + ebs[i]->fd = multi->stripes[i].dev->fd; + multi->stripes[i].dev->total_ios++; + BUG_ON(ebs[i]->start != raid_map[i]); + continue; + } + new_eb = kmalloc(sizeof(*eb) + alloc_size, GFP_NOFS); + BUG_ON(!new_eb); + new_eb->dev_bytenr = multi->stripes[i].physical; + new_eb->fd = multi->stripes[i].dev->fd; + multi->stripes[i].dev->total_ios++; + new_eb->len = stripe_len; + + if (raid_map[i] == BTRFS_RAID5_P_STRIPE) + p_eb = new_eb; + else if (raid_map[i] == BTRFS_RAID6_Q_STRIPE) + q_eb = new_eb; + } + if (q_eb) { + void *pointers[multi->num_stripes]; + ebs[multi->num_stripes - 2] = p_eb; + ebs[multi->num_stripes - 1] = q_eb; + + for (i = 0; i < multi->num_stripes; i++) + pointers[i] = ebs[i]->data; + + raid6_gen_syndrome(multi->num_stripes, stripe_len, pointers); + } else { + ebs[multi->num_stripes - 1] = p_eb; + memcpy(p_eb->data, ebs[0]->data, stripe_len); + for (j = 1; j < multi->num_stripes - 1; j++) { + for (i = 0; i < stripe_len; i += sizeof(unsigned long)) { + *(unsigned long *)(p_eb->data + i) ^= + *(unsigned long *)(ebs[j]->data + i); + } + } + } + + for (i = 0; i < multi->num_stripes; i++) { + ret = write_extent_to_disk(ebs[i]); + BUG_ON(ret); + if (ebs[i] != eb) + kfree(ebs[i]); + } + return 0; +} + int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) { int ret; int dev_nr; u64 length; + u64 *raid_map = NULL; struct btrfs_multi_bio *multi = NULL; if (check_tree_block(root, eb)) @@ -272,9 +440,13 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, dev_nr = 0; length = eb->len; ret = btrfs_map_block(&root->fs_info->mapping_tree, WRITE, - eb->start, &length, &multi, 0); + eb->start, &length, &multi, 0, &raid_map); - while(dev_nr < multi->num_stripes) { + if (raid_map) { + ret = write_raid56_with_parity(root->fs_info, eb, multi, 
+ length, raid_map); + BUG_ON(ret); + } else while (dev_nr < multi->num_stripes) { BUG_ON(ret); eb->fd = multi->stripes[dev_nr].dev->fd; eb->dev_bytenr = multi->stripes[dev_nr].physical; diff --git a/disk-io.h b/disk-io.h index 53e9b17a..53ef0238 100644 --- a/disk-io.h +++ b/disk-io.h @@ -82,3 +82,6 @@ int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); #endif + +/* raid6.c */ +void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs); diff --git a/extent-tree.c b/extent-tree.c index c7121171..7fb77019 100644 --- a/extent-tree.c +++ b/extent-tree.c @@ -1762,6 +1762,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_DUP); if (extra_flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) diff --git a/extent_io.c b/extent_io.c index ebb35b28..45fa6bf6 100644 --- a/extent_io.c +++ b/extent_io.c @@ -663,13 +663,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, return eb; } -int read_extent_from_disk(struct extent_buffer *eb) +int read_extent_from_disk(struct extent_buffer *eb, + unsigned long offset, unsigned long len) { int ret; - ret = pread(eb->fd, eb->data, eb->len, eb->dev_bytenr); + ret = pread(eb->fd, eb->data + offset, len, eb->dev_bytenr); if (ret < 0) goto out; - if (ret != eb->len) { + if (ret != len) { ret = -EIO; goto out; } diff --git a/extent_io.h b/extent_io.h index a5d6bf0e..5df44f7d 100644 --- a/extent_io.h +++ b/extent_io.h @@ -95,7 +95,8 @@ struct extent_buffer *find_first_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, u64 bytenr, u32 blocksize); void free_extent_buffer(struct extent_buffer *eb); -int read_extent_from_disk(struct extent_buffer *eb); +int 
read_extent_from_disk(struct extent_buffer *eb, + unsigned long offset, unsigned long len); int write_extent_to_disk(struct extent_buffer *eb); int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len); diff --git a/find-root.c b/find-root.c index c0f38b88..43ac0b7e 100644 --- a/find-root.c +++ b/find-root.c @@ -378,7 +378,8 @@ static int find_root(struct btrfs_root *root) offset = metadata_offset; } err = __btrfs_map_block(&root->fs_info->mapping_tree, READ, - offset, &map_length, &type, &multi, 0); + offset, &map_length, &type, + &multi, 0, NULL); if (err) { offset += map_length; continue; diff --git a/mkfs.c b/mkfs.c index fbf83196..7cd0c23c 100644 --- a/mkfs.c +++ b/mkfs.c @@ -207,7 +207,8 @@ static int create_raid_groups(struct btrfs_trans_handle *trans, int metadata_profile_opt, int mixed) { u64 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy); - u64 allowed; + u64 allowed = 0; + u64 devices_for_raid = num_devices; int ret; /* @@ -223,13 +224,22 @@ static int create_raid_groups(struct btrfs_trans_handle *trans, BTRFS_BLOCK_GROUP_RAID0 : 0; /* raid0 or single */ } - if (num_devices == 1) - allowed = BTRFS_BLOCK_GROUP_DUP; - else if (num_devices >= 4) { - allowed = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10; - } else - allowed = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1; + if (devices_for_raid > 4) + devices_for_raid = 4; + + switch (devices_for_raid) { + default: + case 4: + allowed |= BTRFS_BLOCK_GROUP_RAID10; + case 3: + allowed |= BTRFS_BLOCK_GROUP_RAID6; + case 2: + allowed |= BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5; + break; + case 1: + allowed |= BTRFS_BLOCK_GROUP_DUP; + } if (metadata_profile & ~allowed) { fprintf(stderr, "unable to create FS with metadata " @@ -336,6 +346,10 @@ static u64 parse_profile(char *s) return BTRFS_BLOCK_GROUP_RAID0; } else if (strcmp(s, "raid1") == 0) { return 
BTRFS_BLOCK_GROUP_RAID1; + } else if (strcmp(s, "raid5") == 0) { + return BTRFS_BLOCK_GROUP_RAID5; + } else if (strcmp(s, "raid6") == 0) { + return BTRFS_BLOCK_GROUP_RAID6; } else if (strcmp(s, "raid10") == 0) { return BTRFS_BLOCK_GROUP_RAID10; } else if (strcmp(s, "dup") == 0) { @@ -1438,6 +1452,16 @@ raid_groups: btrfs_set_super_incompat_flags(super, flags); } + if ((data_profile | metadata_profile) & + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + struct btrfs_super_block *super = &root->fs_info->super_copy; + u64 flags = btrfs_super_incompat_flags(super); + + flags |= BTRFS_FEATURE_INCOMPAT_RAID56; + btrfs_set_super_incompat_flags(super, flags); + printf("Setting RAID5/6 feature flag\n"); + } + printf("fs created label %s on %s\n\tnodesize %u leafsize %u " "sectorsize %u size %s\n", label, first_file, nodesize, leafsize, sectorsize, diff --git a/raid6.c b/raid6.c new file mode 100644 index 00000000..3a42bdf0 --- /dev/null +++ b/raid6.c @@ -0,0 +1,97 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6int1.c + * + * 1-way unrolled portable integer math RAID-6 instruction set + * + * This file was postprocessed using unroll.pl and then ported to userspace + */ +#include <stdint.h> +#include <unistd.h> +/* + * This is the C data type to use + */ + +/* Change this from BITS_PER_LONG if there is something better...
*/ +#if BITS_PER_LONG == 64 +# define NBYTES(x) ((x) * 0x0101010101010101UL) +# define NSIZE 8 +# define NSHIFT 3 +typedef uint64_t unative_t; +#else +# define NBYTES(x) ((x) * 0x01010101U) +# define NSIZE 4 +# define NSHIFT 2 +typedef uint32_t unative_t; +#endif + +/* + * These sub-operations are separate inlines since they can sometimes be + * specially optimized using architecture-specific hacks. + */ + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ + unative_t vv; + + vv = (v << 1) & NBYTES(0xfe); + return vv; +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ + unative_t vv; + + vv = v & NBYTES(0x80); + vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */ + return vv; +} + + +void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + uint8_t **dptr = (uint8_t **)ptrs; + uint8_t *p, *q; + int d, z, z0; + + unative_t wd0, wq0, wp0, w10, w20; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*1 ) { + wq0 = wp0 = *(unative_t *)&dptr[z0][d+0*NSIZE]; + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd0 = *(unative_t *)&dptr[z][d+0*NSIZE]; + wp0 ^= wd0; + w20 = MASK(wq0); + w10 = SHLBYTE(wq0); + w20 &= NBYTES(0x1d); + w10 ^= w20; + wq0 = w10 ^ wd0; + } + *(unative_t *)&p[d+NSIZE*0] = wp0; + *(unative_t *)&q[d+NSIZE*0] = wq0; + } +} + diff --git a/restore.c b/restore.c index 80afb843..cd85092d 100644 --- a/restore.c +++ b/restore.c @@ -228,7 +228,7 @@ static int copy_one_extent(struct btrfs_root *root, int fd, again: length = size_left; ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - bytenr, &length, &multi, 0); + bytenr, &length, &multi, 0, NULL); if (ret) { 
free(inbuf); free(outbuf); diff --git a/volumes.c b/volumes.c index f3371b22..9df3efef 100644 --- a/volumes.c +++ b/volumes.c @@ -35,6 +35,23 @@ struct stripe { u64 physical; }; +static inline int nr_parity_stripes(struct map_lookup *map) +{ + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 1; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return 2; + else + return 0; +} + +static inline int nr_data_stripes(struct map_lookup *map) +{ + return map->num_stripes - nr_parity_stripes(map); +} + +#define is_parity_stripe(x) ( ((x) == BTRFS_RAID5_P_STRIPE) || ((x) == BTRFS_RAID6_Q_STRIPE) ) + #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) @@ -620,11 +637,21 @@ static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes, return calc_size; else if (type & BTRFS_BLOCK_GROUP_RAID10) return calc_size * (num_stripes / sub_stripes); + else if (type & BTRFS_BLOCK_GROUP_RAID5) + return calc_size * (num_stripes - 1); + else if (type & BTRFS_BLOCK_GROUP_RAID6) + return calc_size * (num_stripes - 2); else return calc_size * num_stripes; } +static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) +{ + /* TODO, add a way to store the preferred stripe size */ + return 64 * 1024; +} + int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 *start, u64 *num_bytes, u64 type) @@ -661,6 +688,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } if (type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP)) { if (type & BTRFS_BLOCK_GROUP_SYSTEM) { @@ -700,6 +728,22 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sub_stripes = 2; min_stripes = 4; } + if (type & (BTRFS_BLOCK_GROUP_RAID5)) { + num_stripes = btrfs_super_num_devices(&info->super_copy); + if (num_stripes < 2) + return -ENOSPC; + min_stripes = 2; + stripe_len = 
find_raid56_stripe_len(num_stripes - 1, + btrfs_super_stripesize(&info->super_copy)); + } + if (type & (BTRFS_BLOCK_GROUP_RAID6)) { + num_stripes = btrfs_super_num_devices(&info->super_copy); + if (num_stripes < 3) + return -ENOSPC; + min_stripes = 3; + stripe_len = find_raid56_stripe_len(num_stripes - 2, + btrfs_super_stripesize(&info->super_copy)); + } /* we don't want a chunk larger than 10% of the FS */ percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1); @@ -976,6 +1020,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) + ret = 2; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + ret = 3; else ret = 1; return ret; @@ -1015,6 +1063,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 bytenr; u64 length; u64 stripe_nr; + u64 rmap_len; int i, j, nr = 0; ce = find_first_cache_extent(&map_tree->cache_tree, chunk_start); @@ -1022,10 +1071,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, map = container_of(ce, struct map_lookup, ce); length = ce->size; + rmap_len = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID10) length = ce->size / (map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) length = ce->size / map->num_stripes; + else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + length = ce->size / nr_data_stripes(map); + rmap_len = map->stripe_len * nr_data_stripes(map); + } buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); @@ -1044,8 +1099,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, map->sub_stripes; } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; - } - bytenr = ce->start + stripe_nr * map->stripe_len; + } /* else if RAID[56], multiply by nr_data_stripes(). 
+ * Alternatively, just use rmap_len below instead of + * map->stripe_len */ + + bytenr = ce->start + stripe_nr * rmap_len; for (j = 0; j < nr; j++) { if (buf[j] == bytenr) break; @@ -1056,28 +1114,60 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, *logical = buf; *naddrs = nr; - *stripe_len = map->stripe_len; + *stripe_len = rmap_len; return 0; } +static inline int parity_smaller(u64 a, u64 b) +{ + return a > b; +} + +/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +static void sort_parity_stripes(struct btrfs_multi_bio *bbio, u64 *raid_map) +{ + struct btrfs_bio_stripe s; + int i; + u64 l; + int again = 1; + + while (again) { + again = 0; + for (i = 0; i < bbio->num_stripes - 1; i++) { + if (parity_smaller(raid_map[i], raid_map[i+1])) { + s = bbio->stripes[i]; + l = raid_map[i]; + bbio->stripes[i] = bbio->stripes[i+1]; + raid_map[i] = raid_map[i+1]; + bbio->stripes[i+1] = s; + raid_map[i+1] = l; + again = 1; + } + } + } +} + int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_multi_bio **multi_ret, int mirror_num, + u64 **raid_map_ret) { return __btrfs_map_block(map_tree, rw, logical, length, NULL, - multi_ret, mirror_num); + multi_ret, mirror_num, raid_map_ret); } int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, u64 *type, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_multi_bio **multi_ret, int mirror_num, + u64 **raid_map_ret) { struct cache_extent *ce; struct map_lookup *map; u64 offset; u64 stripe_offset; u64 stripe_nr; + u64 *raid_map = NULL; int stripes_allocated = 8; int stripes_required = 1; int stripe_index; @@ -1117,10 +1207,24 @@ again: stripes_required = map->sub_stripes; } } + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) + && multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) { + /* RAID[56] write or 
recovery. Return all stripes */ + stripes_required = map->num_stripes; + + /* Only allocate the map if we've already got a large enough multi_ret */ + if (stripes_allocated >= stripes_required) { + raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + if (!raid_map) { + kfree(multi); + return -ENOMEM; + } + } + } + /* if our multi bio struct is too small, back off and try again */ - if (multi_ret && rw == WRITE && - stripes_allocated < stripes_required) { - stripes_allocated = map->num_stripes; + if (multi_ret && stripes_allocated < stripes_required) { + stripes_allocated = stripes_required; kfree(multi); goto again; } @@ -1138,6 +1242,7 @@ again: stripe_offset = offset - stripe_offset; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP)) { /* we limit the length of each bio to what fits in a stripe */ @@ -1176,6 +1281,59 @@ again: multi->num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + + if (raid_map) { + int i, rot; + u64 tmp; + u64 raid56_full_stripe_start; + u64 full_stripe_len = nr_data_stripes(map) * map->stripe_len; + + /* + * align the start of our data stripe in the logical + * address space + */ + raid56_full_stripe_start = offset / full_stripe_len; + raid56_full_stripe_start *= full_stripe_len; + + /* get the data stripe number */ + stripe_nr = raid56_full_stripe_start / map->stripe_len; + stripe_nr = stripe_nr / nr_data_stripes(map); + + /* Work out the disk rotation on this stripe-set */ + rot = stripe_nr % map->num_stripes; + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + + for (i = 0; i < nr_data_stripes(map); i++) + raid_map[(i+rot) % map->num_stripes] = + ce->start + (tmp + i) * map->stripe_len; + + raid_map[(i+rot) % map->num_stripes] = 
BTRFS_RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + raid_map[(i+rot+1) % map->num_stripes] = BTRFS_RAID6_Q_STRIPE; + + *length = map->stripe_len; + stripe_index = 0; + stripe_offset = 0; + multi->num_stripes = map->num_stripes; + } else { + stripe_index = stripe_nr % nr_data_stripes(map); + stripe_nr = stripe_nr / nr_data_stripes(map); + + /* + * Mirror #0 or #1 means the original data block. + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. + */ + if (mirror_num > 1) + stripe_index = nr_data_stripes(map) + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ + stripe_index = (stripe_nr + stripe_index) % map->num_stripes; + } } else { /* * after this do_div call, stripe_nr is the number of stripes @@ -1195,8 +1353,14 @@ again: stripe_index++; } *multi_ret = multi; + if (type) *type = map->type; + + if (raid_map) { + sort_parity_stripes(multi, raid_map); + *raid_map_ret = raid_map; + } out: return 0; } diff --git a/volumes.h b/volumes.h index 9ff6182e..59d00b6d 100644 --- a/volumes.h +++ b/volumes.h @@ -135,6 +135,10 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) #define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) +#define BTRFS_RAID5_P_STRIPE ((u64)-2) +#define BTRFS_RAID6_Q_STRIPE ((u64)-1) + + int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 chunk_tree, u64 chunk_objectid, @@ -142,10 +146,12 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *start); int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, u64 *type, - struct btrfs_multi_bio **multi_ret, int mirror_num); + struct btrfs_multi_bio **multi_ret, int mirror_num, + u64 **raid_map); int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num); + struct btrfs_multi_bio **multi_ret, int mirror_num, + u64 **raid_map_ret); int 
btrfs_next_metadata(struct btrfs_mapping_tree *map_tree, u64 *logical, u64 *size); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,