Skip to content

Commit 53b381b

Browse files
David Woodhouse, Chris Mason
authored and committed
Btrfs: RAID5 and RAID6
This builds on David Woodhouse's original Btrfs raid5/6 implementation. The code has changed quite a bit; blame Chris Mason for any bugs. Read/modify/write is done after the higher levels of the filesystem have prepared a given bio. This means the higher layers are not responsible for building full stripes, and they don't need to query for the topology of the extents that may get allocated during delayed allocation runs. It also means different files can easily share the same stripe. However, it does expose us to incorrect parity if we crash or lose power while doing a read/modify/write cycle. This will be addressed in a later commit. Scrub is unable to repair crc errors on raid5/6 chunks. Discard does not work on raid5/6 (yet). The stripe size is fixed at 64KiB per disk. This will be tunable in a later commit. Signed-off-by: Chris Mason <[email protected]>
1 parent 64a1670 commit 53b381b

File tree

15 files changed

+2283
-102
lines changed

15 files changed

+2283
-102
lines changed

fs/btrfs/Kconfig

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ config BTRFS_FS
66
select ZLIB_DEFLATE
77
select LZO_COMPRESS
88
select LZO_DECOMPRESS
9+
select RAID6_PQ
10+
911
help
1012
Btrfs is a new filesystem with extents, writable snapshotting,
1113
support for multiple devices and many more features.

fs/btrfs/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
88
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
99
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
1010
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11-
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
11+
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
1212

1313
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
1414
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o

fs/btrfs/ctree.h

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,7 @@ struct btrfs_super_block {
502502
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
503503

504504
#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
505+
#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
505506

506507
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
507508
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -511,6 +512,7 @@ struct btrfs_super_block {
511512
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
512513
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
513514
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
515+
BTRFS_FEATURE_INCOMPAT_RAID56 | \
514516
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
515517

516518
/*
@@ -952,15 +954,19 @@ struct btrfs_dev_replace_item {
952954
#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
953955
#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
954956
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
957+
#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) /* 64-bit constant, like the other block group profile bits */
958+
#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) /* 64-bit constant, like the other block group profile bits */
955959
#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
956-
#define BTRFS_NR_RAID_TYPES 5
960+
#define BTRFS_NR_RAID_TYPES 7
957961

958962
#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
959963
BTRFS_BLOCK_GROUP_SYSTEM | \
960964
BTRFS_BLOCK_GROUP_METADATA)
961965

962966
#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
963967
BTRFS_BLOCK_GROUP_RAID1 | \
968+
BTRFS_BLOCK_GROUP_RAID5 | \
969+
BTRFS_BLOCK_GROUP_RAID6 | \
964970
BTRFS_BLOCK_GROUP_DUP | \
965971
BTRFS_BLOCK_GROUP_RAID10)
966972
/*
@@ -1185,6 +1191,10 @@ struct btrfs_block_group_cache {
11851191
u64 flags;
11861192
u64 sectorsize;
11871193
u64 cache_generation;
1194+
1195+
/* for raid56, this is a full stripe, without parity */
1196+
unsigned long full_stripe_len;
1197+
11881198
unsigned int ro:1;
11891199
unsigned int dirty:1;
11901200
unsigned int iref:1;
@@ -1225,6 +1235,20 @@ struct seq_list {
12251235
u64 seq;
12261236
};
12271237

1238+
/* one bucket of the stripe hash table; used by the raid56 code to lock stripes for read/modify/write */
1239+
struct btrfs_stripe_hash {
1240+
struct list_head hash_list; /* NOTE(review): presumably the stripes hashed to this bucket -- confirm in raid56.c */
1241+
wait_queue_head_t wait; /* NOTE(review): presumably where waiters for a locked stripe sleep -- confirm */
1242+
spinlock_t lock; /* NOTE(review): presumably protects hash_list -- confirm */
1243+
};
1244+
1245+
/* container for the stripe hash buckets the raid56 code uses to lock stripes for read/modify/write */
1246+
struct btrfs_stripe_hash_table {
1247+
struct btrfs_stripe_hash *table; /* NOTE(review): presumably an array of (1 << BTRFS_STRIPE_HASH_TABLE_BITS) buckets -- confirm against the allocator */
1248+
};
1249+
1250+
#define BTRFS_STRIPE_HASH_TABLE_BITS 11
1251+
12281252
/* fs_info */
12291253
struct reloc_control;
12301254
struct btrfs_device;
@@ -1307,6 +1331,13 @@ struct btrfs_fs_info {
13071331
struct mutex cleaner_mutex;
13081332
struct mutex chunk_mutex;
13091333
struct mutex volume_mutex;
1334+
1335+
/* this is used during read/modify/write to make sure
1336+
* no two ios are trying to mod the same stripe at the same
1337+
* time
1338+
*/
1339+
struct btrfs_stripe_hash_table *stripe_hash_table;
1340+
13101341
/*
13111342
* this protects the ordered operations list only while we are
13121343
* processing all of the entries on it. This way we make
@@ -1395,6 +1426,8 @@ struct btrfs_fs_info {
13951426
struct btrfs_workers flush_workers;
13961427
struct btrfs_workers endio_workers;
13971428
struct btrfs_workers endio_meta_workers;
1429+
struct btrfs_workers endio_raid56_workers;
1430+
struct btrfs_workers rmw_workers;
13981431
struct btrfs_workers endio_meta_write_workers;
13991432
struct btrfs_workers endio_write_workers;
14001433
struct btrfs_workers endio_freespace_worker;

fs/btrfs/disk-io.c

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "check-integrity.h"
4747
#include "rcu-string.h"
4848
#include "dev-replace.h"
49+
#include "raid56.h"
4950

5051
#ifdef CONFIG_X86
5152
#include <asm/cpufeature.h>
@@ -639,8 +640,15 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
639640
btree_readahead_hook(root, eb, eb->start, ret);
640641
}
641642

642-
if (ret)
643+
if (ret) {
644+
/*
645+
* our io error hook is going to dec the io pages
646+
* again, we have to make sure it has something
647+
* to decrement
648+
*/
649+
atomic_inc(&eb->io_pages);
643650
clear_extent_buffer_uptodate(eb);
651+
}
644652
free_extent_buffer(eb);
645653
out:
646654
return ret;
@@ -654,6 +662,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
654662
eb = (struct extent_buffer *)page->private;
655663
set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
656664
eb->read_mirror = failed_mirror;
665+
atomic_dec(&eb->io_pages);
657666
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
658667
btree_readahead_hook(root, eb, eb->start, -EIO);
659668
return -EIO; /* we fixed nothing */
@@ -670,17 +679,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
670679
end_io_wq->work.flags = 0;
671680

672681
if (bio->bi_rw & REQ_WRITE) {
673-
if (end_io_wq->metadata == 1)
682+
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
674683
btrfs_queue_worker(&fs_info->endio_meta_write_workers,
675684
&end_io_wq->work);
676-
else if (end_io_wq->metadata == 2)
685+
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
677686
btrfs_queue_worker(&fs_info->endio_freespace_worker,
678687
&end_io_wq->work);
688+
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
689+
btrfs_queue_worker(&fs_info->endio_raid56_workers,
690+
&end_io_wq->work);
679691
else
680692
btrfs_queue_worker(&fs_info->endio_write_workers,
681693
&end_io_wq->work);
682694
} else {
683-
if (end_io_wq->metadata)
695+
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
696+
btrfs_queue_worker(&fs_info->endio_raid56_workers,
697+
&end_io_wq->work);
698+
else if (end_io_wq->metadata)
684699
btrfs_queue_worker(&fs_info->endio_meta_workers,
685700
&end_io_wq->work);
686701
else
@@ -695,6 +710,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
695710
* 0 - if data
696711
* 1 - if normal metadata
697712
* 2 - if writing to the free space cache area
713+
* 3 - raid parity work
698714
*/
699715
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
700716
int metadata)
@@ -2165,6 +2181,12 @@ int open_ctree(struct super_block *sb,
21652181
init_waitqueue_head(&fs_info->transaction_blocked_wait);
21662182
init_waitqueue_head(&fs_info->async_submit_wait);
21672183

2184+
ret = btrfs_alloc_stripe_hash_table(fs_info);
2185+
if (ret) {
2186+
err = -ENOMEM;
2187+
goto fail_alloc;
2188+
}
2189+
21682190
__setup_root(4096, 4096, 4096, 4096, tree_root,
21692191
fs_info, BTRFS_ROOT_TREE_OBJECTID);
21702192

@@ -2332,6 +2354,12 @@ int open_ctree(struct super_block *sb,
23322354
btrfs_init_workers(&fs_info->endio_meta_write_workers,
23332355
"endio-meta-write", fs_info->thread_pool_size,
23342356
&fs_info->generic_worker);
2357+
btrfs_init_workers(&fs_info->endio_raid56_workers,
2358+
"endio-raid56", fs_info->thread_pool_size,
2359+
&fs_info->generic_worker);
2360+
btrfs_init_workers(&fs_info->rmw_workers,
2361+
"rmw", fs_info->thread_pool_size,
2362+
&fs_info->generic_worker);
23352363
btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
23362364
fs_info->thread_pool_size,
23372365
&fs_info->generic_worker);
@@ -2350,6 +2378,8 @@ int open_ctree(struct super_block *sb,
23502378
*/
23512379
fs_info->endio_workers.idle_thresh = 4;
23522380
fs_info->endio_meta_workers.idle_thresh = 4;
2381+
fs_info->endio_raid56_workers.idle_thresh = 4;
2382+
fs_info->rmw_workers.idle_thresh = 2;
23532383

23542384
fs_info->endio_write_workers.idle_thresh = 2;
23552385
fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2396,8 @@ int open_ctree(struct super_block *sb,
23662396
ret |= btrfs_start_workers(&fs_info->fixup_workers);
23672397
ret |= btrfs_start_workers(&fs_info->endio_workers);
23682398
ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2399+
ret |= btrfs_start_workers(&fs_info->rmw_workers);
2400+
ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
23692401
ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
23702402
ret |= btrfs_start_workers(&fs_info->endio_write_workers);
23712403
ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2710,6 +2742,8 @@ int open_ctree(struct super_block *sb,
27102742
btrfs_stop_workers(&fs_info->workers);
27112743
btrfs_stop_workers(&fs_info->endio_workers);
27122744
btrfs_stop_workers(&fs_info->endio_meta_workers);
2745+
btrfs_stop_workers(&fs_info->endio_raid56_workers);
2746+
btrfs_stop_workers(&fs_info->rmw_workers);
27132747
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
27142748
btrfs_stop_workers(&fs_info->endio_write_workers);
27152749
btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2728,6 +2762,7 @@ int open_ctree(struct super_block *sb,
27282762
fail_srcu:
27292763
cleanup_srcu_struct(&fs_info->subvol_srcu);
27302764
fail:
2765+
btrfs_free_stripe_hash_table(fs_info);
27312766
btrfs_close_devices(fs_info->fs_devices);
27322767
return err;
27332768

@@ -3076,11 +3111,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
30763111
((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
30773112
== 0)))
30783113
num_tolerated_disk_barrier_failures = 0;
3079-
else if (num_tolerated_disk_barrier_failures > 1
3080-
&&
3081-
(flags & (BTRFS_BLOCK_GROUP_RAID1 |
3082-
BTRFS_BLOCK_GROUP_RAID10)))
3083-
num_tolerated_disk_barrier_failures = 1;
3114+
else if (num_tolerated_disk_barrier_failures > 1) {
3115+
if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3116+
BTRFS_BLOCK_GROUP_RAID5 |
3117+
BTRFS_BLOCK_GROUP_RAID10)) {
3118+
num_tolerated_disk_barrier_failures = 1;
3119+
} else if (flags &
3120+
BTRFS_BLOCK_GROUP_RAID6) { /* RAID6, not RAID5: RAID5 is already matched by the branch above, which made this branch unreachable */
3121+
num_tolerated_disk_barrier_failures = 2;
3122+
}
3123+
}
30843124
}
30853125
}
30863126
up_read(&sinfo->groups_sem);
@@ -3384,6 +3424,8 @@ int close_ctree(struct btrfs_root *root)
33843424
btrfs_stop_workers(&fs_info->workers);
33853425
btrfs_stop_workers(&fs_info->endio_workers);
33863426
btrfs_stop_workers(&fs_info->endio_meta_workers);
3427+
btrfs_stop_workers(&fs_info->endio_raid56_workers);
3428+
btrfs_stop_workers(&fs_info->rmw_workers);
33873429
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
33883430
btrfs_stop_workers(&fs_info->endio_write_workers);
33893431
btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3404,6 +3446,8 @@ int close_ctree(struct btrfs_root *root)
34043446
bdi_destroy(&fs_info->bdi);
34053447
cleanup_srcu_struct(&fs_info->subvol_srcu);
34063448

3449+
btrfs_free_stripe_hash_table(fs_info);
3450+
34073451
return 0;
34083452
}
34093453

fs/btrfs/disk-io.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,13 @@
2525
#define BTRFS_SUPER_MIRROR_MAX 3
2626
#define BTRFS_SUPER_MIRROR_SHIFT 12
2727

28+
enum { /* values for the "metadata" argument of btrfs_bio_wq_end_io(); end_workqueue_bio() uses them to pick the endio worker pool */
29+
BTRFS_WQ_ENDIO_DATA = 0, /* data */
30+
BTRFS_WQ_ENDIO_METADATA = 1, /* normal metadata */
31+
BTRFS_WQ_ENDIO_FREE_SPACE = 2, /* writing to the free space cache area */
32+
BTRFS_WQ_ENDIO_RAID56 = 3, /* raid parity work */
33+
};
34+
2835
static inline u64 btrfs_sb_offset(int mirror)
2936
{
3037
u64 start = 16 * 1024;

0 commit comments

Comments
 (0)