From ef740c372dfd80e706dbf955d4e4aedda6c0c148 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 31 Mar 2009 14:27:03 +1100
Subject: md: move headers out of include/linux/raid/

Move the headers with the local structures for the disciplines and
bitmap.h into drivers/md/ so that they are more easily grepable for
hacking and not far away.  md.h is left where it is for now as there
are some uses from the outside.

Signed-off-by: Christoph Hellwig
Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a5ba080d303..f75698b1f63 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -44,10 +44,9 @@
  */

 #include <linux/highmem.h>
-#include "raid6.h"
-
-#include <linux/raid/bitmap.h>
 #include <linux/async_tx.h>
+#include "raid6.h"
+#include "bitmap.h"

 /*
  * Stripe cache

From bff61975b3d6c18ee31457cc5b4d73042f44915f Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Mar 2009 14:33:13 +1100
Subject: md: move lots of #include lines out of .h files and into .c

This makes the includes more explicit, and is preparation for moving
md_k.h to drivers/md/md.h

Remove include/raid/md.h as its only remaining use was to #include
other files.

Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f75698b1f63..816157e7d8e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -43,8 +43,12 @@
  * miss any bits.
  */

+#include <linux/blkdev.h>
+#include <linux/raid/md_k.h>
 #include <linux/highmem.h>
 #include <linux/async_tx.h>
+#include <linux/seq_file.h>
+#include "raid5.h"
 #include "raid6.h"
 #include "bitmap.h"

@@ -1467,7 +1471,7 @@ static void copy_data(int frombio, struct bio *bio,
 static void compute_parity6(struct stripe_head *sh, int method)
 {
-	raid6_conf_t *conf = sh->raid_conf;
+	raid5_conf_t *conf = sh->raid_conf;
 	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
 	struct bio *chosen;
 	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
@@ -2795,7 +2799,7 @@ static bool handle_stripe5(struct stripe_head *sh)
 static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 {
-	raid6_conf_t *conf = sh->raid_conf;
+	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
 	struct bio *return_bi = NULL;
 	int i, pd_idx = sh->pd_idx;

From 43b2e5d86d8bdd77386226db0bc961529492c043 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Mar 2009 14:33:13 +1100
Subject: md: move md_k.h from include/linux/raid/ to drivers/md/

It really is nicer to keep related code together..

Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 816157e7d8e..849478e9afd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -44,10 +44,10 @@
  */

 #include <linux/blkdev.h>
-#include <linux/raid/md_k.h>
 #include <linux/highmem.h>
 #include <linux/async_tx.h>
 #include <linux/seq_file.h>
+#include "md.h"
 #include "raid5.h"
 #include "raid6.h"
 #include "bitmap.h"

From 58c0fed400603a802968b23ddf78f029c5a84e41 Mon Sep 17 00:00:00 2001
From: Andre Noll
Date: Tue, 31 Mar 2009 14:33:13 +1100
Subject: md: Make mddev->size sector-based.

This patch renames the "size" field of struct mddev_s to "dev_sectors"
and stores the number of 512-byte sectors instead of the number of
1K-blocks in it.

All users of that field, including raid levels 1,4-6,10, are adjusted
accordingly.
This simplifies the code a bit because it allows us to get rid of a
couple of divisions/multiplications by two.

In order to make checkpatch happy, some minor coding style issues
have also been addressed. In particular, size_store() now uses
strict_strtoull() instead of simple_strtoull().

Signed-off-by: Andre Noll
Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 849478e9afd..4d7142376e5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3629,8 +3629,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 			     *(new_data_disks) -1,
 			     raid_disks, data_disks,
 			     &dd_idx, &pd_idx, conf);
-	if (last_sector >= (mddev->size<<1))
-		last_sector = (mddev->size<<1)-1;
+	if (last_sector >= mddev->dev_sectors)
+		last_sector = mddev->dev_sectors - 1;
 	while (first_sector <= last_sector) {
 		pd_idx = stripe_to_pdidx(first_sector, conf,
 					 conf->previous_raid_disks);
@@ -3670,7 +3670,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	struct stripe_head *sh;
 	int pd_idx;
 	int raid_disks = conf->raid_disks;
-	sector_t max_sector = mddev->size << 1;
+	sector_t max_sector = mddev->dev_sectors;
 	int sync_blocks;
 	int still_degraded = 0;
 	int i;
@@ -3708,7 +3708,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	 */
 	if (mddev->degraded >= conf->max_degraded &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		sector_t rv = (mddev->size << 1) - sector_nr;
+		sector_t rv = mddev->dev_sectors - sector_nr;
 		*skipped = 1;
 		return rv;
 	}
@@ -4146,8 +4146,8 @@ static int run(mddev_t *mddev)
 		conf->expand_progress = mddev->reshape_position;

 	/* device size must be a multiple of chunk size */
-	mddev->size &= ~(mddev->chunk_size/1024 -1);
-	mddev->resync_max_sectors = mddev->size << 1;
+	mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1);
+	mddev->resync_max_sectors = mddev->dev_sectors;

 	if (conf->level == 6 && conf->raid_disks < 4) {
 		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
@@ -4254,8 +4254,8 @@ static int run(mddev_t *mddev)
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	mddev->queue->backing_dev_info.congested_fn = raid5_congested;

-	mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
-					    conf->max_degraded);
+	mddev->array_sectors = mddev->dev_sectors *
+		(conf->previous_raid_disks - conf->max_degraded);

 	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
@@ -4482,11 +4482,11 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 					    - conf->max_degraded);
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
-	if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
-		mddev->recovery_cp = mddev->size << 1;
+	if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
-	mddev->size = sectors /2;
+	mddev->dev_sectors = sectors;
 	mddev->resync_max_sectors = sectors;
 	return 0;
 }
@@ -4615,7 +4615,7 @@ static void end_reshape(raid5_conf_t *conf)
 	struct block_device *bdev;

 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-		conf->mddev->array_sectors = 2 * conf->mddev->size *
+		conf->mddev->array_sectors = conf->mddev->dev_sectors *
 			(conf->raid_disks - conf->max_degraded);
 		set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
 		conf->mddev->changed = 1;
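A quick user-space sketch of the unit change above — not part of any patch,
and the size/chunk values are invented for illustration. It shows why storing
sectors directly removes the "<< 1" and "/2" conversions scattered through
the old code:

	/* Illustrative only: old 1K-block count vs. new 512-byte
	 * sector count.  Names mirror the commit; not kernel code. */
	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t size_kb = 976762584ULL;     /* old mddev->size, 1K blocks */
		uint64_t dev_sectors = size_kb << 1; /* new mddev->dev_sectors */

		/* old code wrote "mddev->size << 1" wherever sectors were
		 * needed; the new field already holds sectors */
		assert(dev_sectors == size_kb * 2);

		/* rounding to whole chunks, as run() does:
		 * old: size &= ~(chunk_size/1024 - 1)  (on 1K blocks)
		 * new: dev_sectors &= ~(chunk_size/512 - 1)  (on sectors) */
		uint64_t chunk_size = 64 * 1024;     /* bytes */
		dev_sectors &= ~(chunk_size / 512 - 1);
		assert(dev_sectors % (chunk_size / 512) == 0);
		return 0;
	}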
From b5663ba405fe3e51176ddb6c91a5e186590c26b5 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Mar 2009 14:39:38 +1100
Subject: md/raid5: simplify interface for init_stripe and get_active_stripe

Rather than passing 'pd_idx' and 'disks' to these functions, just pass
'previous' which tells whether to use the 'previous' or 'current'
geometry during a reshape, and let init_stripe calculate disks and
pd_idx and anything else it might need.

This is not a substantial simplification and even adds a division.
However we will shortly be adding more complexity to init_stripe to
handle more interesting 'reshape' activities, and without this
change, the interface to these functions would get very complex.

Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4d7142376e5..c38310be0d9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -274,8 +274,9 @@ static int grow_buffers(struct stripe_head *sh, int num)
 }

 static void raid5_build_block(struct stripe_head *sh, int i);
+static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks);

-static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
+static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i;
@@ -290,11 +291,11 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int

 	remove_hash(sh);

+	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 	sh->sector = sector;
-	sh->pd_idx = pd_idx;
+	sh->pd_idx = stripe_to_pdidx(sector, conf, sh->disks);
 	sh->state = 0;

-	sh->disks = disks;

 	for (i = sh->disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
@@ -330,10 +331,12 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);

-static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
-					     int pd_idx, int noblock)
+static struct stripe_head *
+get_active_stripe(raid5_conf_t *conf, sector_t sector,
+		  int previous, int noblock)
 {
 	struct stripe_head *sh;
+	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;

 	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
@@ -361,7 +364,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 				);
 			conf->inactive_blocked = 0;
 		} else
-			init_stripe(sh, sector, pd_idx, disks);
+			init_stripe(sh, sector, previous);
 	} else {
 		if (atomic_read(&sh->count)) {
 			BUG_ON(!list_empty(&sh->lru));
@@ -2479,8 +2482,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 					conf->raid_disks -
 					conf->max_degraded, &dd_idx,
 					&pd_idx, conf);
-			sh2 = get_active_stripe(conf, s, conf->raid_disks,
-						pd_idx, 1);
+			sh2 = get_active_stripe(conf, s, 0, 1);
 			if (sh2 == NULL)
 				/* so far only the early blocks of this stripe
 				 * have been requested.  When later blocks
@@ -3413,8 +3415,10 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 		int disks, data_disks;
+		int previous;

 	retry:
+		previous = 0;
 		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 		if (likely(conf->expand_progress == MaxSector))
 			disks = conf->raid_disks;
@@ -3429,9 +3433,10 @@ static int make_request(struct request_queue *q, struct bio * bi)
 			 */
 			spin_lock_irq(&conf->device_lock);
 			disks = conf->raid_disks;
-			if (logical_sector >= conf->expand_progress)
+			if (logical_sector >= conf->expand_progress) {
 				disks = conf->previous_raid_disks;
-			else {
+				previous = 1;
+			} else {
 				if (logical_sector >= conf->expand_lo) {
 					spin_unlock_irq(&conf->device_lock);
 					schedule();
@@ -3448,7 +3453,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
 			(unsigned long long)new_sector,
 			(unsigned long long)logical_sector);

-		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
+		sh = get_active_stripe(conf, new_sector, previous,
+				       (bi->bi_rw&RWA_MASK));
 		if (sh) {
 			if (unlikely(conf->expand_progress != MaxSector)) {
 				/* expansion might have moved on while waiting for a
@@ -3582,9 +3588,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
 		int j;
 		int skipped = 0;
-		pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
-		sh = get_active_stripe(conf, sector_nr+i,
-				       conf->raid_disks, pd_idx, 0);
+		sh = get_active_stripe(conf, sector_nr+i, 0, 0);
 		set_bit(STRIPE_EXPANDING, &sh->state);
 		atomic_inc(&conf->reshape_stripes);
 		/* If any of this stripe is beyond the end of the old
@@ -3632,10 +3636,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	if (last_sector >= mddev->dev_sectors)
 		last_sector = mddev->dev_sectors - 1;
 	while (first_sector <= last_sector) {
-		pd_idx = stripe_to_pdidx(first_sector, conf,
-					 conf->previous_raid_disks);
-		sh = get_active_stripe(conf, first_sector,
-				       conf->previous_raid_disks, pd_idx, 0);
+		sh = get_active_stripe(conf, first_sector, 1, 0);
 		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 		set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
@@ -3725,9 +3726,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	bitmap_cond_end_sync(mddev->bitmap, sector_nr);

 	pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
-	sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
+	sh = get_active_stripe(conf, sector_nr, 0, 1);
 	if (sh == NULL) {
-		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
+		sh = get_active_stripe(conf, sector_nr, 0, 0);
 		/* make sure we don't swamp the stripe cache if someone else
 		 * is trying to get access
 		 */
@@ -3793,7 +3794,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 			/* already done this stripe */
 			continue;

-		sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
+		sh = get_active_stripe(conf, sector, 0, 1);

 		if (!sh) {
 			/* failed to get a stripe - must wait */
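The idea behind the 'previous' flag, as a small standalone C sketch (struct
and function names are simplified stand-ins, not the kernel's): during a
reshape the array has two geometries, and callers now say which one they
mean instead of each computing 'disks' and 'pd_idx' themselves.

	/* Sketch only: geometry selection centralised in one helper. */
	struct conf { int raid_disks; int previous_raid_disks; };

	static int geometry_disks(const struct conf *conf, int previous)
	{
		/* one authoritative place instead of every caller
		 * passing 'disks' and 'pd_idx' separately */
		return previous ? conf->previous_raid_disks
				: conf->raid_disks;
	}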
From 112bf8970dbdfc00bd4667da5996e57c2ce58066 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Mar 2009 14:39:38 +1100
Subject: md/raid5: change raid5_compute_sector and stripe_to_pdidx to take a
 'previous' argument

This is similar to the recent change to get_active_stripe.
There is no functional change, just some rearrangement to make
future patches cleaner.
Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 78 ++++++++++++++++++++++++------------------------------
 1 file changed, 34 insertions(+), 44 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c38310be0d9..c33073fe742 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -274,7 +274,7 @@ static int grow_buffers(struct stripe_head *sh, int num)
 }

 static void raid5_build_block(struct stripe_head *sh, int i);
-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks);
+static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous);

 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
@@ -293,7 +293,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)

 	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 	sh->sector = sector;
-	sh->pd_idx = stripe_to_pdidx(sector, conf, sh->disks);
+	sh->pd_idx = stripe_to_pdidx(sector, conf, previous);
 	sh->state = 0;

@@ -1233,15 +1233,18 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
  *	Input: a 'big' sector number,
  *	Output: index of the data and parity disk, and the sector # in them.
  */
-static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
-			unsigned int data_disks, unsigned int * dd_idx,
-			unsigned int * pd_idx, raid5_conf_t *conf)
+static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
+				     int previous,
+				     int *dd_idx, int *pd_idx)
 {
 	long stripe;
 	unsigned long chunk_number;
 	unsigned int chunk_offset;
 	sector_t new_sector;
 	int sectors_per_chunk = conf->chunk_size >> 9;
+	int raid_disks = previous ? conf->previous_raid_disks
+				  : conf->raid_disks;
+	int data_disks = raid_disks - conf->max_degraded;

 	/* First compute the information on this sector */
@@ -1406,7 +1409,9 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 	chunk_number = stripe * data_disks + i;
 	r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;

-	check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
+	check = raid5_compute_sector(conf, r_sector,
+				     (raid_disks != conf->raid_disks),
+				     &dummy1, &dummy2);
 	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
 		printk(KERN_ERR "compute_blocknr: map not correct\n");
 		return 0;
@@ -1806,16 +1811,18 @@ static int page_is_zero(struct page *p)
 		memcmp(a, a+4, STRIPE_SIZE-4)==0);
 }

-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
+static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous)
 {
 	int sectors_per_chunk = conf->chunk_size >> 9;
 	int pd_idx, dd_idx;
 	int chunk_offset = sector_div(stripe, sectors_per_chunk);
+	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
-	raid5_compute_sector(stripe * (disks - conf->max_degraded)
+	raid5_compute_sector(conf,
+			     stripe * (disks - conf->max_degraded)
 			     *sectors_per_chunk + chunk_offset,
-			     disks, disks - conf->max_degraded,
-			     &dd_idx, &pd_idx, conf);
+			     previous,
+			     &dd_idx, &pd_idx);
 	return pd_idx;
 }
@@ -2478,10 +2485,8 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 			struct stripe_head *sh2;

 			sector_t bn = compute_blocknr(sh, i);
-			sector_t s = raid5_compute_sector(bn, conf->raid_disks,
-						conf->raid_disks -
-						conf->max_degraded, &dd_idx,
-						&pd_idx, conf);
+			sector_t s = raid5_compute_sector(conf, bn, 0,
+							  &dd_idx, &pd_idx);
 			sh2 = get_active_stripe(conf, s, 0, 1);
 			if (sh2 == NULL)
 				/* so far only the early blocks of this stripe
@@ -2768,8 +2773,7 @@ static bool handle_stripe5(struct stripe_head *sh)
 	    !sh->reconstruct_state) {
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
-		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
-			conf->raid_disks);
+		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0);
 		schedule_reconstruction5(sh, &s, 1, 1);
 	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -2987,8 +2991,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
 		/* Need to write out all blocks after computing P&Q */
 		sh->disks = conf->raid_disks;
-		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
-					     conf->raid_disks);
+		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0);
 		compute_parity6(sh, RECONSTRUCT_WRITE);
 		for (i = conf->raid_disks ; i-- ; ) {
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -3260,8 +3263,6 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	const unsigned int raid_disks = conf->raid_disks;
-	const unsigned int data_disks = raid_disks - conf->max_degraded;
 	unsigned int dd_idx, pd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;
@@ -3285,12 +3286,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 	/*
 	 *	compute position
 	 */
-	align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector,
-					raid_disks,
-					data_disks,
-					&dd_idx,
-					&pd_idx,
-					conf);
+	align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
+						   0,
+						   &dd_idx, &pd_idx);

 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
@@ -3447,8 +3445,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
 		}
 		data_disks = disks - conf->max_degraded;

-		new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
-						  &dd_idx, &pd_idx, conf);
+		new_sector = raid5_compute_sector(conf, logical_sector,
+						  previous,
+						  &dd_idx, &pd_idx);
 		pr_debug("raid5: make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector,
 			(unsigned long long)logical_sector);
@@ -3625,14 +3624,12 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 * block on the destination stripes.
 	 */
 	first_sector =
-		raid5_compute_sector(sector_nr*(new_data_disks),
-				     raid_disks, data_disks,
-				     &dd_idx, &pd_idx, conf);
+		raid5_compute_sector(conf, sector_nr*(new_data_disks),
+				     1, &dd_idx, &pd_idx);
 	last_sector =
-		raid5_compute_sector((sector_nr+conf->chunk_size/512)
-				     *(new_data_disks) -1,
-				     raid_disks, data_disks,
-				     &dd_idx, &pd_idx, conf);
+		raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512)
+					    *(new_data_disks) - 1),
+				     1, &dd_idx, &pd_idx);
 	if (last_sector >= mddev->dev_sectors)
 		last_sector = mddev->dev_sectors - 1;
 	while (first_sector <= last_sector) {
@@ -3669,8 +3666,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int pd_idx;
-	int raid_disks = conf->raid_disks;
 	sector_t max_sector = mddev->dev_sectors;
 	int sync_blocks;
 	int still_degraded = 0;
@@ -3725,7 +3720,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski

 	bitmap_cond_end_sync(mddev->bitmap, sector_nr);

-	pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
 	sh = get_active_stripe(conf, sector_nr, 0, 1);
 	if (sh == NULL) {
 		sh = get_active_stripe(conf, sector_nr, 0, 0);
@@ -3777,12 +3771,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	int handled = 0;

 	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-	sector = raid5_compute_sector(	logical_sector,
-					conf->raid_disks,
-					conf->raid_disks - conf->max_degraded,
-					&dd_idx,
-					&pd_idx,
-					conf);
+	sector = raid5_compute_sector(conf, logical_sector,
+				      0, &dd_idx, &pd_idx);
 	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);

 	for (; logical_sector < last_sector;

From d0dabf7e577411c2bf6b616c751544dc241213d4 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Mar 2009 14:39:38 +1100
Subject: md/raid6: remove expectation that Q device is immediately after P
 device.

Code currently assumes that the devices in a raid6 stripe are
  0 1 ... N-1 P Q
in some rotated order.  We will shortly add new layouts in which
this strict pattern is broken.
So remove this expectation.  We still assume that the data disks
are roughly in-order.  However P and Q can be inserted anywhere
within that order.

Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 211 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 124 insertions(+), 87 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c33073fe742..cb3e157b52d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -133,12 +133,36 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
 	bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
 }

+/* Find first data disk in a raid6 stripe */
+static inline int raid6_d0(struct stripe_head *sh)
+{
+	if (sh->qd_idx == sh->disks - 1)
+		return 0;
+	else
+		return sh->qd_idx + 1;
+}
 static inline int raid6_next_disk(int disk, int raid_disks)
 {
 	disk++;
 	return (disk < raid_disks) ? disk : 0;
 }

+/* When walking through the disks in a raid5, starting at raid6_d0,
+ * We need to map each disk to a 'slot', where the data disks are slot
+ * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
+ * is raid_disks-1.  This helper does that mapping.
+ */
+static int raid6_idx_to_slot(int idx, struct stripe_head *sh, int *count)
+{
+	int slot;
+	if (idx == sh->pd_idx)
+		return sh->disks - 2;
+	if (idx == sh->qd_idx)
+		return sh->disks - 1;
+	slot = (*count)++;
+	return slot;
+}
+
 static void return_io(struct bio *return_bi)
 {
 	struct bio *bi = return_bi;
@@ -196,6 +220,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 		}
 	}
 }
+
 static void release_stripe(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
@@ -274,12 +299,14 @@ static int grow_buffers(struct stripe_head *sh, int num)
 }

 static void raid5_build_block(struct stripe_head *sh, int i);
-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous);
+static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous,
+			   int *qd_idx);

 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i;
+	int qd_idx;

 	BUG_ON(atomic_read(&sh->count) != 0);
 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -293,7 +320,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)

 	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 	sh->sector = sector;
-	sh->pd_idx = stripe_to_pdidx(sector, conf, previous);
+	sh->pd_idx = stripe_to_pdidx(sector, conf, previous, &qd_idx);
+	sh->qd_idx = qd_idx;
 	sh->state = 0;

@@ -1235,7 +1263,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
  */
 static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 				     int previous,
-				     int *dd_idx, int *pd_idx)
+				     int *dd_idx, int *pd_idx, int *qd_idx)
 {
 	long stripe;
 	unsigned long chunk_number;
@@ -1268,6 +1296,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 	/*
 	 * Select the parity disk based on the user selected algorithm.
 	 */
+	*qd_idx = ~0;
 	switch(conf->level) {
 	case 4:
 		*pd_idx = data_disks;
@@ -1303,24 +1332,30 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 		switch (conf->algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
 			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-			if (*pd_idx == raid_disks-1)
+			*qd_idx = *pd_idx + 1;
+			if (*pd_idx == raid_disks-1) {
 				(*dd_idx)++; 	/* Q D D D P */
-			else if (*dd_idx >= *pd_idx)
+				*qd_idx = 0;
+			} else if (*dd_idx >= *pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
 			break;
 		case ALGORITHM_RIGHT_ASYMMETRIC:
 			*pd_idx = stripe % raid_disks;
-			if (*pd_idx == raid_disks-1)
+			*qd_idx = *pd_idx + 1;
+			if (*pd_idx == raid_disks-1) {
 				(*dd_idx)++; 	/* Q D D D P */
-			else if (*dd_idx >= *pd_idx)
+				*qd_idx = 0;
+			} else if (*dd_idx >= *pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
 			break;
 		case ALGORITHM_LEFT_SYMMETRIC:
 			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			*qd_idx = (*pd_idx + 1) % raid_disks;
 			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
 			break;
 		case ALGORITHM_RIGHT_SYMMETRIC:
 			*pd_idx = stripe % raid_disks;
+			*qd_idx = (*pd_idx + 1) % raid_disks;
 			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
 			break;
 		default:
@@ -1347,7 +1382,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 	int sectors_per_chunk = conf->chunk_size >> 9;
 	sector_t stripe;
 	int chunk_offset;
-	int chunk_number, dummy1, dummy2, dd_idx = i;
+	int chunk_number, dummy1, dummy2, dummy3, dd_idx = i;
 	sector_t r_sector;

@@ -1378,7 +1413,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 		}
 		break;
 	case 6:
-		if (i == raid6_next_disk(sh->pd_idx, raid_disks))
+		if (i == sh->qd_idx)
 			return 0; /* It is the Q disk */
 		switch (conf->algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
@@ -1411,7 +1446,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 	check = raid5_compute_sector(conf, r_sector,
 				     (raid_disks != conf->raid_disks),
-				     &dummy1, &dummy2);
+				     &dummy1, &dummy2, &dummy3);
 	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
 		printk(KERN_ERR "compute_blocknr: map not correct\n");
 		return 0;
@@ -1480,13 +1515,14 @@ static void copy_data(int frombio, struct bio *bio,
 static void compute_parity6(struct stripe_head *sh, int method)
 {
 	raid5_conf_t *conf = sh->raid_conf;
-	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
+	int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
 	struct bio *chosen;
 	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
 	void *ptrs[disks];

-	qd_idx = raid6_next_disk(pd_idx, disks);
-	d0_idx = raid6_next_disk(qd_idx, disks);
+	pd_idx = sh->pd_idx;
+	qd_idx = sh->qd_idx;
+	d0_idx = raid6_d0(sh);

 	pr_debug("compute_parity, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
@@ -1524,22 +1560,22 @@ static void compute_parity6(struct stripe_head *sh, int method)
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 	}

-//	switch(method) {
-//	case RECONSTRUCT_WRITE:
-//	case CHECK_PARITY:
-//	case UPDATE_PARITY:
-	/* Note that unlike RAID-5, the ordering of the disks matters greatly. */
-	/* FIX: Is this ordering of drives even remotely optimal? */
-	count = 0;
-	i = d0_idx;
-	do {
-		ptrs[count++] = page_address(sh->dev[i].page);
-		if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-			printk("block %d/%d not uptodate on parity calc\n", i,count);
-		i = raid6_next_disk(i, disks);
-	} while ( i != d0_idx );
-//	break;
-//	}
+	/* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
+	/* FIX: Is this ordering of drives even remotely optimal? */
+	count = 0;
+	i = d0_idx;
+	do {
+		int slot = raid6_idx_to_slot(i, sh, &count);
+		ptrs[slot] = page_address(sh->dev[i].page);
+		if (slot < sh->disks - 2 &&
+		    !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
+			printk(KERN_ERR "block %d/%d not uptodate "
+			       "on parity calc\n", i, count);
+			BUG();
+		}
+		i = raid6_next_disk(i, disks);
+	} while (i != d0_idx);
+	BUG_ON(count+2 != disks);

 	raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);

@@ -1563,8 +1599,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 {
 	int i, count, disks = sh->disks;
 	void *ptr[MAX_XOR_BLOCKS], *dest, *p;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
+	int qd_idx = sh->qd_idx;

 	pr_debug("compute_block_1, stripe %llu, idx %d\n",
 		(unsigned long long)sh->sector, dd_idx);
@@ -1600,21 +1635,31 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 {
 	int i, count, disks = sh->disks;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
-	int d0_idx = raid6_next_disk(qd_idx, disks);
-	int faila, failb;
+	int d0_idx = raid6_d0(sh);
+	int faila = -1, failb = -1;
+	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
+	void *ptrs[disks];

-	/* faila and failb are disk numbers relative to d0_idx */
-	/* pd_idx become disks-2 and qd_idx become disks-1 */
-	faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
-	failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
+	count = 0;
+	i = d0_idx;
+	do {
+		int slot;
+		slot = raid6_idx_to_slot(i, sh, &count);
+		ptrs[slot] = page_address(sh->dev[i].page);
+		if (i == dd_idx1)
+			faila = slot;
+		if (i == dd_idx2)
+			failb = slot;
+		i = raid6_next_disk(i, disks);
+	} while (i != d0_idx);
+	BUG_ON(count+2 != disks);

 	BUG_ON(faila == failb);
 	if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }

 	pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
-		(unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
+		 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
+		 faila, failb);

 	if ( failb == disks-1 ) {
 		/* Q disk is one of the missing disks */
@@ -1624,39 +1669,26 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 			return;
 		} else {
 			/* We're missing D+Q; recompute D from P */
-			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
+			compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
+					     dd_idx2 : dd_idx1),
+					0);
 			compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
 			return;
 		}
 	}

-	/* We're missing D+P or D+D; build pointer table */
-	{
-		/**** FIX THIS: This could be very bad if disks is close to 256 ****/
-		void *ptrs[disks];
-
-		count = 0;
-		i = d0_idx;
-		do {
-			ptrs[count++] = page_address(sh->dev[i].page);
-			i = raid6_next_disk(i, disks);
-			if (i != dd_idx1 && i != dd_idx2 &&
-			    !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-				printk("compute_2 with missing block %d/%d\n", count, i);
-		} while ( i != d0_idx );
-
-		if ( failb == disks-2 ) {
-			/* We're missing D+P. */
-			raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
-		} else {
-			/* We're missing D+D. */
-			raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
-		}
-
-		/* Both the above update both missing blocks */
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
+	/* We're missing D+P or D+D; */
+	if (failb == disks-2) {
+		/* We're missing D+P. */
+		raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
+	} else {
+		/* We're missing D+D. */
+		raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
 	}
+
+	/* Both the above update both missing blocks */
+	set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
+	set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
 }

 static void
@@ -1811,7 +1843,8 @@ static int page_is_zero(struct page *p)
 		memcmp(a, a+4, STRIPE_SIZE-4)==0);
 }

-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous)
+static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous,
+			   int *qd_idxp)
 {
 	int sectors_per_chunk = conf->chunk_size >> 9;
 	int pd_idx, dd_idx;
@@ -1822,7 +1855,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous)
 			     stripe * (disks - conf->max_degraded)
 			     *sectors_per_chunk + chunk_offset,
 			     previous,
-			     &dd_idx, &pd_idx);
+			     &dd_idx, &pd_idx, qd_idxp);
 	return pd_idx;
 }

@@ -2481,12 +2514,13 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 		clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 		for (i = 0; i < sh->disks; i++)
 			if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
-				int dd_idx, pd_idx, j;
+				int dd_idx, pd_idx, qd_idx, j;
 				struct stripe_head *sh2;

 				sector_t bn = compute_blocknr(sh, i);
-				sector_t s = raid5_compute_sector(conf, bn, 0,
-								  &dd_idx, &pd_idx);
+				sector_t s =
+					raid5_compute_sector(conf, bn, 0,
+							     &dd_idx, &pd_idx, &qd_idx);
 				sh2 = get_active_stripe(conf, s, 0, 1);
 				if (sh2 == NULL)
 					/* so far only the early blocks of this stripe
@@ -2510,8 +2544,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 				set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
 				for (j = 0; j < conf->raid_disks; j++)
 					if (j != sh2->pd_idx &&
-					    (!r6s || j != raid6_next_disk(sh2->pd_idx,
-								 sh2->disks)) &&
+					    (!r6s || j != sh2->qd_idx) &&
 					    !test_bit(R5_Expanded, &sh2->dev[j].flags))
 						break;
 				if (j == conf->raid_disks) {
@@ -2771,9 +2804,11 @@ static bool handle_stripe5(struct stripe_head *sh)

 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
 	    !sh->reconstruct_state) {
+		int qd_idx;
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
-		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0);
+		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0, &qd_idx);
+		sh->qd_idx = qd_idx;
 		schedule_reconstruction5(sh, &s, 1, 1);
 	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -2814,7 +2849,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	struct r5dev *dev, *pdev, *qdev;
 	mdk_rdev_t *blocked_rdev = NULL;

-	r6s.qd_idx = raid6_next_disk(pd_idx, disks);
+	r6s.qd_idx = sh->qd_idx;
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
 		"pd_idx=%d, qd_idx=%d\n",
 	       (unsigned long long)sh->sector, sh->state,
@@ -2990,8 +3025,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)

 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
 		/* Need to write out all blocks after computing P&Q */
+		int qd_idx;
 		sh->disks = conf->raid_disks;
-		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0);
+		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0, &qd_idx);
+		sh->qd_idx = qd_idx;
 		compute_parity6(sh, RECONSTRUCT_WRITE);
 		for (i = conf->raid_disks ; i-- ; ) {
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -3263,7 +3300,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	unsigned int dd_idx, pd_idx;
+	unsigned int dd_idx, pd_idx, qd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;

@@ -3288,7 +3325,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 	 */
 	align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
 						   0,
-						   &dd_idx, &pd_idx);
+						   &dd_idx, &pd_idx, &qd_idx);

 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
@@ -3380,7 +3417,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	unsigned int dd_idx, pd_idx;
+	int dd_idx, pd_idx, qd_idx;
 	sector_t new_sector;
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
@@ -3447,7 +3484,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 		new_sector = raid5_compute_sector(conf, logical_sector,
 						  previous,
-						  &dd_idx, &pd_idx);
+						  &dd_idx, &pd_idx, &qd_idx);
 		pr_debug("raid5: make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector,
 			(unsigned long long)logical_sector);
@@ -3535,7 +3572,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 */
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int pd_idx;
+	int pd_idx, qd_idx;
 	sector_t first_sector, last_sector;
 	int raid_disks = conf->previous_raid_disks;
 	int data_disks = raid_disks - conf->max_degraded;
@@ -3598,7 +3635,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 			if (j == sh->pd_idx)
 				continue;
 			if (conf->level == 6 &&
-			    j == raid6_next_disk(sh->pd_idx, sh->disks))
+			    j == sh->qd_idx)
 				continue;
 			s = compute_blocknr(sh, j);
 			if (s < mddev->array_sectors) {
@@ -3625,11 +3662,11 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 */
 	first_sector =
 		raid5_compute_sector(conf, sector_nr*(new_data_disks),
-				     1, &dd_idx, &pd_idx);
+				     1, &dd_idx, &pd_idx, &qd_idx);
 	last_sector =
 		raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512)
 					    *(new_data_disks) - 1),
-				     1, &dd_idx, &pd_idx);
+				     1, &dd_idx, &pd_idx, &qd_idx);
 	if (last_sector >= mddev->dev_sectors)
 		last_sector = mddev->dev_sectors - 1;
 	while (first_sector <= last_sector) {
@@ -3764,7 +3801,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
 	 */
 	struct stripe_head *sh;
-	int dd_idx, pd_idx;
+	int dd_idx, pd_idx, qd_idx;
 	sector_t sector, logical_sector, last_sector;
 	int scnt = 0;
 	int remaining;
@@ -3772,7 +3809,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)

 	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	sector = raid5_compute_sector(conf, logical_sector,
-				      0, &dd_idx, &pd_idx);
+				      0, &dd_idx, &pd_idx, &qd_idx);
 	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);

 	for (; logical_sector < last_sector;
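To make the d0/slot mapping in the patch above concrete, here is a small
user-space sketch. The disk count and P/Q positions are invented for
illustration; the three helpers mirror raid6_d0(), raid6_next_disk() and
raid6_idx_to_slot() from the patch, walking a 6-device stripe and printing
the slot each device maps to:

	#include <stdio.h>

	struct stripe { int disks, pd_idx, qd_idx; };

	static int raid6_d0(const struct stripe *sh)
	{
		/* md starts walking just after the Q block */
		return (sh->qd_idx == sh->disks - 1) ? 0 : sh->qd_idx + 1;
	}

	static int raid6_next_disk(int disk, int raid_disks)
	{
		disk++;
		return (disk < raid_disks) ? disk : 0;
	}

	static int raid6_idx_to_slot(int idx, const struct stripe *sh, int *count)
	{
		if (idx == sh->pd_idx)
			return sh->disks - 2;	/* P always maps to slot disks-2 */
		if (idx == sh->qd_idx)
			return sh->disks - 1;	/* Q always maps to slot disks-1 */
		return (*count)++;		/* data fills 0..disks-3 in walk order */
	}

	int main(void)
	{
		struct stripe sh = { .disks = 6, .pd_idx = 2, .qd_idx = 3 };
		int count = 0, i = raid6_d0(&sh);

		do {	/* with P=2, Q=3: 4->0, 5->1, 0->2, 1->3, 2->4(P), 3->5(Q) */
			printf("dev %d -> slot %d\n",
			       i, raid6_idx_to_slot(i, &sh, &count));
			i = raid6_next_disk(i, sh.disks);
		} while (i != raid6_d0(&sh));
		return 0;
	}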
From 911d4ee8536d89ea8a6cd3e96b1c95a3ebc5ea66 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Mar 2009 14:39:38 +1100
Subject: md/raid5: simplify raid5_compute_sector interface

Rather than passing 'pd_idx' and 'qd_idx' to be filled in, pass a
'struct stripe_head *' and fill in the relevant fields.  This is
more extensible.

Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 118 ++++++++++++++++++++++++++---------------------------
 1 file changed, 58 insertions(+), 60 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cb3e157b52d..2e2e64f6ef7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -299,14 +299,13 @@ static int grow_buffers(struct stripe_head *sh, int num)
 }

 static void raid5_build_block(struct stripe_head *sh, int i);
-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous,
-			   int *qd_idx);
+static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+			   struct stripe_head *sh);

 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i;
-	int qd_idx;

 	BUG_ON(atomic_read(&sh->count) != 0);
 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -320,8 +319,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)

 	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 	sh->sector = sector;
-	sh->pd_idx = stripe_to_pdidx(sector, conf, previous, &qd_idx);
-	sh->qd_idx = qd_idx;
+	stripe_set_idx(sector, conf, previous, sh);
 	sh->state = 0;

@@ -1262,12 +1260,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
  * Output: index of the data and parity disk, and the sector # in them.
  */
 static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
-				     int previous,
-				     int *dd_idx, int *pd_idx, int *qd_idx)
+				     int previous, int *dd_idx,
+				     struct stripe_head *sh)
 {
 	long stripe;
 	unsigned long chunk_number;
 	unsigned int chunk_offset;
+	int pd_idx, qd_idx;
 	sector_t new_sector;
 	int sectors_per_chunk = conf->chunk_size >> 9;
 	int raid_disks = previous ? conf->previous_raid_disks
@@ -1296,30 +1295,30 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 	/*
 	 * Select the parity disk based on the user selected algorithm.
 	 */
-	*qd_idx = ~0;
+	pd_idx = qd_idx = ~0;
 	switch(conf->level) {
 	case 4:
-		*pd_idx = data_disks;
+		pd_idx = data_disks;
 		break;
 	case 5:
 		switch (conf->algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
-			*pd_idx = data_disks - stripe % raid_disks;
-			if (*dd_idx >= *pd_idx)
+			pd_idx = data_disks - stripe % raid_disks;
+			if (*dd_idx >= pd_idx)
 				(*dd_idx)++;
 			break;
 		case ALGORITHM_RIGHT_ASYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			if (*dd_idx >= *pd_idx)
+			pd_idx = stripe % raid_disks;
+			if (*dd_idx >= pd_idx)
 				(*dd_idx)++;
 			break;
 		case ALGORITHM_LEFT_SYMMETRIC:
-			*pd_idx = data_disks - stripe % raid_disks;
-			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+			pd_idx = data_disks - stripe % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
 			break;
 		case ALGORITHM_RIGHT_SYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+			pd_idx = stripe % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
 			break;
 		default:
 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
@@ -1331,32 +1330,32 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 		/**** FIX THIS ****/
 		switch (conf->algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
-			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-			*qd_idx = *pd_idx + 1;
-			if (*pd_idx == raid_disks-1) {
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
 				(*dd_idx)++; 	/* Q D D D P */
-				*qd_idx = 0;
-			} else if (*dd_idx >= *pd_idx)
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
 			break;
 		case ALGORITHM_RIGHT_ASYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			*qd_idx = *pd_idx + 1;
-			if (*pd_idx == raid_disks-1) {
+			pd_idx = stripe % raid_disks;
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
 				(*dd_idx)++; 	/* Q D D D P */
-				*qd_idx = 0;
-			} else if (*dd_idx >= *pd_idx)
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
 			break;
 		case ALGORITHM_LEFT_SYMMETRIC:
-			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-			*qd_idx = (*pd_idx + 1) % raid_disks;
-			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = (pd_idx + 1) % raid_disks;
+			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
 			break;
 		case ALGORITHM_RIGHT_SYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			*qd_idx = (*pd_idx + 1) % raid_disks;
-			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+			pd_idx = stripe % raid_disks;
+			qd_idx = (pd_idx + 1) % raid_disks;
+			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
 			break;
 		default:
 			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
@@ -1365,6 +1364,10 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 		break;
 	}

+	if (sh) {
+		sh->pd_idx = pd_idx;
+		sh->qd_idx = qd_idx;
+	}
 	/*
 	 * Finally, compute the new sector number
 	 */
@@ -1382,8 +1385,9 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 	int sectors_per_chunk = conf->chunk_size >> 9;
 	sector_t stripe;
 	int chunk_offset;
-	int chunk_number, dummy1, dummy2, dummy3, dd_idx = i;
+	int chunk_number, dummy1, dd_idx = i;
 	sector_t r_sector;
+	struct stripe_head sh2;

 	chunk_offset = sector_div(new_sector, sectors_per_chunk);
@@ -1446,8 +1450,9 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 	check = raid5_compute_sector(conf, r_sector,
 				     (raid_disks != conf->raid_disks),
-				     &dummy1, &dummy2, &dummy3);
-	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
+				     &dummy1, &sh2);
+	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
+		|| sh2.qd_idx != sh->qd_idx) {
 		printk(KERN_ERR "compute_blocknr: map not correct\n");
 		return 0;
 	}
@@ -1843,11 +1848,11 @@ static int page_is_zero(struct page *p)
 		memcmp(a, a+4, STRIPE_SIZE-4)==0);
 }

-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous,
-			   int *qd_idxp)
+static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+			   struct stripe_head *sh)
 {
 	int sectors_per_chunk = conf->chunk_size >> 9;
-	int pd_idx, dd_idx;
+	int dd_idx;
 	int chunk_offset = sector_div(stripe, sectors_per_chunk);
 	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
@@ -1855,8 +1860,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous,
 			     stripe * (disks - conf->max_degraded)
 			     *sectors_per_chunk + chunk_offset,
 			     previous,
-			     &dd_idx, &pd_idx, qd_idxp);
-	return pd_idx;
+			     &dd_idx, sh);
 }

 static void
@@ -2514,13 +2518,12 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 		clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 		for (i = 0; i < sh->disks; i++)
 			if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
-				int dd_idx, pd_idx, qd_idx, j;
+				int dd_idx, j;
 				struct stripe_head *sh2;

 				sector_t bn = compute_blocknr(sh, i);
-				sector_t s =
-					raid5_compute_sector(conf, bn, 0,
-							     &dd_idx, &pd_idx, &qd_idx);
+				sector_t s = raid5_compute_sector(conf, bn, 0,
+								  &dd_idx, NULL);
 				sh2 = get_active_stripe(conf, s, 0, 1);
 				if (sh2 == NULL)
 					/* so far only the early blocks of this stripe
@@ -2804,11 +2807,9 @@ static bool handle_stripe5(struct stripe_head *sh)

 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
 	    !sh->reconstruct_state) {
-		int qd_idx;
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
-		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0, &qd_idx);
-		sh->qd_idx = qd_idx;
+		stripe_set_idx(sh->sector, conf, 0, sh);
 		schedule_reconstruction5(sh, &s, 1, 1);
 	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -3025,10 +3026,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)

 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
 		/* Need to write out all blocks after computing P&Q */
-		int qd_idx;
 		sh->disks = conf->raid_disks;
-		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0, &qd_idx);
-		sh->qd_idx = qd_idx;
+		stripe_set_idx(sh->sector, conf, 0, sh);
 		compute_parity6(sh, RECONSTRUCT_WRITE);
 		for (i = conf->raid_disks ; i-- ; ) {
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -3299,7 +3299,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	unsigned int dd_idx, pd_idx, qd_idx;
+	unsigned int dd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;

@@ -3324,7 +3324,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 	 */
 	align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
 						   0,
-						   &dd_idx, &pd_idx, &qd_idx);
+						   &dd_idx, NULL);

 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
@@ -3416,7 +3416,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	int dd_idx, pd_idx, qd_idx;
+	int dd_idx;
 	sector_t new_sector;
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
@@ -3483,7 +3483,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 		new_sector = raid5_compute_sector(conf, logical_sector,
 						  previous,
-						  &dd_idx, &pd_idx, &qd_idx);
+						  &dd_idx, NULL);
 		pr_debug("raid5: make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector,
 			(unsigned long long)logical_sector);
@@ -3571,7 +3571,6 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 */
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int pd_idx, qd_idx;
 	sector_t first_sector, last_sector;
 	int raid_disks = conf->previous_raid_disks;
 	int data_disks = raid_disks - conf->max_degraded;
@@ -3660,11 +3660,11 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 */
 	first_sector =
 		raid5_compute_sector(conf, sector_nr*(new_data_disks),
-				     1, &dd_idx, &pd_idx, &qd_idx);
+				     1, &dd_idx, NULL);
 	last_sector =
 		raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512)
 					    *(new_data_disks) - 1),
-				     1, &dd_idx, &pd_idx, &qd_idx);
+				     1, &dd_idx, NULL);
 	if (last_sector >= mddev->dev_sectors)
 		last_sector = mddev->dev_sectors - 1;
 	while (first_sector <= last_sector) {
@@ -3799,7 +3799,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
 	 */
 	struct stripe_head *sh;
-	int dd_idx, pd_idx, qd_idx;
+	int dd_idx;
 	sector_t sector, logical_sector, last_sector;
 	int scnt = 0;
 	int remaining;
@@ -3807,7 +3807,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)

 	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	sector = raid5_compute_sector(conf, logical_sector,
-				      0, &dd_idx, &pd_idx, &qd_idx);
+				      0, &dd_idx, NULL);
 	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);

 	for (; logical_sector < last_sector;
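A nice consequence of this interface, visible in the compute_blocknr() hunk
above: the reverse mapping can now be verified by handing a scratch
stripe_head to raid5_compute_sector() and comparing it with the real one.
A standalone sketch of that round-trip check, with a deliberately toy
mapping and made-up indices (not the kernel's arithmetic):

	#include <assert.h>

	struct stripe_head_s { long sector; int pd_idx, qd_idx; };

	/* Toy forward mapping over 4 data disks; parity indices faked. */
	static long compute_sector(long r_sector, int *dd_idx,
				   struct stripe_head_s *sh)
	{
		*dd_idx = (int)(r_sector % 4);
		if (sh) {		/* optional out-param, as in the patch */
			sh->pd_idx = 4;
			sh->qd_idx = 5;
		}
		return r_sector / 4;
	}

	int main(void)
	{
		struct stripe_head_s sh = { .sector = 25, .pd_idx = 4, .qd_idx = 5 };
		int dd_idx = 1;				/* claimed data disk */
		long r_sector = sh.sector * 4 + dd_idx;	/* toy reverse mapping */

		/* verify by mapping forward again into a scratch stripe_head */
		struct stripe_head_s sh2;
		int check_dd;
		long check = compute_sector(r_sector, &check_dd, &sh2);
		assert(check == sh.sector && check_dd == dd_idx &&
		       sh2.pd_idx == sh.pd_idx && sh2.qd_idx == sh.qd_idx);
		return 0;
	}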
From 99c0fb5f92828ae96909d390f2df137b89093b37 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Mar 2009 14:39:38 +1100
Subject: md/raid5: Add support for new layouts for raid5 and raid6.

DDF uses different layouts for P and Q blocks than current md/raid6
so add those that are missing.
Also add support for RAID6 layouts that are identical to various
raid5 layouts with the simple addition of one device to hold all of
the 'Q' blocks.
Finally add 'raid5' layouts to match raid4.
These last two will allow online level conversion.

Note that this does not provide correct support for DDF/raid6 yet
as the order in which data blocks are summed to produce the Q block
is significant and different between current md code and DDF
requirements.

Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 136 insertions(+), 15 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2e2e64f6ef7..c1d94ed9718 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1098,7 +1098,7 @@ static void shrink_stripes(raid5_conf_t *conf)

 static void raid5_end_read_request(struct bio * bi, int error)
 {
-	struct stripe_head *sh = bi->bi_private;
+	struct stripe_head *sh = bi->bi_private;
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1180,7 +1180,7 @@ static void raid5_end_read_request(struct bio * bi, int error)

 static void raid5_end_write_request(struct bio *bi, int error)
 {
-	struct stripe_head *sh = bi->bi_private;
+	struct stripe_head *sh = bi->bi_private;
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1320,20 +1320,27 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 			pd_idx = stripe % raid_disks;
 			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
 			break;
+		case ALGORITHM_PARITY_0:
+			pd_idx = 0;
+			(*dd_idx)++;
+			break;
+		case ALGORITHM_PARITY_N:
+			pd_idx = data_disks;
+			break;
 		default:
 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
 				conf->algorithm);
+			BUG();
 		}
 		break;
 	case 6:
-		/**** FIX THIS ****/
 		switch (conf->algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
 			pd_idx = raid_disks - 1 - (stripe % raid_disks);
 			qd_idx = pd_idx + 1;
 			if (pd_idx == raid_disks-1) {
-				(*dd_idx)++; 	/* Q D D D P */
+				(*dd_idx)++;	/* Q D D D P */
 				qd_idx = 0;
 			} else if (*dd_idx >= pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
@@ -1342,7 +1349,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 			pd_idx = stripe % raid_disks;
 			qd_idx = pd_idx + 1;
 			if (pd_idx == raid_disks-1) {
-				(*dd_idx)++; 	/* Q D D D P */
+				(*dd_idx)++;	/* Q D D D P */
 				qd_idx = 0;
 			} else if (*dd_idx >= pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
@@ -1357,9 +1364,89 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 			qd_idx = (pd_idx + 1) % raid_disks;
 			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
 			break;
+
+		case ALGORITHM_PARITY_0:
+			pd_idx = 0;
+			qd_idx = 1;
+			(*dd_idx) += 2;
+			break;
+		case ALGORITHM_PARITY_N:
+			pd_idx = data_disks;
+			qd_idx = data_disks + 1;
+			break;
+
+		case ALGORITHM_ROTATING_ZERO_RESTART:
+			/* Exactly the same as RIGHT_ASYMMETRIC, but the order
+			 * of blocks for computing Q is different.
+			 */
+			pd_idx = stripe % raid_disks;
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
+				(*dd_idx) += 2; /* D D P Q D */
+			break;
+
+		case ALGORITHM_ROTATING_N_RESTART:
+			/* Same as left_asymmetric, but the first stripe is
+			 * D D D P Q  rather than
+			 * Q D D D P
+			 */
+			pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
+				(*dd_idx) += 2; /* D D P Q D */
+			break;
+
+		case ALGORITHM_ROTATING_N_CONTINUE:
+			/* Same as left_symmetric but Q is before P */
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+			break;
+
+		case ALGORITHM_LEFT_ASYMMETRIC_6:
+			/* RAID5 left_asymmetric, with Q on last device */
+			pd_idx = data_disks - stripe % (raid_disks-1);
+			if (*dd_idx >= pd_idx)
+				(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_RIGHT_ASYMMETRIC_6:
+			pd_idx = stripe % (raid_disks-1);
+			if (*dd_idx >= pd_idx)
+				(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_LEFT_SYMMETRIC_6:
+			pd_idx = data_disks - stripe % (raid_disks-1);
+			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_RIGHT_SYMMETRIC_6:
+			pd_idx = stripe % (raid_disks-1);
+			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+			qd_idx = raid_disks - 1;
+			break;
+
+		case ALGORITHM_PARITY_0_6:
+			pd_idx = 0;
+			(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+
+
 		default:
 			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
 			       conf->algorithm);
+			BUG();
 		}
 		break;
 	}
@@ -1411,9 +1498,15 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 				i += raid_disks;
 			i -= (sh->pd_idx + 1);
 			break;
+		case ALGORITHM_PARITY_0:
+			i -= 1;
+			break;
+		case ALGORITHM_PARITY_N:
+			break;
 		default:
 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
 			       conf->algorithm);
+			BUG();
 		}
 		break;
 	case 6:
@@ -1422,8 +1515,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 		switch (conf->algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
 		case ALGORITHM_RIGHT_ASYMMETRIC:
-			if (sh->pd_idx == raid_disks-1)
-				i--; 	/* Q D D D P */
+		case ALGORITHM_ROTATING_ZERO_RESTART:
+		case ALGORITHM_ROTATING_N_RESTART:
+			if (sh->pd_idx == raid_disks-1)
+				i--;	/* Q D D D P */
 			else if (i > sh->pd_idx)
 				i -= 2; /* D D P Q D */
 			break;
@@ -1438,9 +1533,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 				i -= (sh->pd_idx + 2);
 			}
 			break;
+		case ALGORITHM_PARITY_0:
+			i -= 2;
+			break;
+		case ALGORITHM_PARITY_N:
+			break;
+		case ALGORITHM_ROTATING_N_CONTINUE:
+			if (sh->pd_idx == 0)
+				i--;	/* P D D D Q */
+			else if (i > sh->pd_idx)
+				i -= 2; /* D D Q P D */
+			break;
+		case ALGORITHM_LEFT_ASYMMETRIC_6:
+		case ALGORITHM_RIGHT_ASYMMETRIC_6:
+			if (i > sh->pd_idx)
+				i--;
+			break;
+		case ALGORITHM_LEFT_SYMMETRIC_6:
+		case ALGORITHM_RIGHT_SYMMETRIC_6:
+			if (i < sh->pd_idx)
+				i += data_disks + 1;
+			i -= (sh->pd_idx + 1);
+			break;
+		case ALGORITHM_PARITY_0_6:
+			i -= 1;
+			break;
 		default:
 			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
 			       conf->algorithm);
+			BUG();
 		}
 		break;
 	}
@@ -3308,7 +3429,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 		return 0;
 	}
 	/*
-	 * use bio_clone to make a copy of the bio
+	 * use bio_clone to make a copy of the bio
 	 */
 	align_bi = bio_clone(raid_bio, GFP_NOIO);
 	if (!align_bi)
@@ -3439,7 +3560,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	if (rw == READ &&
 	     mddev->reshape_position == MaxSector &&
 	     chunk_aligned_read(q,bi))
-            	return 0;
+		return 0;

 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
@@ -4034,6 +4155,12 @@ static int run(mddev_t *mddev)
 		       mdname(mddev), mddev->level);
 		return -EIO;
 	}
+	if ((mddev->level == 5 && !algorithm_valid_raid5(mddev->layout)) ||
+	    (mddev->level == 6 && !algorithm_valid_raid6(mddev->layout))) {
+		printk(KERN_ERR "raid5: %s: layout %d not supported\n",
+		       mdname(mddev), mddev->layout);
+		return -EIO;
+	}

 	if (mddev->chunk_size < PAGE_SIZE) {
 		printk(KERN_ERR "md/raid5: chunk_size must be at least "
@@ -4185,12 +4312,6 @@ static int run(mddev_t *mddev)
 			conf->chunk_size, mdname(mddev));
 		goto abort;
 	}
-	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
-		printk(KERN_ERR
-			"raid5: unsupported parity algorithm %d for %s\n",
-			conf->algorithm, mdname(mddev));
-		goto abort;
-	}
 	if (mddev->degraded > conf->max_degraded) {
 		printk(KERN_ERR "raid5: not enough operational devices for %s"
 			" (%d/%d failed)\n",
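As a concrete illustration of the new layout arithmetic, a small user-space
sketch following two of the cases added above (ALGORITHM_PARITY_0 and
ALGORITHM_LEFT_SYMMETRIC_6); the 6-device array is an assumed example, and
the real code of course computes dd_idx as well:

	#include <stdio.h>

	int main(void)
	{
		int raid_disks = 6;
		int data_disks = raid_disks - 2;   /* max_degraded == 2 for raid6 */
		long stripe;

		for (stripe = 0; stripe < 5; stripe++) {
			/* ALGORITHM_PARITY_0: P and Q fixed on devices 0 and 1 */
			int p0_pd = 0, p0_qd = 1;

			/* ALGORITHM_LEFT_SYMMETRIC_6: raid5 left-symmetric over
			 * the first raid_disks-1 devices, Q parked on the last */
			int ls6_pd = data_disks - stripe % (raid_disks - 1);
			int ls6_qd = raid_disks - 1;

			printf("stripe %ld: PARITY_0 P=%d Q=%d  "
			       "LEFT_SYMMETRIC_6 P=%d Q=%d\n",
			       stripe, p0_pd, p0_qd, ls6_pd, ls6_qd);
		}
		return 0;
	}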
conf->previous_raid_disks @@ -1386,6 +1393,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, qd_idx = 0; } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; break; case ALGORITHM_ROTATING_N_RESTART: @@ -1400,6 +1408,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, qd_idx = 0; } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; break; case ALGORITHM_ROTATING_N_CONTINUE: @@ -1407,6 +1416,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, pd_idx = raid_disks - 1 - (stripe % raid_disks); qd_idx = (pd_idx + raid_disks - 1) % raid_disks; *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + ddf_layout = 1; break; case ALGORITHM_LEFT_ASYMMETRIC_6: @@ -1454,6 +1464,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, if (sh) { sh->pd_idx = pd_idx; sh->qd_idx = qd_idx; + sh->ddf_layout = ddf_layout; } /* * Finally, compute the new sector number @@ -1642,9 +1653,10 @@ static void compute_parity6(struct stripe_head *sh, int method) { raid5_conf_t *conf = sh->raid_conf; int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; + int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; + void *ptrs[syndrome_disks+2]; pd_idx = sh->pd_idx; qd_idx = sh->qd_idx; @@ -1687,23 +1699,28 @@ static void compute_parity6(struct stripe_head *sh, int method) } /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ - /* FIX: Is this ordering of drives even remotely optimal? */ + + for (i = 0; i < disks; i++) + ptrs[i] = (void *)raid6_empty_zero_page; + count = 0; i = d0_idx; do { - int slot = raid6_idx_to_slot(i, sh, &count); + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + ptrs[slot] = page_address(sh->dev[i].page); - if (slot < sh->disks - 2 && + if (slot < syndrome_disks && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { printk(KERN_ERR "block %d/%d not uptodate " "on parity calc\n", i, count); BUG(); } + i = raid6_next_disk(i, disks); } while (i != d0_idx); - BUG_ON(count+2 != disks); + BUG_ON(count != syndrome_disks); - raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); + raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); switch(method) { case RECONSTRUCT_WRITE: @@ -1761,24 +1778,28 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) { int i, count, disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; int d0_idx = raid6_d0(sh); int faila = -1, failb = -1; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; + void *ptrs[syndrome_disks+2]; + for (i = 0; i < disks ; i++) + ptrs[i] = (void *)raid6_empty_zero_page; count = 0; i = d0_idx; do { - int slot; - slot = raid6_idx_to_slot(i, sh, &count); + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + ptrs[slot] = page_address(sh->dev[i].page); + if (i == dd_idx1) faila = slot; if (i == dd_idx2) failb = slot; i = raid6_next_disk(i, disks); } while (i != d0_idx); - BUG_ON(count+2 != disks); + BUG_ON(count != syndrome_disks); BUG_ON(faila == failb); if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } @@ -1787,9 +1808,9 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); - if ( failb == disks-1 ) { + if (failb == syndrome_disks+1) { /* Q disk is one of the missing disks */ - if ( faila == disks-2 ) { + if (faila == syndrome_disks) { /* Missing P+Q, just recompute */ compute_parity6(sh, UPDATE_PARITY); return; @@ -1804,12 +1825,13 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } /* We're missing D+P or D+D; */ - if (failb == disks-2) { + if (failb == syndrome_disks) { /* We're missing D+P. */ - raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); + raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); } else { /* We're missing D+D. */ - raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); + raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, + ptrs); } /* Both the above update both missing blocks */ -- cgit v1.2.3 From 91adb56473febeeb3ef657bb5147ddd355465700 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md/raid5: refactor raid5 "run" .. so that the code to create the private data structures is separate. This will help with future code to change the level of an active array. 
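Schematically, the resulting split looks like this (an editor's simplified sketch, not the exact code in the diff below; error values assumed): setup_conf() builds and returns the private structure using the kernel's ERR_PTR convention, and run() merely consumes it:

	static raid5_conf_t *setup_conf(mddev_t *mddev)
	{
		raid5_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL);

		if (!conf)
			return ERR_PTR(-ENOMEM);
		/* validate new_level/new_layout/new_chunk, allocate the
		 * stripe cache, start the per-array thread ... */
		return conf;
	}

	static int run(mddev_t *mddev)
	{
		raid5_conf_t *conf = setup_conf(mddev);

		if (IS_ERR(conf))
			return PTR_ERR(conf);	/* -EIO, -EINVAL or -ENOMEM */
		mddev->private = conf;
		/* ... wire conf into the queue and activate the array */
		return 0;
	}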
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 267 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 152 insertions(+), 115 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index edbc80c4d34..d019a85547b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4164,95 +4164,49 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int run(mddev_t *mddev) +static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; int raid_disk, memory; mdk_rdev_t *rdev; struct disk_info *disk; - int working_disks = 0; - if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { + if (mddev->new_level != 5 + && mddev->new_level != 4 + && mddev->new_level != 6) { printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", - mdname(mddev), mddev->level); - return -EIO; + mdname(mddev), mddev->new_level); + return ERR_PTR(-EIO); } - if ((mddev->level == 5 && !algorithm_valid_raid5(mddev->layout)) || - (mddev->level == 6 && !algorithm_valid_raid6(mddev->layout))) { + if ((mddev->new_level == 5 + && !algorithm_valid_raid5(mddev->new_layout)) || + (mddev->new_level == 6 + && !algorithm_valid_raid6(mddev->new_layout))) { printk(KERN_ERR "raid5: %s: layout %d not supported\n", - mdname(mddev), mddev->layout); - return -EIO; + mdname(mddev), mddev->new_layout); + return ERR_PTR(-EIO); } - - if (mddev->chunk_size < PAGE_SIZE) { - printk(KERN_ERR "md/raid5: chunk_size must be at least " - "PAGE_SIZE but %d < %ld\n", - mddev->chunk_size, PAGE_SIZE); - return -EINVAL; + if (mddev->new_level == 6 && mddev->raid_disks < 4) { + printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", + mdname(mddev), mddev->raid_disks); + return ERR_PTR(-EINVAL); } - if (mddev->reshape_position != MaxSector) { - /* Check that we can continue the reshape. - * Currently only disks can change, it must - * increase, and we must be past the point where - * a stripe over-writes itself - */ - sector_t here_new, here_old; - int old_disks; - int max_degraded = (mddev->level == 5 ? 1 : 2); - - if (mddev->new_level != mddev->level || - mddev->new_layout != mddev->layout || - mddev->new_chunk != mddev->chunk_size) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } - if (mddev->delta_disks <= 0) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "(reduce disks) required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } - old_disks = mddev->raid_disks - mddev->delta_disks; - /* reshape_position must be on a new-stripe boundary, and one - * further up in new geometry must map after here in old - * geometry. 
- */ - here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->chunk_size>>9)* - (mddev->raid_disks - max_degraded))) { - printk(KERN_ERR "raid5: reshape_position not " - "on a stripe boundary\n"); - return -EINVAL; - } - /* here_new is the stripe we will write to */ - here_old = mddev->reshape_position; - sector_div(here_old, (mddev->chunk_size>>9)* - (old_disks-max_degraded)); - /* here_old is the first stripe that we might need to read - * from */ - if (here_new >= here_old) { - /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "raid5: reshape_position too early for " - "auto-recovery - aborting.\n"); - return -EINVAL; - } - printk(KERN_INFO "raid5: reshape will continue\n"); - /* OK, we should be able to continue; */ + if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { + printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", + mddev->new_chunk, mdname(mddev)); + return ERR_PTR(-EINVAL); } - - mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); - if ((conf = mddev->private) == NULL) + conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); + if (conf == NULL) goto abort; - if (mddev->reshape_position == MaxSector) { - conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; - } else { - conf->raid_disks = mddev->raid_disks; + + conf->raid_disks = mddev->raid_disks; + if (mddev->reshape_position == MaxSector) + conf->previous_raid_disks = mddev->raid_disks; + else conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; - } conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), GFP_KERNEL); @@ -4264,13 +4218,12 @@ static int run(mddev_t *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; - if (mddev->level == 6) { + if (mddev->new_level == 6) { conf->spare_page = alloc_page(GFP_KERNEL); if (!conf->spare_page) goto abort; } spin_lock_init(&conf->device_lock); - mddev->queue->queue_lock = &conf->device_lock; init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); @@ -4299,41 +4252,136 @@ static int run(mddev_t *mddev) printk(KERN_INFO "raid5: device %s operational as raid" " disk %d\n", bdevname(rdev->bdev,b), raid_disk); - working_disks++; } else /* Cannot rely on bitmap to complete recovery */ conf->fullsync = 1; } - /* - * 0 for a fully functional array, 1 or 2 for a degraded array. 
- */ - mddev->degraded = conf->raid_disks - working_disks; - conf->mddev = mddev; - conf->chunk_size = mddev->chunk_size; - conf->level = mddev->level; + conf->chunk_size = mddev->new_chunk; + conf->level = mddev->new_level; if (conf->level == 6) conf->max_degraded = 2; else conf->max_degraded = 1; - conf->algorithm = mddev->layout; + conf->algorithm = mddev->new_layout; conf->max_nr_stripes = NR_STRIPES; conf->expand_progress = mddev->reshape_position; - /* device size must be a multiple of chunk size */ - mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); - mddev->resync_max_sectors = mddev->dev_sectors; + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; + if (grow_stripes(conf, conf->max_nr_stripes)) { + printk(KERN_ERR + "raid5: couldn't allocate %dkB for buffers\n", memory); + goto abort; + } else + printk(KERN_INFO "raid5: allocated %dkB for %s\n", + memory, mdname(mddev)); - if (conf->level == 6 && conf->raid_disks < 4) { - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", - mdname(mddev), conf->raid_disks); + conf->thread = md_register_thread(raid5d, mddev, "%s_raid5"); + if (!conf->thread) { + printk(KERN_ERR + "raid5: couldn't allocate thread for %s\n", + mdname(mddev)); goto abort; } - if (!conf->chunk_size || conf->chunk_size % 4) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - conf->chunk_size, mdname(mddev)); - goto abort; + + return conf; + + abort: + if (conf) { + shrink_stripes(conf); + safe_put_page(conf->spare_page); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); + return ERR_PTR(-EIO); + } else + return ERR_PTR(-ENOMEM); +} + +static int run(mddev_t *mddev) +{ + raid5_conf_t *conf; + int working_disks = 0; + mdk_rdev_t *rdev; + + if (mddev->reshape_position != MaxSector) { + /* Check that we can continue the reshape. + * Currently only disks can change, it must + * increase, and we must be past the point where + * a stripe over-writes itself + */ + sector_t here_new, here_old; + int old_disks; + int max_degraded = (mddev->level == 5 ? 1 : 2); + + if (mddev->new_level != mddev->level || + mddev->new_layout != mddev->layout || + mddev->new_chunk != mddev->chunk_size) { + printk(KERN_ERR "raid5: %s: unsupported reshape " + "required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->delta_disks <= 0) { + printk(KERN_ERR "raid5: %s: unsupported reshape " + "(reduce disks) required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + old_disks = mddev->raid_disks - mddev->delta_disks; + /* reshape_position must be on a new-stripe boundary, and one + * further up in new geometry must map after here in old + * geometry. 
+ */ + here_new = mddev->reshape_position; + if (sector_div(here_new, (mddev->chunk_size>>9)* + (mddev->raid_disks - max_degraded))) { + printk(KERN_ERR "raid5: reshape_position not " + "on a stripe boundary\n"); + return -EINVAL; + } + /* here_new is the stripe we will write to */ + here_old = mddev->reshape_position; + sector_div(here_old, (mddev->chunk_size>>9)* + (old_disks-max_degraded)); + /* here_old is the first stripe that we might need to read + * from */ + if (here_new >= here_old) { + /* Reading from the same stripe as writing to - bad */ + printk(KERN_ERR "raid5: reshape_position too early for " + "auto-recovery - aborting.\n"); + return -EINVAL; + } + printk(KERN_INFO "raid5: reshape will continue\n"); + /* OK, we should be able to continue; */ + } else { + BUG_ON(mddev->level != mddev->new_level); + BUG_ON(mddev->layout != mddev->new_layout); + BUG_ON(mddev->chunk_size != mddev->new_chunk); + BUG_ON(mddev->delta_disks != 0); } + conf = setup_conf(mddev); + + if (conf == NULL) + return -EIO; + if (IS_ERR(conf)) + return PTR_ERR(conf); + + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; + + /* + * 0 for a fully functional array, 1 or 2 for a degraded array. + */ + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0 && + test_bit(In_sync, &rdev->flags)) + working_disks++; + + mddev->degraded = conf->raid_disks - working_disks; + if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" " (%d/%d failed)\n", @@ -4341,6 +4389,10 @@ static int run(mddev_t *mddev) goto abort; } + /* device size must be a multiple of chunk size */ + mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); + mddev->resync_max_sectors = mddev->dev_sectors; + if (mddev->degraded > 0 && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) @@ -4356,27 +4408,6 @@ static int run(mddev_t *mddev) } } - { - mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); - if (!mddev->thread) { - printk(KERN_ERR - "raid5: couldn't allocate thread for %s\n", - mdname(mddev)); - goto abort; - } - } - memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + - conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { - printk(KERN_ERR - "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(conf); - md_unregister_thread(mddev->thread); - goto abort; - } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", - memory, mdname(mddev)); - if (mddev->degraded == 0) printk("raid5: raid level %d set %s active with %d out of %d" " devices, algorithm %d\n", conf->level, mdname(mddev), @@ -4419,6 +4450,8 @@ static int run(mddev_t *mddev) "raid5: failed to create sysfs attributes for %s\n", mdname(mddev)); + mddev->queue->queue_lock = &conf->device_lock; + mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; @@ -4430,7 +4463,11 @@ static int run(mddev_t *mddev) return 0; abort: + if (mddev->thread) + md_unregister_thread(mddev->thread); + mddev->thread = NULL; if (conf) { + shrink_stripes(conf); print_raid5_conf(conf); safe_put_page(conf->spare_page); kfree(conf->disks); -- cgit v1.2.3 From e0cf8f045b2023b0b3f919ee93eb94345f648434 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md: md_unregister_thread should cope with being passed NULL Mostly md_unregister_thread is only called when we 
know that the thread is not NULL, but sometimes we need to check first. It is safer to put the check inside md_unregister_thread itself. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d019a85547b..81789fa7a02 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4463,8 +4463,7 @@ static int run(mddev_t *mddev) return 0; abort: - if (mddev->thread) - md_unregister_thread(mddev->thread); + md_unregister_thread(mddev->thread); mddev->thread = NULL; if (conf) { shrink_stripes(conf); -- cgit v1.2.3 From 245f46c2c221ef09c7db892f0e3fc2149be42052 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md: add ->takeover method to support changing the personality managing an array Implement this for RAID6 to be able to 'takeover' a RAID5 array. The new RAID6 will use a layout which places Q on the last device, and that device will be missing. If there are any available spares, one will immediately have Q recovered onto it. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 5 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 81789fa7a02..5b346b41e51 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -933,8 +933,10 @@ static int grow_stripes(raid5_conf_t *conf, int num) struct kmem_cache *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev)); - sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); + sprintf(conf->cache_name[0], + "raid%d-%s", conf->level, mdname(conf->mddev)); + sprintf(conf->cache_name[1], + "raid%d-%s-alt", conf->level, mdname(conf->mddev)); conf->active_name = 0; sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), @@ -4361,10 +4363,12 @@ static int run(mddev_t *mddev) BUG_ON(mddev->chunk_size != mddev->new_chunk); BUG_ON(mddev->delta_disks != 0); } - conf = setup_conf(mddev); - if (conf == NULL) - return -EIO; + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; + if (IS_ERR(conf)) return PTR_ERR(conf); @@ -4880,6 +4884,55 @@ static void raid5_quiesce(mddev_t *mddev, int state) } } +static struct mdk_personality raid5_personality; + +static void *raid6_takeover(mddev_t *mddev) +{ + /* Currently can only take over a raid5. We map the + * personality to an equivalent raid6 personality + * with the Q block at the end. 
+ */ + int new_layout; + + if (mddev->pers != &raid5_personality) + return ERR_PTR(-EINVAL); + if (mddev->degraded > 1) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks > 253) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks < 3) + return ERR_PTR(-EINVAL); + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + new_layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + new_layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 6; + mddev->new_layout = new_layout; + mddev->delta_disks = 1; + mddev->raid_disks += 1; + return setup_conf(mddev); +} + + static struct mdk_personality raid6_personality = { .name = "raid6", @@ -4900,6 +4953,7 @@ static struct mdk_personality raid6_personality = .start_reshape = raid5_start_reshape, #endif .quiesce = raid5_quiesce, + .takeover = raid6_takeover, }; static struct mdk_personality raid5_personality = { -- cgit v1.2.3 From d562b0c4313e3ddea402a400371afa47ddf679f9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md: add ->takeover method for raid5 to be able to take over raid1 The RAID1 must have two drives and be a suitable size to be a multiple of a chunksize that isn't too small. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5b346b41e51..611ea7bbf47 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4884,6 +4884,53 @@ static void raid5_quiesce(mddev_t *mddev, int state) } } + +static void *raid5_takeover_raid1(mddev_t *mddev) +{ + int chunksect; + + if (mddev->raid_disks != 2 || + mddev->degraded > 1) + return ERR_PTR(-EINVAL); + + /* Should check if there are write-behind devices? */ + + chunksect = 64*2; /* 64K by default */ + + /* The array must be an exact multiple of chunksize */ + while (chunksect && (mddev->array_sectors & (chunksect-1))) + chunksect >>= 1; + + if ((chunksect<<9) < STRIPE_SIZE) + /* array size does not allow a suitable chunk size */ + return ERR_PTR(-EINVAL); + + mddev->new_level = 5; + mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; + mddev->new_chunk = chunksect << 9; + + return setup_conf(mddev); +} + + +static void *raid5_takeover(mddev_t *mddev) +{ + /* raid5 can take over: + * raid0 - if all devices are the same - make it a raid4 layout + * raid1 - if there are two drives. We need to know the chunk size + * raid4 - trivial - just use a raid4 layout. 
+ * raid6 - Providing it is a *_6 layout + * + * For now, just do raid1 + */ + + if (mddev->level == 1) + return raid5_takeover_raid1(mddev); + + return ERR_PTR(-EINVAL); +} + + static struct mdk_personality raid5_personality; static void *raid6_takeover(mddev_t *mddev) @@ -4975,6 +5022,7 @@ static struct mdk_personality raid5_personality = .start_reshape = raid5_start_reshape, #endif .quiesce = raid5_quiesce, + .takeover = raid5_takeover, }; static struct mdk_personality raid4_personality = -- cgit v1.2.3 From b3546035277847028df650b147469fc943cf5c71 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:56:41 +1100 Subject: md/raid5: allow layout/chunksize to be changed on an active 2-drive raid5. 2-drive raid5's aren't very interesting. But if you are converting a raid1 into a raid5, you will at least temporarily have one. And that is a good time to set the layout/chunksize for the new RAID5 if you aren't happy with the defaults. layout and chunksize don't actually affect the placement of data on a 2-drive raid5, so we just do some internal book-keeping. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 611ea7bbf47..8a5e14e4a85 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4913,6 +4913,47 @@ static void *raid5_takeover_raid1(mddev_t *mddev) } +static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +{ + /* Currently the layout and chunk size can only be changed + * for a 2-drive raid array, as in that case no data shuffling + * is required. + * Later we might validate these and set new_* so a reshape + * can complete the change. + */ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (new_chunk & (new_chunk-1)) + /* not a power of 2 */ + return -EINVAL; + if (new_chunk < PAGE_SIZE) + return -EINVAL; + if (mddev->array_sectors & ((new_chunk>>9)-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + + if (mddev->raid_disks != 2) + return -EINVAL; + + if (new_layout >= 0) { + conf->algorithm = new_layout; + mddev->layout = mddev->new_layout = new_layout; + } + if (new_chunk > 0) { + conf->chunk_size = new_chunk; + mddev->chunk_size = mddev->new_chunk = new_chunk; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + return 0; +} + static void *raid5_takeover(mddev_t *mddev) { /* raid5 can take over: @@ -5023,6 +5064,7 @@ static struct mdk_personality raid5_personality = #endif .quiesce = raid5_quiesce, .takeover = raid5_takeover, + .reconfig = raid5_reconfig, }; static struct mdk_personality raid4_personality = -- cgit v1.2.3 From e9d4758f6e93488dc719a1445ce54659a570938f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:57:09 +1100 Subject: md: add takeover support for raid4 -> raid5 conversion. 
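The conversion needs no data movement: md's raid4 keeps parity on one fixed disk, which is exactly what the raid5 ALGORITHM_PARITY_N layout describes, so the takeover is pure relabelling. An editor's sketch of the idea; the actual hunk appears in the diff below:

	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;	/* parity stays on one disk */
		mddev->new_level = 5;
		return setup_conf(mddev);		/* no blocks move */
	}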
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8a5e14e4a85..95953273c4d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4967,6 +4967,11 @@ static void *raid5_takeover(mddev_t *mddev) if (mddev->level == 1) return raid5_takeover_raid1(mddev); + if (mddev->level == 4) { + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_level = 5; + return setup_conf(mddev); + } return ERR_PTR(-EINVAL); } -- cgit v1.2.3 From fc9739c6d626ee79a148ec367d143b0601299a9d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:57:20 +1100 Subject: md: add takeover support for converting raid6 back into raid5 If a raid6 is still in the layout that comes from converting raid5 into a raid6, this will allow us to convert it back again. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 95953273c4d..70b50af2bcd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4912,6 +4912,39 @@ static void *raid5_takeover_raid1(mddev_t *mddev) return setup_conf(mddev); } +static void *raid5_takeover_raid6(mddev_t *mddev) +{ + int new_layout; + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC_6: + new_layout = ALGORITHM_LEFT_ASYMMETRIC; + break; + case ALGORITHM_RIGHT_ASYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + new_layout = ALGORITHM_LEFT_SYMMETRIC; + break; + case ALGORITHM_RIGHT_SYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_SYMMETRIC; + break; + case ALGORITHM_PARITY_0_6: + new_layout = ALGORITHM_PARITY_0; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 5; + mddev->new_layout = new_layout; + mddev->delta_disks = -1; + mddev->raid_disks -= 1; + return setup_conf(mddev); +} + static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) { @@ -4972,6 +5005,8 @@ static void *raid5_takeover(mddev_t *mddev) mddev->new_level = 5; return setup_conf(mddev); } + if (mddev->level == 6) + return raid5_takeover_raid6(mddev); return ERR_PTR(-EINVAL); } -- cgit v1.2.3 From 80c3a6ce4ba4470379b9e6a4d9bcd9d2ee26ae03 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 17 Mar 2009 18:10:40 -0700 Subject: md: add 'size' as a personality method In preparation for giving userspace control over ->array_sectors we need to be able to retrieve the 'default' size, and the 'anticipated' size when a reshape is requested. For personalities that do not reshape emit a warning if anything but the default size is requested. In the raid5 case we need to update ->previous_raid_disks to make the new 'default' size available. 
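Worked example of the new method's semantics (an editor's sketch with made-up numbers; the real helper is the raid5_size() added in the diff below):

	#include <stdio.h>

	int main(void)
	{
		/* hypothetical 4-disk raid5 with 64KiB chunks */
		unsigned long long dev_sectors = 1000200;
		unsigned long long chunk_sectors = 65536 / 512;	/* 128 */
		int raid_disks = 4, max_degraded = 1;

		/* each device is rounded down to whole chunks ... */
		dev_sectors &= ~(chunk_sectors - 1);		/* 1000192 */
		/* ... and only the data disks contribute capacity */
		printf("%llu sectors\n",
		       dev_sectors * (raid_disks - max_degraded));
		return 0;	/* prints 3000576 */
	}

Passing sectors == 0 or raid_disks == 0 to the method asks for the defaults (mddev->dev_sectors and the current disk count respectively).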
Reviewed-by: Andre Noll Signed-off-by: Dan Williams --- drivers/md/raid5.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 70b50af2bcd..2cd619ff076 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4166,6 +4166,20 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; +static sector_t +raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (!sectors) + sectors = mddev->dev_sectors; + if (!raid_disks) + raid_disks = conf->previous_raid_disks; + + sectors &= ~((sector_t)mddev->chunk_size/512 - 1); + return sectors * (raid_disks - conf->max_degraded); +} + static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; @@ -4460,8 +4474,7 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_sectors = mddev->dev_sectors * - (conf->previous_raid_disks - conf->max_degraded); + mddev->array_sectors = raid5_size(mddev, 0, 0); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); @@ -4684,11 +4697,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - raid5_conf_t *conf = mddev_to_conf(mddev); - sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_sectors = sectors * (mddev->raid_disks - - conf->max_degraded); + mddev->array_sectors = raid5_size(mddev, sectors, mddev->raid_disks); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { @@ -4824,10 +4834,12 @@ static void end_reshape(raid5_conf_t *conf) struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_sectors = conf->mddev->dev_sectors * - (conf->raid_disks - conf->max_degraded); - set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); - conf->mddev->changed = 1; + mddev_t *mddev = conf->mddev; + + mddev->array_sectors = raid5_size(mddev, 0, conf->raid_disks); + set_capacity(mddev->gendisk, mddev->array_sectors); + mddev->changed = 1; + conf->previous_raid_disks = conf->raid_disks; bdev = bdget_disk(conf->mddev->gendisk, 0); if (bdev) { @@ -5076,6 +5088,7 @@ static struct mdk_personality raid6_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, + .size = raid5_size, #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, @@ -5098,6 +5111,7 @@ static struct mdk_personality raid5_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, + .size = raid5_size, #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, @@ -5122,6 +5136,7 @@ static struct mdk_personality raid4_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, + .size = raid5_size, #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, -- cgit v1.2.3 From 1f403624bde3c678a166984b1e6a727a0ce06f2b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 31 Mar 2009 14:59:03 +1100 Subject: md: centralize ->array_sectors modifications Get personalities out of the business of directly modifying 
->array_sectors. Lays groundwork to introduce policy on when ->array_sectors can be modified. Reviewed-by: Andre Noll Signed-off-by: Dan Williams --- drivers/md/raid5.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2cd619ff076..2930fc26a85 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4474,7 +4474,7 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_sectors = raid5_size(mddev, 0, 0); + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); @@ -4698,7 +4698,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) * worth it. */ sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_sectors = raid5_size(mddev, sectors, mddev->raid_disks); + md_set_array_sectors(mddev, raid5_size(mddev, sectors, + mddev->raid_disks)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { @@ -4836,7 +4837,8 @@ static void end_reshape(raid5_conf_t *conf) if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { mddev_t *mddev = conf->mddev; - mddev->array_sectors = raid5_size(mddev, 0, conf->raid_disks); + md_set_array_sectors(mddev, raid5_size(mddev, 0, + conf->raid_disks)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; conf->previous_raid_disks = conf->raid_disks; -- cgit v1.2.3 From b522adcde9c4d3fb7b579cfa9160d8bde7744be8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 31 Mar 2009 15:00:31 +1100 Subject: md: 'array_size' sysfs attribute Allow userspace to set the size of the array according to the following semantics: 1/ size must be <= the size returned by mddev->pers->size(mddev, 0, 0) a) If size is set before the array is running, do_md_run will fail if size is greater than the default size b) A reshape attempt that reduces the default size to less than the set array size should be blocked 2/ once userspace sets the size the kernel will not change it 3/ writing 'default' to this attribute returns control of the size to the kernel and reverts to the size reported by the personality Also, convert locations that need to know the default size from directly reading ->array_sectors to <pers>_size. Resync/reshape operations always follow the default size. Finally, fixup other locations that read a number of 1k-blocks from userspace to use strict_blocks_to_sectors() which checks for unsigned long long to sector_t overflow and blocks to sectors overflow. 
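A sketch of the conversion helper described in the last paragraph (from the editor; close to, but not guaranteed identical to, what the series adds to md.c):

	static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
	{
		unsigned long long blocks;
		sector_t new;

		if (strict_strtoull(buf, 10, &blocks) < 0)
			return -EINVAL;

		if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
			return -EINVAL; /* doubling would overflow u64 */

		new = blocks * 2;	/* 1K blocks -> 512-byte sectors */
		if (new != blocks * 2)
			return -EINVAL; /* sector_t is 32bit and too small */

		*sectors = new;
		return 0;
	}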
Reviewed-by: Andre Noll Signed-off-by: Dan Williams --- drivers/md/raid5.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2930fc26a85..1aebd3ef80b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3703,6 +3703,8 @@ static int make_request(struct request_queue *q, struct bio * bi) return 0; } +static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks); + static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) { /* reshaping is quite different to recovery/resync so it is @@ -3781,7 +3783,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped j == sh->qd_idx) continue; s = compute_blocknr(sh, j); - if (s < mddev->array_sectors) { + if (s < raid5_size(mddev, 0, 0)) { skipped = 1; continue; } @@ -4700,6 +4702,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) sectors &= ~((sector_t)mddev->chunk_size/512 - 1); md_set_array_sectors(mddev, raid5_size(mddev, sectors, mddev->raid_disks)); + if (mddev->array_sectors > + raid5_size(mddev, sectors, mddev->raid_disks)) + return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { @@ -4837,7 +4842,7 @@ static void end_reshape(raid5_conf_t *conf) if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { mddev_t *mddev = conf->mddev; - md_set_array_sectors(mddev, raid5_size(mddev, 0, + md_set_array_sectors_lock(mddev, raid5_size(mddev, 0, conf->raid_disks)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; -- cgit v1.2.3 From 18b0033491f584a2d79697da714b1ef9d6b27d22 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 31 Mar 2009 15:00:56 +1100 Subject: md: raid5 run(): Fix max_degraded for raid level 4. raid4 allows only one failed disk. Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1aebd3ef80b..e1ee181b79b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4331,7 +4331,7 @@ static int run(mddev_t *mddev) */ sector_t here_new, here_old; int old_disks; - int max_degraded = (mddev->level == 5 ? 1 : 2); + int max_degraded = (mddev->level == 6 ? 2 : 1); if (mddev->new_level != mddev->level || mddev->new_layout != mddev->layout || -- cgit v1.2.3 From f701d589aa34d7531183c9ac6f7713ba14212b02 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 31 Mar 2009 15:09:39 +1100 Subject: md/raid6: move raid6 data processing to raid6_pq.ko Move the raid6 data processing routines into a standalone module (raid6_pq) to prepare them to be called from async_tx wrappers and other non-md drivers/modules. This precludes a circular dependency of raid456 needing the async modules for data processing while those modules in turn depend on raid456 for the base level synchronous raid6 routines. 
To support this move: 1/ The exportable definitions in raid6.h move to include/linux/raid/pq.h 2/ The raid6_call, recovery calls, and table symbols are exported 3/ Extra #ifdef __KERNEL__ statements to enable the userspace raid6test to compile Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/raid5.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e1ee181b79b..1f1b054ff0b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -45,11 +45,11 @@ #include #include +#include #include #include #include "md.h" #include "raid5.h" -#include "raid6.h" #include "bitmap.h" /* @@ -94,11 +94,6 @@ #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) -#if !RAID6_USE_EMPTY_ZERO_PAGE -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -#endif - /* * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits @@ -5153,11 +5148,6 @@ static struct mdk_personality raid4_personality = static int __init raid5_init(void) { - int e; - - e = raid6_select_algo(); - if ( e ) - return e; register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); -- cgit v1.2.3 From 34e04e87fb8b2c62c9e8868f41c8179d0e15f51a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:10:16 +1100 Subject: md/raid5: drop qd_idx from r6_state We now have this value in stripe_head so we don't need to duplicate it. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1f1b054ff0b..3930b3e9aa0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2371,7 +2371,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf, struct r6_state *r6s, int disks) { int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; - int qd_idx = r6s->qd_idx; + int qd_idx = sh->qd_idx; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Would I have to read this buffer for reconstruct_write */ @@ -2561,7 +2561,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, int update_p = 0, update_q = 0; struct r5dev *dev; int pd_idx = sh->pd_idx; - int qd_idx = r6s->qd_idx; + int qd_idx = sh->qd_idx; set_bit(STRIPE_HANDLE, &sh->state); @@ -2657,7 +2657,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, struct dma_async_tx_descriptor *tx = NULL; clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i = 0; i < sh->disks; i++) - if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { + if (i != sh->pd_idx && i != sh->qd_idx) { int dd_idx, j; struct stripe_head *sh2; @@ -2984,17 +2984,16 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; struct bio *return_bi = NULL; - int i, pd_idx = sh->pd_idx; + int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; struct stripe_head_state s; struct r6_state r6s; struct r5dev *dev, *pdev, *qdev; mdk_rdev_t *blocked_rdev = NULL; - r6s.qd_idx = sh->qd_idx; pr_debug("handling stripe %llu, state=%#lx cnt=%d, " "pd_idx=%d, qd_idx=%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), pd_idx, r6s.qd_idx); + atomic_read(&sh->count), pd_idx, qd_idx); 
memset(&s, 0, sizeof(s)); spin_lock(&sh->lock); @@ -3105,9 +3104,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) pdev = &sh->dev[pd_idx]; r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); - qdev = &sh->dev[r6s.qd_idx]; - r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) - || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); + qdev = &sh->dev[qd_idx]; + r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) + || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); if ( s.written && ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) -- cgit v1.2.3 From 7ec0547838976d088dfb9cb0adb073e6e8a15aa3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:10:36 +1100 Subject: md/raid5: enhance raid5_size to work correctly with negative delta_disks This is the first of four patches which combine to allow md/raid5 to reduce the number of devices in the array by restriping the data over a subset of the devices. If the number of disks in a raid4/5/6 is being reduced, then the default size must be based on the new number, not the old number of devices. In general, it should be based on the smaller of new and old. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3930b3e9aa0..5694eb8941b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4169,8 +4169,13 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) if (!sectors) sectors = mddev->dev_sectors; - if (!raid_disks) - raid_disks = conf->previous_raid_disks; + if (!raid_disks) { + /* size is defined by the smallest of previous and new size */ + if (conf->raid_disks < conf->previous_raid_disks) + raid_disks = conf->raid_disks; + else + raid_disks = conf->previous_raid_disks; + } sectors &= ~((sector_t)mddev->chunk_size/512 - 1); return sectors * (raid_disks - conf->max_degraded); -- cgit v1.2.3 From cea9c22800773cecb1d41f4a6139f9eb6a95368b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:15:05 +1100 Subject: md: add explicit method to signal the end of a reshape. Currently raid5 (the only module that supports restriping) notices that the reshape has finished by sync_request being given a large value, and handles any cleanup then. This patch changes it so md_check_recovery calls into an explicit finish_reshape method as well. The clean-up from sync_request can do things that need to be done promptly, typically things local to the raid5_conf_t structure. The "finish_reshape" method is called under the mddev_lock so it can do things involving reconfiguring the device. This allows us to get rid of md_set_array_sectors_locked, which would have caused a deadlock if you tried to stop an array while a reshape was happening. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 50 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 20 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5694eb8941b..a0f22dd3323 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3850,6 +3850,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski if (sector_nr >= max_sector) { /* just being told to finish up .. 
nothing much to do */ unplug_slaves(mddev); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { end_reshape(conf); return 0; @@ -4836,43 +4837,49 @@ static int raid5_start_reshape(mddev_t *mddev) static void end_reshape(raid5_conf_t *conf) { - struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - mddev_t *mddev = conf->mddev; - - md_set_array_sectors_lock(mddev, raid5_size(mddev, 0, - conf->raid_disks)); - set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; - conf->previous_raid_disks = conf->raid_disks; - bdev = bdget_disk(conf->mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)conf->mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } spin_lock_irq(&conf->device_lock); + conf->previous_raid_disks = conf->raid_disks; conf->expand_progress = MaxSector; spin_unlock_irq(&conf->device_lock); - conf->mddev->reshape_position = MaxSector; /* read-ahead size must cover two whole stripes, which is * 2 * (datadisks) * chunksize where 'n' is the number of raid devices */ { - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - (conf->mddev->chunk_size / PAGE_SIZE); + int data_disks = conf->raid_disks - conf->max_degraded; + int stripe = data_disks * (conf->chunk_size + / PAGE_SIZE); if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } } } +static void raid5_finish_reshape(mddev_t *mddev) +{ + struct block_device *bdev; + + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + set_capacity(mddev->gendisk, mddev->array_sectors); + mddev->changed = 1; + mddev->reshape_position = MaxSector; + + bdev = bdget_disk(mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, + (loff_t)mddev->array_sectors << 9); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + } +} + static void raid5_quiesce(mddev_t *mddev, int state) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -5098,6 +5105,7 @@ static struct mdk_personality raid6_personality = #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, #endif .quiesce = raid5_quiesce, .takeover = raid6_takeover, @@ -5121,6 +5129,7 @@ static struct mdk_personality raid5_personality = #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, #endif .quiesce = raid5_quiesce, .takeover = raid5_takeover, @@ -5146,6 +5155,7 @@ static struct mdk_personality raid4_personality = #ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, #endif .quiesce = raid5_quiesce, }; -- cgit v1.2.3 From fef9c61fdfabf97a307c2cf3621a6949f0a4b995 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:16:46 +1100 Subject: md/raid5: change reshape-progress measurement to cope with reshaping backwards. When reducing the number of devices in a raid4/5/6, the reshape process has to start at the end of the array and work down to the beginning. So we need to handle expand_progress and expand_lo differently. 
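The essence of the change is that every test against the progress marker becomes direction-dependent; schematically (an editor's sketch of the comparison that recurs throughout the diff below):

	/* Is this address still laid out in the old ("previous") geometry? */
	static int before_reshape(mddev_t *mddev, raid5_conf_t *conf,
				  sector_t logical_sector)
	{
		if (mddev->delta_disks < 0)
			/* shrinking: reshape walks from the end downwards */
			return logical_sector < conf->reshape_progress;
		/* growing: reshape walks from the start upwards */
		return logical_sector >= conf->reshape_progress;
	}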
This patch renames "expand_progress" and "expand_lo" to avoid the implication that anything is getting bigger (expand->reshape) and every place they are used, we make sure that they are used the right way depending on whether delta_disks is positive or negative. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 94 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 33 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a0f22dd3323..1023c4e48a9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3593,24 +3593,28 @@ static int make_request(struct request_queue *q, struct bio * bi) retry: previous = 0; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - if (likely(conf->expand_progress == MaxSector)) + if (likely(conf->reshape_progress == MaxSector)) disks = conf->raid_disks; else { - /* spinlock is needed as expand_progress may be + /* spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be * possible to see a half-updated value - * Ofcourse expand_progress could change after + * Ofcourse reshape_progress could change after * the lock is dropped, so once we get a reference * to the stripe that we think it is, we will have * to check again. */ spin_lock_irq(&conf->device_lock); disks = conf->raid_disks; - if (logical_sector >= conf->expand_progress) { + if (mddev->delta_disks < 0 + ? logical_sector < conf->reshape_progress + : logical_sector >= conf->reshape_progress) { disks = conf->previous_raid_disks; previous = 1; } else { - if (logical_sector >= conf->expand_lo) { + if (mddev->delta_disks < 0 + ? logical_sector < conf->reshape_safe + : logical_sector >= conf->reshape_safe) { spin_unlock_irq(&conf->device_lock); schedule(); goto retry; @@ -3630,7 +3634,7 @@ static int make_request(struct request_queue *q, struct bio * bi) sh = get_active_stripe(conf, new_sector, previous, (bi->bi_rw&RWA_MASK)); if (sh) { - if (unlikely(conf->expand_progress != MaxSector)) { + if (unlikely(conf->reshape_progress != MaxSector)) { /* expansion might have moved on while waiting for a * stripe, so we must do the range check again. * Expansion could still move past after this @@ -3641,8 +3645,10 @@ static int make_request(struct request_queue *q, struct bio * bi) */ int must_retry = 0; spin_lock_irq(&conf->device_lock); - if (logical_sector < conf->expand_progress && - disks == conf->previous_raid_disks) + if ((mddev->delta_disks < 0 + ? 
logical_sector >= conf->reshape_progress + : logical_sector < conf->reshape_progress) + && disks == conf->previous_raid_disks) /* mismatch, need to try again */ must_retry = 1; spin_unlock_irq(&conf->device_lock); @@ -3720,13 +3726,20 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped int dd_idx; sector_t writepos, safepos, gap; - if (sector_nr == 0 && - conf->expand_progress != 0) { - /* restarting in the middle, skip the initial sectors */ - sector_nr = conf->expand_progress; + if (sector_nr == 0) { + /* If restarting in the middle, skip the initial sectors */ + if (mddev->delta_disks < 0 && + conf->reshape_progress < raid5_size(mddev, 0, 0)) { + sector_nr = raid5_size(mddev, 0, 0) + - conf->reshape_progress; + } else if (mddev->delta_disks > 0 && + conf->reshape_progress > 0) + sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); - *skipped = 1; - return sector_nr; + if (sector_nr) { + *skipped = 1; + return sector_nr; + } } /* we update the metadata when there is more than 3Meg @@ -3734,28 +3747,37 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * probably be time based) or when the data about to be * copied would over-write the source of the data at * the front of the range. - * i.e. one new_stripe forward from expand_progress new_maps - * to after where expand_lo old_maps to + * i.e. one new_stripe along from reshape_progress new_maps + * to after where reshape_safe old_maps to */ - writepos = conf->expand_progress + - conf->chunk_size/512*(new_data_disks); + writepos = conf->reshape_progress; sector_div(writepos, new_data_disks); - safepos = conf->expand_lo; + safepos = conf->reshape_safe; sector_div(safepos, data_disks); - gap = conf->expand_progress - conf->expand_lo; + if (mddev->delta_disks < 0) { + writepos -= conf->chunk_size/512; + safepos += conf->chunk_size/512; + gap = conf->reshape_safe - conf->reshape_progress; + } else { + writepos += conf->chunk_size/512; + safepos -= conf->chunk_size/512; + gap = conf->reshape_progress - conf->reshape_safe; + } - if (writepos >= safepos || + if ((mddev->delta_disks < 0 + ? writepos < safepos + : writepos > safepos) || gap > (new_data_disks)*3000*2 /*3Meg*/) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); - mddev->reshape_position = conf->expand_progress; + mddev->reshape_position = conf->reshape_progress; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || kthread_should_stop()); spin_lock_irq(&conf->device_lock); - conf->expand_lo = mddev->reshape_position; + conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); } @@ -3792,7 +3814,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped release_stripe(sh); } spin_lock_irq(&conf->device_lock); - conf->expand_progress = (sector_nr + i) * new_data_disks; + if (mddev->delta_disks < 0) + conf->reshape_progress -= i * new_data_disks; + else + conf->reshape_progress += i * new_data_disks; spin_unlock_irq(&conf->device_lock); /* Ok, those stripe are ready. We can start scheduling * reads on the source stripes. @@ -3823,14 +3848,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped /* Cannot proceed until we've updated the superblock... 
*/ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) || kthread_should_stop()); spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); } @@ -4283,7 +4308,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->max_degraded = 1; conf->algorithm = mddev->new_layout; conf->max_nr_stripes = NR_STRIPES; - conf->expand_progress = mddev->reshape_position; + conf->reshape_progress = mddev->reshape_position; memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; @@ -4441,9 +4466,9 @@ static int run(mddev_t *mddev) print_raid5_conf(conf); - if (conf->expand_progress != MaxSector) { + if (conf->reshape_progress != MaxSector) { printk("...ok start reshape thread\n"); - conf->expand_lo = conf->expand_progress; + conf->reshape_safe = conf->reshape_progress; atomic_set(&conf->reshape_stripes, 0); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -4782,8 +4807,11 @@ static int raid5_start_reshape(mddev_t *mddev) spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; conf->raid_disks += mddev->delta_disks; - conf->expand_progress = 0; - conf->expand_lo = 0; + if (mddev->delta_disks < 0) + conf->reshape_progress = raid5_size(mddev, 0, 0); + else + conf->reshape_progress = 0; + conf->reshape_safe = conf->reshape_progress; spin_unlock_irq(&conf->device_lock); /* Add some new drives, as many as will fit. @@ -4825,7 +4853,7 @@ static int raid5_start_reshape(mddev_t *mddev) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; - conf->expand_progress = MaxSector; + conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; } @@ -4842,7 +4870,7 @@ static void end_reshape(raid5_conf_t *conf) spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - conf->expand_progress = MaxSector; + conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); /* read-ahead size must cover two whole stripes, which is -- cgit v1.2.3 From ec32a2bd35bd6b933a5db6542c48210ce069a376 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:17:38 +1100 Subject: md: allow number of drives in raid5 to be reduced When reshaping a raid5 to have fewer devices, we work from the end of the array to the beginning. md_do_sync gives addresses to sync_request that go from the beginning to the end. So largely ignore them and use the internal state variable "reshape_progress" to keep track of what to do next. Never allow the size to be reduced below the minimum (4 for raid6, 3 otherwise). We require that the size of the array has already been reduced before the array is reshaped to a smaller size. This is because simply reducing the size is an easily reversible operation, while the reshape is immediately destructive and so is not reversible for the blocks at the ends of the devices. Thus to reshape an array to have fewer devices, you must first write an appropriately small size to md/array_size. 
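Concretely (an editor's example with made-up sizes): shrinking a raid5 from 5 to 4 one-terabyte members drops the data disks from 4 to 3, so the array must first be shrunk to the smaller geometry's capacity:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long dev_sectors = 2147483648ULL; /* 1TiB, chunk-aligned */
		int max_degraded = 1, old_disks = 5, new_disks = 4;

		printf("current size:  %llu sectors\n",
		       dev_sectors * (old_disks - max_degraded));
		/* write this value (as 1K blocks) to md/array_size first */
		printf("required size: %llu sectors (%llu blocks)\n",
		       dev_sectors * (new_disks - max_degraded),
		       dev_sectors * (new_disks - max_degraded) / 2);
		return 0;
	}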
When the reshape finishes, we remove any drives that are no longer needed and fix up ->degraded. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 124 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 87 insertions(+), 37 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1023c4e48a9..76eed592371 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3725,6 +3725,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped int i; int dd_idx; sector_t writepos, safepos, gap; + sector_t stripe_addr; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -3782,10 +3783,21 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wake_up(&conf->wait_for_overlap); } + if (mddev->delta_disks < 0) { + BUG_ON(conf->reshape_progress == 0); + stripe_addr = writepos; + BUG_ON((mddev->dev_sectors & + ~((sector_t)mddev->chunk_size / 512 - 1)) + - (conf->chunk_size / 512) - stripe_addr + != sector_nr); + } else { + BUG_ON(writepos != sector_nr + conf->chunk_size / 512); + stripe_addr = sector_nr; + } for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { int j; int skipped = 0; - sh = get_active_stripe(conf, sector_nr+i, 0, 0); + sh = get_active_stripe(conf, stripe_addr+i, 0, 0); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3825,10 +3837,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * block on the destination stripes. */ first_sector = - raid5_compute_sector(conf, sector_nr*(new_data_disks), + raid5_compute_sector(conf, stripe_addr*(new_data_disks), 1, &dd_idx, NULL); last_sector = - raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512) + raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512) *(new_data_disks) - 1), 1, &dd_idx, NULL); if (last_sector >= mddev->dev_sectors) @@ -4366,12 +4378,6 @@ static int run(mddev_t *mddev) mdname(mddev)); return -EINVAL; } - if (mddev->delta_disks <= 0) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "(reduce disks) required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } old_disks = mddev->raid_disks - mddev->delta_disks; /* reshape_position must be on a new-stripe boundary, and one * further up in new geometry must map after here in old * geometry. @@ -4648,6 +4654,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number) print_raid5_conf(conf); rdev = p->rdev; if (rdev) { + if (number >= conf->raid_disks && + conf->reshape_progress == MaxSector) + clear_bit(In_sync, &rdev->flags); + if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; @@ -4657,7 +4667,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number) * isn't possible. 
*/ if (!test_bit(Faulty, &rdev->flags) && - mddev->degraded <= conf->max_degraded) { + mddev->degraded <= conf->max_degraded && + number < conf->raid_disks) { err = -EBUSY; goto abort; } @@ -4745,16 +4756,26 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); - int err; - if (mddev->delta_disks < 0 || - mddev->new_level != mddev->level) - return -EINVAL; /* Cannot shrink array or change level yet */ if (mddev->delta_disks == 0) return 0; /* nothing to do */ if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; + if (mddev->degraded > conf->max_degraded) + return -EINVAL; + if (mddev->delta_disks < 0) { + /* We might be able to shrink, but the devices must + * be made bigger first. + * For raid6, 4 is the minimum size. + * Otherwise 2 is the minimum + */ + int min = 2; + if (mddev->level == 6) + min = 4; + if (mddev->raid_disks + mddev->delta_disks < min) + return -EINVAL; + } /* Can only proceed if there are plenty of stripe_heads. * We need a minimum of one full stripe,, and for sensible progress @@ -4771,14 +4792,7 @@ static int raid5_check_reshape(mddev_t *mddev) return -ENOSPC; } - err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); - if (err) - return err; - - if (mddev->degraded > conf->max_degraded) - return -EINVAL; - /* looks like we might be able to manage this */ - return 0; + return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); } static int raid5_start_reshape(mddev_t *mddev) @@ -4803,6 +4817,17 @@ static int raid5_start_reshape(mddev_t *mddev) */ return -EINVAL; + /* Refuse to reduce size of the array. Any reductions in + * array size must be through explicit setting of array_size + * attribute. + */ + if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) + < mddev->array_sectors) { + printk(KERN_ERR "md: %s: array size must be reduced " + "before number of disks\n", mdname(mddev)); + return -EINVAL; + } + atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; @@ -4836,9 +4861,12 @@ static int raid5_start_reshape(mddev_t *mddev) break; } - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; - spin_unlock_irqrestore(&conf->device_lock, flags); + if (mddev->delta_disks > 0) { + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) + - added_devices; + spin_unlock_irqrestore(&conf->device_lock, flags); + } mddev->raid_disks = conf->raid_disks; mddev->reshape_position = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -4863,6 +4891,9 @@ static int raid5_start_reshape(mddev_t *mddev) } #endif +/* This is called from the reshape thread and should make any + * changes needed in 'conf' + */ static void end_reshape(raid5_conf_t *conf) { @@ -4886,25 +4917,44 @@ static void end_reshape(raid5_conf_t *conf) } } +/* This is called from the raid5d thread with mddev_lock held. + * It makes config changes to the device. 
+ */ static void raid5_finish_reshape(mddev_t *mddev) { struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; - mddev->reshape_position = MaxSector; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); + if (mddev->delta_disks > 0) { + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + set_capacity(mddev->gendisk, mddev->array_sectors); + mddev->changed = 1; + + bdev = bdget_disk(mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, + (loff_t)mddev->array_sectors << 9); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + } else { + int d; + raid5_conf_t *conf = mddev_to_conf(mddev); + mddev->degraded = conf->raid_disks; + for (d = 0; d < conf->raid_disks ; d++) + if (conf->disks[d].rdev && + test_bit(In_sync, + &conf->disks[d].rdev->flags)) + mddev->degraded--; + for (d = conf->raid_disks ; + d < conf->raid_disks - mddev->delta_disks; + d++) + raid5_remove_disk(mddev, d); } + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; } } -- cgit v1.2.3 From 86b42c713be3e5f6807aa14b4cbdb005d35c64d5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:19:03 +1100 Subject: md/raid5: clearly differentiate 'before' and 'after' stripes during reshape. During a raid5 reshape, we have some stripes in the cache that are 'before' the reshape (and are still to be processed) and some that are 'after'. They are currently differentiated by having different ->disks values as the only reshape currently supported involves changing the number of disks. However we will soon support reshapes that do not change the number of disks (chunk parity or chunk size). So make the difference more explicit with a 'generation' number. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 76eed592371..73cdf43a647 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -318,6 +318,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) remove_hash(sh); + sh->generation = conf->generation - previous; sh->disks = previous ?
conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; stripe_set_idx(sector, conf, previous, sh); @@ -341,7 +342,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) insert_hash(conf, sh); } -static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) +static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, + short generation) { struct stripe_head *sh; struct hlist_node *hn; @@ -349,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in CHECK_DEVLOCK(); pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) - if (sh->sector == sector && sh->disks == disks) + if (sh->sector == sector && sh->generation == generation) return sh; pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); return NULL; @@ -363,7 +365,6 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, int previous, int noblock) { struct stripe_head *sh; - int disks = previous ? conf->previous_raid_disks : conf->raid_disks; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -373,7 +374,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, conf->device_lock, /* nothing */); - sh = __find_stripe(conf, sector, disks); + sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!conf->inactive_blocked) sh = get_free_stripe(conf); @@ -3648,7 +3649,7 @@ static int make_request(struct request_queue *q, struct bio * bi) if ((mddev->delta_disks < 0 ? logical_sector >= conf->reshape_progress : logical_sector < conf->reshape_progress) - && disks == conf->previous_raid_disks) + && previous) /* mismatch, need to try again */ must_retry = 1; spin_unlock_irq(&conf->device_lock); @@ -4837,6 +4838,7 @@ static int raid5_start_reshape(mddev_t *mddev) else conf->reshape_progress = 0; conf->reshape_safe = conf->reshape_progress; + conf->generation++; spin_unlock_irq(&conf->device_lock); /* Add some new drives, as many as will fit. -- cgit v1.2.3 From 784052ecc6ade6b6acf4f67e4ada8e5f2e6df446 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:19:07 +1100 Subject: md/raid5: prepare for allowing reshape to change chunksize. Add "prev_chunk" to raid5_conf_t, similar to "previous_raid_disks", to remember what the chunk size was before the reshape that is currently underway. This seems like duplication with "chunk_size" and "new_chunk" in mddev_t, and to some extent it is, but there are differences. The values in mddev_t are always defined and often the same. The prev* values are only defined if a reshape is underway. Also (and more significantly) the raid5_conf_t values will be changed at the same time (inside an appropriate lock) that the reshape is started by setting reshape_position. In contrast, the new_chunk value is set when the sysfs file is written which could be well before the reshape starts. 
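As a minimal editor's sketch of the prev_chunk idea (not from the patch itself; the struct and helper names are invented): the pre-reshape chunk size is only meaningful while a reshape is in flight, and address computations pick between the two values based on a 'previous' flag.

#include <assert.h>

struct chunk_geom {
	int chunk_size;	/* current/target chunk, in bytes */
	int prev_chunk;	/* pre-reshape chunk, in bytes; valid only mid-reshape */
	int reshaping;	/* non-zero while a reshape is underway */
};

/* Select the chunk for either the old or the new layout, in sectors. */
static int sectors_per_chunk(const struct chunk_geom *g, int previous)
{
	if (previous)
		assert(g->reshaping);	/* prev_chunk is undefined otherwise */
	return (previous ? g->prev_chunk : g->chunk_size) >> 9;
}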
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 73cdf43a647..7638cc31e7e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -299,7 +299,7 @@ static int grow_buffers(struct stripe_head *sh, int num) return 0; } -static void raid5_build_block(struct stripe_head *sh, int i); +static void raid5_build_block(struct stripe_head *sh, int i, int previous); static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, struct stripe_head *sh); @@ -337,7 +337,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) BUG(); } dev->flags = 0; - raid5_build_block(sh, i); + raid5_build_block(sh, i, previous); } insert_hash(conf, sh); } @@ -1212,9 +1212,9 @@ static void raid5_end_write_request(struct bio *bi, int error) } -static sector_t compute_blocknr(struct stripe_head *sh, int i); +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); -static void raid5_build_block(struct stripe_head *sh, int i) +static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; @@ -1230,7 +1230,7 @@ static void raid5_build_block(struct stripe_head *sh, int i) dev->req.bi_private = sh; dev->flags = 0; - dev->sector = compute_blocknr(sh, i); + dev->sector = compute_blocknr(sh, i, previous); } static void error(mddev_t *mddev, mdk_rdev_t *rdev) @@ -1273,7 +1273,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, int pd_idx, qd_idx; int ddf_layout = 0; sector_t new_sector; - int sectors_per_chunk = conf->chunk_size >> 9; + int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); int raid_disks = previous ? conf->previous_raid_disks : conf->raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -1472,13 +1473,14 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, } -static sector_t compute_blocknr(struct stripe_head *sh, int i) +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) { raid5_conf_t *conf = sh->raid_conf; int raid_disks = sh->disks; int data_disks = raid_disks - conf->max_degraded; sector_t new_sector = sh->sector, check; - int sectors_per_chunk = conf->chunk_size >> 9; + int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); sector_t stripe; int chunk_offset; int chunk_number, dummy1, dd_idx = i; @@ -1579,8 +1581,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; check = raid5_compute_sector(conf, r_sector, - (raid_disks != conf->raid_disks), - &dummy1, &sh2); + previous, &dummy1, &sh2); if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx || sh2.qd_idx != sh->qd_idx) { printk(KERN_ERR "compute_blocknr: map not correct\n"); @@ -1992,7 +1993,9 @@ static int page_is_zero(struct page *p) static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, struct stripe_head *sh) { - int sectors_per_chunk = conf->chunk_size >> 9; + int sectors_per_chunk = + previous ? (conf->prev_chunk >> 9) + : (conf->chunk_size >> 9); int dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; @@ -2662,7 +2665,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, int dd_idx, j; struct stripe_head *sh2; - sector_t bn = compute_blocknr(sh, i); + sector_t bn = compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); sh2 = get_active_stripe(conf, s, 0, 1); @@ -3318,6 +3321,8 @@ static int raid5_mergeable_bvec(struct request_queue *q, if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ + if (mddev->new_chunk < mddev->chunk_size) + chunk_sectors = mddev->new_chunk >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; if (max <= biovec->bv_len && bio_sectors == 0) @@ -3333,6 +3338,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) unsigned int chunk_sectors = mddev->chunk_size >> 9; unsigned int bio_sectors = bio->bi_size >> 9; + if (mddev->new_chunk < mddev->chunk_size) + chunk_sectors = mddev->new_chunk >> 9; return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -3788,7 +3795,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped BUG_ON(conf->reshape_progress == 0); stripe_addr = writepos; BUG_ON((mddev->dev_sectors & - ~((sector_t)mddev->chunk_size / 512 - 1)) + ~((sector_t)conf->chunk_size / 512 - 1)) - (conf->chunk_size / 512) - stripe_addr != sector_nr); } else { @@ -3811,7 +3818,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped if (conf->level == 6 && j == sh->qd_idx) continue; - s = compute_blocknr(sh, j); + s = compute_blocknr(sh, j, 0); if (s < raid5_size(mddev, 0, 0)) { skipped = 1; continue; @@ -4217,6 +4224,7 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) } sectors &= ~((sector_t)mddev->chunk_size/512 - 1); + sectors &= ~((sector_t)mddev->new_chunk/512 - 1); return sectors * (raid_disks - conf->max_degraded); } @@ -4322,6 +4330,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->algorithm = mddev->new_layout; conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; + if (conf->reshape_progress != MaxSector) + conf->prev_chunk = mddev->chunk_size; memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; @@ -4385,7 +4395,7 @@ static int run(mddev_t *mddev) * geometry. */ here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->chunk_size>>9)* + if (sector_div(here_new, (mddev->new_chunk>>9)* (mddev->raid_disks - max_degraded))) { printk(KERN_ERR "raid5: reshape_position not " "on a stripe boundary\n"); @@ -4789,7 +4799,8 @@ static int raid5_check_reshape(mddev_t *mddev) if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", - (mddev->chunk_size / STRIPE_SIZE)*4); + (max(mddev->chunk_size, mddev->new_chunk) + / STRIPE_SIZE)*4); return -ENOSPC; } -- cgit v1.2.3 From e183eaedd53807e33f02ee80573e2833890e1f21 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:20:22 +1100 Subject: md/raid5: prepare for allowing reshape to change layout Add prev_algo to raid5_conf_t along the same lines as prev_chunk and previous_raid_disks. 
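Taken together with previous_raid_disks and prev_chunk, the configuration now carries a complete 'old geometry'. A hedged sketch of how a single 'previous' flag can select all three parameters at once (the names here are invented for illustration, not the patch's own):

struct r5_geom {
	int raid_disks, previous_raid_disks;
	int chunk_size, prev_chunk;	/* bytes */
	int algorithm, prev_algo;	/* parity layout */
};

struct r5_view {
	int disks;
	int sectors_per_chunk;
	int algorithm;
};

/* Snapshot either the pre-reshape or the target geometry. */
static struct r5_view geom_view(const struct r5_geom *g, int previous)
{
	struct r5_view v;

	v.disks = previous ? g->previous_raid_disks : g->raid_disks;
	v.sectors_per_chunk = (previous ? g->prev_chunk : g->chunk_size) >> 9;
	v.algorithm = previous ? g->prev_algo : g->algorithm;
	return v;
}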
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7638cc31e7e..80ec9a6d8a1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1273,6 +1273,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, int pd_idx, qd_idx; int ddf_layout = 0; sector_t new_sector; + int algorithm = previous ? conf->prev_algo + : conf->algorithm; int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) : (conf->chunk_size >> 9); int raid_disks = previous ? conf->previous_raid_disks @@ -1307,7 +1309,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, pd_idx = data_disks; break; case 5: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: pd_idx = data_disks - stripe % raid_disks; if (*dd_idx >= pd_idx) @@ -1335,13 +1337,13 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", - conf->algorithm); + algorithm); BUG(); } break; case 6: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: pd_idx = raid_disks - 1 - (stripe % raid_disks); qd_idx = pd_idx + 1; @@ -1454,7 +1456,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, default: printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + algorithm); BUG(); } break; @@ -1481,6 +1483,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) sector_t new_sector = sh->sector, check; int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) : (conf->chunk_size >> 9); + int algorithm = previous ? 
conf->prev_algo + : conf->algorithm; sector_t stripe; int chunk_offset; int chunk_number, dummy1, dd_idx = i; @@ -1497,7 +1501,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) switch(conf->level) { case 4: break; case 5: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: if (i > sh->pd_idx) @@ -1516,14 +1520,14 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", - conf->algorithm); + algorithm); BUG(); } break; case 6: if (i == sh->qd_idx) return 0; /* It is the Q disk */ - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: case ALGORITHM_ROTATING_ZERO_RESTART: @@ -1571,7 +1575,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) break; default: printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + algorithm); BUG(); } break; @@ -4330,8 +4334,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->algorithm = mddev->new_layout; conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; - if (conf->reshape_progress != MaxSector) + if (conf->reshape_progress != MaxSector) { conf->prev_chunk = mddev->chunk_size; + conf->prev_algo = mddev->layout; + } memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; @@ -4472,14 +4478,14 @@ static int run(mddev_t *mddev) if (mddev->degraded == 0) printk("raid5: raid level %d set %s active with %d out of %d" - " devices, algorithm %d\n", conf->level, mdname(mddev), - mddev->raid_disks-mddev->degraded, mddev->raid_disks, - conf->algorithm); + " devices, algorithm %d\n", conf->level, mdname(mddev), + mddev->raid_disks-mddev->degraded, mddev->raid_disks, + mddev->new_layout); else printk(KERN_ALERT "raid5: raid level %d set %s active with %d" " out of %d devices, algorithm %d\n", conf->level, mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks, conf->algorithm); + mddev->raid_disks, mddev->new_layout); print_raid5_conf(conf); -- cgit v1.2.3 From 7a6613810785872b7c028fba22fc0bae1c91733d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:21:40 +1100 Subject: md/raid5: reshape using largest of old and new chunk size This ensures that even when old and new stripes are overlapping, we will try to read all of the old before having to write any of the new. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 80ec9a6d8a1..f7fb2b87ad8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3738,6 +3738,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped int dd_idx; sector_t writepos, safepos, gap; sector_t stripe_addr; + int reshape_sectors; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -3755,6 +3756,15 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped } } + /* We need to process a full chunk at a time. 
+ * If old and new chunk sizes differ, we need to process the + * largest of these + */ + if (mddev->new_chunk > mddev->chunk_size) + reshape_sectors = mddev->new_chunk / 512; + else + reshape_sectors = mddev->chunk_size / 512; + /* we update the metadata when there is more than 3Meg * in the block range (that is rather arbitrary, should * probably be time based) or when the data about to be @@ -3768,12 +3778,12 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->delta_disks < 0) { - writepos -= conf->chunk_size/512; - safepos += conf->chunk_size/512; + writepos -= reshape_sectors; + safepos += reshape_sectors; gap = conf->reshape_safe - conf->reshape_progress; } else { - writepos += conf->chunk_size/512; - safepos -= conf->chunk_size/512; + writepos += reshape_sectors; + safepos -= reshape_sectors; gap = conf->reshape_progress - conf->reshape_safe; } @@ -3799,14 +3809,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped BUG_ON(conf->reshape_progress == 0); stripe_addr = writepos; BUG_ON((mddev->dev_sectors & - ~((sector_t)conf->chunk_size / 512 - 1)) - - (conf->chunk_size / 512) - stripe_addr + ~((sector_t)reshape_sectors - 1)) + - reshape_sectors - stripe_addr != sector_nr); } else { - BUG_ON(writepos != sector_nr + conf->chunk_size / 512); + BUG_ON(writepos != sector_nr + reshape_sectors); stripe_addr = sector_nr; } - for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { + for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped = 0; sh = get_active_stripe(conf, stripe_addr+i, 0, 0); @@ -3839,9 +3849,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped } spin_lock_irq(&conf->device_lock); if (mddev->delta_disks < 0) - conf->reshape_progress -= i * new_data_disks; + conf->reshape_progress -= reshape_sectors * new_data_disks; else - conf->reshape_progress += i * new_data_disks; + conf->reshape_progress += reshape_sectors * new_data_disks; spin_unlock_irq(&conf->device_lock); /* Ok, those stripe are ready. We can start scheduling * reads on the source stripes. @@ -3867,7 +3877,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped /* If this takes us to the resync_max point where we have to pause, * then we need to write out the superblock. */ - sector_nr += conf->chunk_size>>9; + sector_nr += reshape_sectors; if (sector_nr >= mddev->resync_max) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, @@ -3883,7 +3893,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); } - return conf->chunk_size>>9; + return reshape_sectors; } /* FIXME go_faster isn't used */ -- cgit v1.2.3 From 88ce4930e2b80378d45506ce2c3bb5820e156e85 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:24:23 +1100 Subject: md/raid5: allow layout and chunksize to be changed on active array. If an array has 3 or more devices, we allow the chunksize or layout to be changed and when a reshape starts, we use these as the 'new' values. 
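For the larger-array case the new values are only recorded after validation. Here is a standalone sketch of the chunk checks the patch performs in raid6_reconfig() (editor-added; helper and macro names are invented, and the real code returns -EINVAL from inside the md personality):

#include <stdbool.h>

#define SECTOR_SHIFT	9
#define PAGE_SIZE_B	4096

/* A new chunk must be a power of two, at least one page,
 * and a factor of the current array size (in sectors). */
static bool new_chunk_valid(unsigned long long array_sectors, int new_chunk)
{
	if (new_chunk & (new_chunk - 1))
		return false;	/* not a power of 2 */
	if (new_chunk < PAGE_SIZE_B)
		return false;	/* smaller than one page */
	if (array_sectors & ((new_chunk >> SECTOR_SHIFT) - 1))
		return false;	/* not a factor of the array size */
	return true;
}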
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 76 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 20 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f7fb2b87ad8..4fdc6d02b44 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4397,9 +4397,7 @@ static int run(mddev_t *mddev) int old_disks; int max_degraded = (mddev->level == 6 ? 2 : 1); - if (mddev->new_level != mddev->level || - mddev->new_layout != mddev->layout || - mddev->new_chunk != mddev->chunk_size) { + if (mddev->new_level != mddev->level) { printk(KERN_ERR "raid5: %s: unsupported reshape " "required - aborting.\n", mdname(mddev)); @@ -4784,8 +4782,10 @@ static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); - if (mddev->delta_disks == 0) - return 0; /* nothing to do */ + if (mddev->delta_disks == 0 && + mddev->new_layout == mddev->layout && + mddev->new_chunk == mddev->chunk_size) + return -EINVAL; /* nothing to do */ if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; @@ -4860,6 +4860,10 @@ static int raid5_start_reshape(mddev_t *mddev) spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; conf->raid_disks += mddev->delta_disks; + conf->prev_chunk = conf->chunk_size; + conf->chunk_size = mddev->new_chunk; + conf->prev_algo = conf->algorithm; + conf->algorithm = mddev->new_layout; if (mddev->delta_disks < 0) conf->reshape_progress = raid5_size(mddev, 0, 0); else @@ -4952,6 +4956,7 @@ static void end_reshape(raid5_conf_t *conf) static void raid5_finish_reshape(mddev_t *mddev) { struct block_device *bdev; + raid5_conf_t *conf = mddev_to_conf(mddev); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -4970,7 +4975,6 @@ static void raid5_finish_reshape(mddev_t *mddev) } } else { int d; - raid5_conf_t *conf = mddev_to_conf(mddev); mddev->degraded = conf->raid_disks; for (d = 0; d < conf->raid_disks ; d++) if (conf->disks[d].rdev && @@ -4982,6 +4986,8 @@ static void raid5_finish_reshape(mddev_t *mddev) d++) raid5_remove_disk(mddev, d); } + mddev->layout = conf->algorithm; + mddev->chunk_size = conf->chunk_size; mddev->reshape_position = MaxSector; mddev->delta_disks = 0; } @@ -5080,11 +5086,10 @@ static void *raid5_takeover_raid6(mddev_t *mddev) static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) { - /* Currently the layout and chunk size can only be changed - * for a 2-drive raid array, as in that case no data shuffling - * is required. - * Later we might validate these and set new_* so a reshape - * can complete the change. + /* For a 2-drive array, the layout and chunk size can be changed + * immediately as not restriping is needed. + * For larger arrays we record the new value - after validation + * to be used by a reshape pass. 
*/ raid5_conf_t *conf = mddev_to_conf(mddev); @@ -5103,19 +5108,49 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) /* They look valid */ - if (mddev->raid_disks != 2) - return -EINVAL; + if (mddev->raid_disks == 2) { - if (new_layout >= 0) { - conf->algorithm = new_layout; - mddev->layout = mddev->new_layout = new_layout; + if (new_layout >= 0) { + conf->algorithm = new_layout; + mddev->layout = mddev->new_layout = new_layout; + } + if (new_chunk > 0) { + conf->chunk_size = new_chunk; + mddev->chunk_size = mddev->new_chunk = new_chunk; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + } else { + if (new_layout >= 0) + mddev->new_layout = new_layout; + if (new_chunk > 0) + mddev->new_chunk = new_chunk; } + return 0; +} + +static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) +{ + if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) + return -EINVAL; if (new_chunk > 0) { - conf->chunk_size = new_chunk; - mddev->chunk_size = mddev->new_chunk = new_chunk; + if (new_chunk & (new_chunk-1)) + /* not a power of 2 */ + return -EINVAL; + if (new_chunk < PAGE_SIZE) + return -EINVAL; + if (mddev->array_sectors & ((new_chunk>>9)-1)) + /* not factor of array size */ + return -EINVAL; } - set_bit(MD_CHANGE_DEVS, &mddev->flags); - md_wakeup_thread(mddev->thread); + + /* They look valid */ + + if (new_layout >= 0) + mddev->new_layout = new_layout; + if (new_chunk > 0) + mddev->new_chunk = new_chunk; + return 0; } @@ -5216,6 +5251,7 @@ static struct mdk_personality raid6_personality = #endif .quiesce = raid5_quiesce, .takeover = raid6_takeover, + .reconfig = raid6_reconfig, }; static struct mdk_personality raid5_personality = { -- cgit v1.2.3 From ab69ae12ceef7f23c578a3c230144e94a167a821 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:26:47 +1100 Subject: md/raid5: be more careful about write ordering when reshaping. When we are reshaping an array, it is very important that we read the data from a particular sector offset before writing new data at that offset. In most cases when growing or shrinking an array we read long before we even consider writing. But when restriping an array without changing its size, there is a small possibility that we might have some data available to write before the read has happened at the same location. This would require some stripes to be in cache already. To guard against this small possibility, we check, before writing, that the 'old' stripe at the same location is not in the process of being read. And we ensure that we mark all 'source' stripes as such before allowing new 'destination' stripes to proceed.
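A simplified model of that check (editor sketch only; the patch itself uses get_active_stripe() and the STRIPE_EXPAND_SOURCE/STRIPE_DELAYED bits): the destination write is postponed whenever the source stripe at the same sector still holds unread old-layout data.

#include <stdbool.h>

struct stripe {
	bool expand_source;	/* old-layout data here has not yet been read */
	bool delayed;		/* write postponed until the source is read */
};

/* 'src' is the pre-reshape stripe cached at the same sector, or NULL
 * if no such stripe is in the cache. */
static bool may_write_destination(struct stripe *dst, struct stripe *src)
{
	if (src && src->expand_source) {
		dst->delayed = true;	/* retry after src has been read */
		return false;
	}
	return true;
}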
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4fdc6d02b44..062df846fd6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -395,7 +395,8 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, init_stripe(sh, sector, previous); } else { if (atomic_read(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); + BUG_ON(!list_empty(&sh->lru) + && !test_bit(STRIPE_EXPANDING, &sh->state)); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); @@ -2944,6 +2945,23 @@ static bool handle_stripe5(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { + struct stripe_head *sh2 + = get_active_stripe(conf, sh->sector, 1, 1); + if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { + /* sh cannot be written until sh2 has been read. + * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh2->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh2); + goto unlock; + } + if (sh2) + release_stripe(sh2); + sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); for (i = conf->raid_disks; i--; ) { @@ -3172,6 +3190,23 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + struct stripe_head *sh2 + = get_active_stripe(conf, sh->sector, 1, 1); + if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { + /* sh cannot be written until sh2 has been read. 
+ * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh2->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh2); + goto unlock; + } + if (sh2) + release_stripe(sh2); + /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; stripe_set_idx(sh->sector, conf, 0, sh); @@ -3739,6 +3774,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped sector_t writepos, safepos, gap; sector_t stripe_addr; int reshape_sectors; + struct list_head stripes; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -3816,6 +3852,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped BUG_ON(writepos != sector_nr + reshape_sectors); stripe_addr = sector_nr; } + INIT_LIST_HEAD(&stripes); for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped = 0; @@ -3845,7 +3882,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped set_bit(STRIPE_EXPAND_READY, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } - release_stripe(sh); + list_add(&sh->lru, &stripes); } spin_lock_irq(&conf->device_lock); if (mddev->delta_disks < 0) @@ -3874,6 +3911,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped release_stripe(sh); first_sector += STRIPE_SECTORS; } + /* Now that the sources are clearly marked, we can release + * the destination stripes + */ + while (!list_empty(&stripes)) { + sh = list_entry(stripes.next, struct stripe_head, lru); + list_del_init(&sh->lru); + release_stripe(sh); + } /* If this takes us to the resync_max point where we have to pause, * then we need to write out the superblock. */ -- cgit v1.2.3 From 2cffc4a01dd90a502324e3453d7b245d6d16e1c2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:27:05 +1100 Subject: md: remove CONFIG_MD_RAID_RESHAPE config option. This was only needed when the code was experimental. Most of it is well tested now, so the option is no longer useful. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 062df846fd6..fb11c13eb69 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -948,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num) return 0; } -#ifdef CONFIG_MD_RAID5_RESHAPE static int resize_stripes(raid5_conf_t *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. 
@@ -1073,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) conf->pool_size = newsize; return err; } -#endif static int drop_one_stripe(raid5_conf_t *conf) { @@ -4822,7 +4820,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } -#ifdef CONFIG_MD_RAID5_RESHAPE static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -4967,7 +4964,6 @@ static int raid5_start_reshape(mddev_t *mddev) md_new_event(mddev); return 0; } -#endif /* This is called from the reshape thread and should make any * changes needed in 'conf' @@ -5289,11 +5285,9 @@ static struct mdk_personality raid6_personality = .sync_request = sync_request, .resize = raid5_resize, .size = raid5_size, -#ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, -#endif .quiesce = raid5_quiesce, .takeover = raid6_takeover, .reconfig = raid6_reconfig, @@ -5314,11 +5308,9 @@ static struct mdk_personality raid5_personality = .sync_request = sync_request, .resize = raid5_resize, .size = raid5_size, -#ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, -#endif .quiesce = raid5_quiesce, .takeover = raid5_takeover, .reconfig = raid5_reconfig, @@ -5340,11 +5332,9 @@ static struct mdk_personality raid4_personality = .sync_request = sync_request, .resize = raid5_resize, .size = raid5_size, -#ifdef CONFIG_MD_RAID5_RESHAPE .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, -#endif .quiesce = raid5_quiesce, }; -- cgit v1.2.3 From b0f9ec047b79a92e8b8a9dfbf97537c8fbef234a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:27:18 +1100 Subject: md/raid5: minor code cleanups in make_request. ... and to be certain that make_request doesn't wait forever, add a 'wake_up' when ->reshape_progress has been set to MaxSector. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index fb11c13eb69..bb4b12e370d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3637,10 +3637,9 @@ static int make_request(struct request_queue *q, struct bio * bi) retry: previous = 0; + disks = conf->raid_disks; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - if (likely(conf->reshape_progress == MaxSector)) - disks = conf->raid_disks; - else { + if (unlikely(conf->reshape_progress != MaxSector)) { /* spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be * possible to see a half-updated value @@ -3650,7 +3649,6 @@ static int make_request(struct request_queue *q, struct bio * bi) * to check again. */ spin_lock_irq(&conf->device_lock); - disks = conf->raid_disks; if (mddev->delta_disks < 0 ? logical_sector < conf->reshape_progress : logical_sector >= conf->reshape_progress) { @@ -3679,7 +3677,7 @@ static int make_request(struct request_queue *q, struct bio * bi) sh = get_active_stripe(conf, new_sector, previous, (bi->bi_rw&RWA_MASK)); if (sh) { - if (unlikely(conf->reshape_progress != MaxSector)) { + if (unlikely(previous)) { /* expansion might have moved on while waiting for a * stripe, so we must do the range check again.
* Expansion could still move past after this @@ -3690,10 +3688,9 @@ */ int must_retry = 0; spin_lock_irq(&conf->device_lock); - if ((mddev->delta_disks < 0 - ? logical_sector >= conf->reshape_progress - : logical_sector < conf->reshape_progress) - && previous) + if (mddev->delta_disks < 0 + ? logical_sector >= conf->reshape_progress + : logical_sector < conf->reshape_progress) /* mismatch, need to try again */ must_retry = 1; spin_unlock_irq(&conf->device_lock); @@ -4977,6 +4974,7 @@ static void end_reshape(raid5_conf_t *conf) conf->previous_raid_disks = conf->raid_disks; conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); /* read-ahead size must cover two whole stripes, which is * 2 * (datadisks) * chunksize where 'n' is the number of raid devices -- cgit v1.2.3 From c8f517c444e4f9f55b5b5ca202b8404691a35805 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:28:40 +1100 Subject: md/raid5 revise rules for when to update metadata during reshape We currently update the metadata: 1/ every 3Megabytes 2/ When the place we will write new-layout data to is recorded in the metadata as still containing old-layout data. Rule one exists to avoid having to re-do too much reshaping in the face of a crash/restart. So it should really be time based rather than size based. So change it to "every 10 seconds". Rule two turns out to be too harsh when restriping an array 'in-place', as in that case the metadata must be updated for every stripe. For the in-place update, it can only possibly be safe from a crash if some user-space program makes a backup of, e.g., every few hundred stripes before allowing them to be reshaped. In that case, the constant metadata update is pointless. So only update the metadata if the new metadata will report that the end of the 'old-layout' data is beyond where we are currently writing 'new-layout' data. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 34 ++++++++++++++++++++++------6 [sic] 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bb4b12e370d..3bbc6d64704 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3766,7 +3766,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped int new_data_disks = conf->raid_disks - conf->max_degraded; int i; int dd_idx; - sector_t writepos, safepos, gap; + sector_t writepos, readpos, safepos; sector_t stripe_addr; int reshape_sectors; struct list_head stripes; @@ -3806,26 +3806,46 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped */ writepos = conf->reshape_progress; sector_div(writepos, new_data_disks); + readpos = conf->reshape_progress; + sector_div(readpos, data_disks); safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->delta_disks < 0) { writepos -= reshape_sectors; + readpos += reshape_sectors; safepos += reshape_sectors; - gap = conf->reshape_safe - conf->reshape_progress; } else { writepos += reshape_sectors; + readpos -= reshape_sectors; safepos -= reshape_sectors; - gap = conf->reshape_progress - conf->reshape_safe; } + /* 'writepos' is the most advanced device address we might write. + * 'readpos' is the least advanced device address we might read. + * 'safepos' is the least address recorded in the metadata as having + * been reshaped.
+ * If 'readpos' is behind 'writepos', then there is no way that we can + * ensure safety in the face of a crash - that must be done by userspace + * making a backup of the data. So in that case there is no particular + * rush to update metadata. + * Otherwise if 'safepos' is behind 'writepos', then we really need to + * update the metadata to advance 'safepos' to match 'readpos' so that + * we can be safe in the event of a crash. + * So we insist on updating metadata if safepos is behind writepos and + * readpos is beyond writepos. + * In any case, update the metadata every 10 seconds. + * Maybe that number should be configurable, but I'm not sure it is + * worth it.... maybe it could be a multiple of safemode_delay??? + */ if ((mddev->delta_disks < 0 - ? writepos < safepos - : writepos > safepos) || - gap > (new_data_disks)*3000*2 /*3Meg*/) { + ? (safepos > writepos && readpos < writepos) + : (safepos < writepos && readpos > writepos)) || + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || @@ -3923,6 +3943,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, @@ -4957,6 +4978,7 @@ static int raid5_start_reshape(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); return -EAGAIN; } + conf->reshape_checkpoint = jiffies; md_wakeup_thread(mddev->sync_thread); md_new_event(mddev); return 0; -- cgit v1.2.3
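To close out the series, here is an editor's standalone sketch of the two-part update rule this last patch implements. The helper names are invented; the kernel versions operate on conf->reshape_* fields and use the time_after() macro rather than plain arithmetic.

#include <stdbool.h>

typedef unsigned long long sector_t;

/* Positional half of the rule: checkpoint when the recorded 'safepos'
 * lags the most advanced write while the least advanced read has
 * already passed it; shrinking reverses the comparisons because
 * addresses move downwards. */
static bool need_checkpoint(sector_t writepos, sector_t readpos,
			    sector_t safepos, int delta_disks)
{
	if (delta_disks < 0)
		return safepos > writepos && readpos < writepos;
	return safepos < writepos && readpos > writepos;
}

/* Time-based half: wrap-safe "more than 10 seconds since the last
 * checkpoint", in the style of the kernel's time_after(). */
static bool checkpoint_due(unsigned long now, unsigned long last_checkpoint,
			   unsigned long hz)
{
	return (long)(now - (last_checkpoint + 10 * hz)) > 0;
}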