From 93dbb393503d53cd226e5e1f0088fe8f4dbaa2b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 16 Feb 2009 10:25:40 +0100 Subject: block: fix bad definition of BIO_RW_SYNC We can't OR shift values, so get rid of BIO_RW_SYNC and use BIO_RW_SYNCIO and BIO_RW_UNPLUG explicitly. This brings back the behaviour from before 213d9417fec62ef4c3675621b9364a667954d4dd. Signed-off-by: Jens Axboe --- drivers/md/dm-io.c | 2 +- drivers/md/dm-kcopyd.c | 2 +- drivers/md/md.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index a34338567a2..f14813be4ef 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -328,7 +328,7 @@ static void dispatch_io(int rw, unsigned int num_regions, struct dpages old_pages = *dp; if (sync) - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); /* * For multiple regions we need to be careful to rewind diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3073618269e..0a225da2127 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -344,7 +344,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | (1 << BIO_RW_SYNC), + .bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG), .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, diff --git a/drivers/md/md.c b/drivers/md/md.c index 4495104f6c9..03b4cd0a634 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -474,7 +474,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * causes ENOTSUPP, we allocate a spare bio... */ struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = (1<bi_bdev = rdev->bdev; bio->bi_sector = sector; @@ -531,7 +531,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct completion event; int ret; - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); bio->bi_bdev = bdev; bio->bi_sector = sector; -- cgit v1.2.3 From 09b4068a7fe442efc40e9dcbcf5ff37c3338ab15 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Feb 2009 13:18:47 +1100 Subject: md/raid10: Don't skip more than 1 bitmap-chunk at a time during recovery. When doing recovery on a raid10 with a write-intent bitmap, we only need to recovery chunks that are flagged in the bitmap. However if we choose to skip a chunk as it isn't flag, the code currently skips the whole raid10-chunk, thus it might not recovery some blocks that need recovering. This patch fixes it. In case that is confusing, it might help to understand that there is a 'raid10 chunk size' which guides how data is distributed across the devices, and a 'bitmap chunk size' which says how much data corresponds to a single bit in the bitmap. This bug only affects cases where the bitmap chunk size is smaller than the raid10 chunk size. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/raid10.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6736d6dff98..118f89e716e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2010,13 +2010,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* There is nowhere to write, so all non-sync * drives must be failed, so try the next chunk... */ - { - sector_t sec = max_sector - sector_nr; - sectors_skipped += sec; + if (sector_nr + max_sync < max_sector) + max_sector = sector_nr + max_sync; + + sectors_skipped += (max_sector - sector_nr); chunks_skipped ++; sector_nr = max_sector; goto skipped; - } } static int run(mddev_t *mddev) -- cgit v1.2.3 From 78200d45cde2a79c0d0ae0407883bb264caa3c18 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Feb 2009 13:18:47 +1100 Subject: md/raid10: Don't call bitmap_cond_end_sync when we are doing recovery. For raid1/4/5/6, resync (fixing inconsistencies between devices) is very similar to recovery (rebuilding a failed device onto a spare). The both walk through the device addresses in order. For raid10 it can be quite different. resync follows the 'array' address, and makes sure all copies are the same. Recover walks through 'device' addresses and recreates each missing block. The 'bitmap_cond_end_sync' function allows the write-intent-bitmap (When present) to be updated to reflect a partially completed resync. It makes assumptions which mean that it does not work correctly for raid10 recovery at all. In particularly, it can cause bitmap-directed recovery of a raid10 to not recovery some of the blocks that need to be recovered. So move the call to bitmap_cond_end_sync into the resync path, rather than being in the common "resync or recovery" path. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/raid10.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 118f89e716e..e1feb87afc6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1749,8 +1749,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (!go_faster && conf->nr_waiting) msleep_interruptible(1000); - bitmap_cond_end_sync(mddev->bitmap, sector_nr); - /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that * have bi_end_io, bi_sector, bi_bdev set, @@ -1886,6 +1884,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* resync. Schedule a read for every block at this virt offset */ int count = 0; + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { -- cgit v1.2.3 From 73d5c38a9536142e062c35997b044e89166e063b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Feb 2009 13:18:47 +1100 Subject: md: avoid races when stopping resync. There has been a race in raid10 and raid1 for a long time which has only recently started showing up due to a scheduler changed. When a sync_read request finishes, as soon as reschedule_retry is called, another thread can mark the resync request as having completed, so md_do_sync can finish, ->stop can be called, and ->conf can be freed. So using conf after reschedule_retry is not safe. Similarly, when finishing a sync_write, calling md_done_sync must be the last thing we do, as it allows a chain of events which will free conf and other data structures. The first of these requires action in raid10.c The second requires action in raid1.c and raid10.c Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/raid1.c | 3 ++- drivers/md/raid10.c | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 01e3cffd03b..e2466425d9c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1237,8 +1237,9 @@ static void end_sync_write(struct bio *bio, int error) update_head_pos(mirror, r1_bio); if (atomic_dec_and_test(&r1_bio->remaining)) { - md_done_sync(mddev, r1_bio->sectors, uptodate); + sector_t s = r1_bio->sectors; put_buf(r1_bio); + md_done_sync(mddev, s, uptodate); } } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e1feb87afc6..7301631abe0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1236,6 +1236,7 @@ static void end_sync_read(struct bio *bio, int error) /* for reconstruct, we always reschedule after a read. * for resync, only after all reads */ + rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); if (test_bit(R10BIO_IsRecover, &r10_bio->state) || atomic_dec_and_test(&r10_bio->remaining)) { /* we have read all the blocks, @@ -1243,7 +1244,6 @@ static void end_sync_read(struct bio *bio, int error) */ reschedule_retry(r10_bio); } - rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); } static void end_sync_write(struct bio *bio, int error) @@ -1264,11 +1264,13 @@ static void end_sync_write(struct bio *bio, int error) update_head_pos(i, r10_bio); + rdev_dec_pending(conf->mirrors[d].rdev, mddev); while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */ - md_done_sync(mddev, r10_bio->sectors, 1); + sector_t s = r10_bio->sectors; put_buf(r10_bio); + md_done_sync(mddev, s, 1); break; } else { r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; @@ -1276,7 +1278,6 @@ static void end_sync_write(struct bio *bio, int error) r10_bio = r10_bio2; } } - rdev_dec_pending(conf->mirrors[d].rdev, mddev); } /* -- cgit v1.2.3 From 5fd3a17ed456637a224cf4ca82b9ad9d005bc8d4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 4 Mar 2009 00:57:25 -0700 Subject: md: fix deadlock when stopping arrays Resolve a deadlock when stopping redundant arrays, i.e. ones that require a call to sysfs_remove_group when shutdown. The deadlock is summarized below: Thread1 Thread2 ------- ------- read sysfs attribute stop array take mddev lock sysfs_remove_group sysfs_get_active wait for mddev lock wait for active Sysrq-w: -------- mdmon S 00000017 2212 4163 1 f1982ea8 00000046 2dcf6b85 00000017 c0b23100 f2f83ed0 c0b23100 f2f8413c c0b23100 c0b23100 c0b1fb98 f2f8413c 00000000 f2f8413c c0b23100 f2291ecc 00000002 c0b23100 00000000 00000017 f2f83ed0 f1982eac 00000046 c044d9dd Call Trace: [] ? debug_mutex_add_waiter+0x1d/0x58 [] __mutex_lock_common+0x1d9/0x338 [] ? __mutex_lock_common+0x1d9/0x338 [] mutex_lock_interruptible_nested+0x33/0x3a [] ? mddev_lock+0x14/0x16 [] mddev_lock+0x14/0x16 [] md_attr_show+0x2a/0x49 [] sysfs_read_file+0x93/0xf9 mdadm D 00000017 2812 4177 1 f0401d78 00000046 430456f8 00000017 f0401d58 f0401d20 c0b23100 f2da2c4c c0b23100 c0b23100 c0b1fb98 f2da2c4c 0a10fc36 00000000 c0b23100 f0401d70 00000003 c0b23100 00000000 00000017 f2da29e0 00000001 00000002 00000000 Call Trace: [] schedule_timeout+0x1b/0x95 [] ? schedule_timeout+0x1b/0x95 [] ? wait_for_common+0x34/0xdc [] ? trace_hardirqs_on_caller+0x18/0x145 [] ? trace_hardirqs_on+0xb/0xd [] wait_for_common+0xa0/0xdc [] ? default_wake_function+0x0/0x12 [] wait_for_completion+0x17/0x19 [] sysfs_addrm_finish+0x19f/0x1d1 [] sysfs_hash_and_remove+0x42/0x55 [] sysfs_remove_group+0x57/0x86 [] do_md_stop+0x13a/0x499 This has been there for a while, but is easier to trigger now that mdmon is closely watching sysfs. Cc: Reported-by: Jacek Danecki Signed-off-by: Dan Williams --- drivers/md/md.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 03b4cd0a634..a307f87eb90 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -214,12 +214,7 @@ static inline mddev_t *mddev_get(mddev_t *mddev) return mddev; } -static void mddev_delayed_delete(struct work_struct *ws) -{ - mddev_t *mddev = container_of(ws, mddev_t, del_work); - kobject_del(&mddev->kobj); - kobject_put(&mddev->kobj); -} +static void mddev_delayed_delete(struct work_struct *ws); static void mddev_put(mddev_t *mddev) { @@ -3542,6 +3537,21 @@ static struct kobj_type md_ktype = { int mdp_major = 0; +static void mddev_delayed_delete(struct work_struct *ws) +{ + mddev_t *mddev = container_of(ws, mddev_t, del_work); + + if (mddev->private == &md_redundancy_group) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + mddev->private = NULL; + } + kobject_del(&mddev->kobj); + kobject_put(&mddev->kobj); +} + static int md_alloc(dev_t dev, char *name) { static DEFINE_MUTEX(disks_mutex); @@ -4033,13 +4043,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; - if (mddev->pers->sync_request) { - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); - if (mddev->sysfs_action) - sysfs_put(mddev->sysfs_action); - mddev->sysfs_action = NULL; - } module_put(mddev->pers->owner); + if (mddev->pers->sync_request) + mddev->private = &md_redundancy_group; mddev->pers = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent(mddev->sysfs_state); -- cgit v1.2.3