From 93dbb393503d53cd226e5e1f0088fe8f4dbaa2b8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 16 Feb 2009 10:25:40 +0100
Subject: block: fix bad definition of BIO_RW_SYNC

We can't OR shift values, so get rid of BIO_RW_SYNC and use BIO_RW_SYNCIO
and BIO_RW_UNPLUG explicitly. This brings back the behaviour from before
213d9417fec62ef4c3675621b9364a667954d4dd.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/md/dm-io.c     | 2 +-
 drivers/md/dm-kcopyd.c | 2 +-
 drivers/md/md.c        | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index a34338567a2..f14813be4ef 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -328,7 +328,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	struct dpages old_pages = *dp;
 
 	if (sync)
-		rw |= (1 << BIO_RW_SYNC);
+		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 
 	/*
 	 * For multiple regions we need to be careful to rewind
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 3073618269e..0a225da2127 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -344,7 +344,7 @@ static int run_io_job(struct kcopyd_job *job)
 {
 	int r;
 	struct dm_io_request io_req = {
-		.bi_rw = job->rw | (1 << BIO_RW_SYNC),
+		.bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG),
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
 		.mem.offset = job->offset,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4495104f6c9..03b4cd0a634 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -474,7 +474,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
+	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
@@ -531,7 +531,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	struct completion event;
 	int ret;
 
-	rw |= (1 << BIO_RW_SYNC);
+	rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 
 	bio->bi_bdev = bdev;
 	bio->bi_sector = sector;
-- 
cgit v1.2.3


From 09b4068a7fe442efc40e9dcbcf5ff37c3338ab15 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Feb 2009 13:18:47 +1100
Subject: md/raid10:  Don't skip more than 1 bitmap-chunk at a time during
 recovery.

When doing recovery on a raid10 with a write-intent bitmap, we only
need to recovery chunks that are flagged in the bitmap.

However if we choose to skip a chunk as it isn't flag, the code
currently skips the whole raid10-chunk, thus it might not recovery
some blocks that need recovering.

This patch fixes it.

In case that is confusing, it might help to understand that there
is a 'raid10 chunk size' which guides how data is distributed across
the devices, and a 'bitmap chunk size' which says how much data
corresponds to a single bit in the bitmap.

This bug only affects cases where the bitmap chunk size is smaller
than the raid10 chunk size.


Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6736d6dff98..118f89e716e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2010,13 +2010,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	/* There is nowhere to write, so all non-sync
 	 * drives must be failed, so try the next chunk...
 	 */
-	{
-	sector_t sec = max_sector - sector_nr;
-	sectors_skipped += sec;
+	if (sector_nr + max_sync < max_sector)
+		max_sector = sector_nr + max_sync;
+
+	sectors_skipped += (max_sector - sector_nr);
 	chunks_skipped ++;
 	sector_nr = max_sector;
 	goto skipped;
-	}
 }
 
 static int run(mddev_t *mddev)
-- 
cgit v1.2.3


From 78200d45cde2a79c0d0ae0407883bb264caa3c18 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Feb 2009 13:18:47 +1100
Subject: md/raid10:  Don't call bitmap_cond_end_sync when we are doing
 recovery.

For raid1/4/5/6, resync (fixing inconsistencies between devices) is
very similar to recovery (rebuilding a failed device onto a spare).
The both walk through the device addresses in order.

For raid10 it can be quite different.  resync follows the 'array'
address, and makes sure all copies are the same.  Recover walks
through 'device' addresses and recreates each missing block.

The 'bitmap_cond_end_sync' function allows the write-intent-bitmap
(When present) to be updated to reflect a partially completed resync.
It makes assumptions which mean that it does not work correctly for
raid10 recovery at all.

In particularly, it can cause bitmap-directed recovery of a raid10 to
not recovery some of the blocks that need to be recovered.

So move the call to bitmap_cond_end_sync into the resync path, rather
than being in the common "resync or recovery" path.


Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 118f89e716e..e1feb87afc6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1749,8 +1749,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	if (!go_faster && conf->nr_waiting)
 		msleep_interruptible(1000);
 
-	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
-
 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
 	 * have bi_end_io, bi_sector, bi_bdev set,
@@ -1886,6 +1884,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		/* resync. Schedule a read for every block at this virt offset */
 		int count = 0;
 
+		bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+
 		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
 				       &sync_blocks, mddev->degraded) &&
 		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
-- 
cgit v1.2.3


From 73d5c38a9536142e062c35997b044e89166e063b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Feb 2009 13:18:47 +1100
Subject: md: avoid races when stopping resync.

There has been a race in raid10 and raid1 for a long time
which has only recently started showing up due to a scheduler changed.

When a sync_read request finishes, as soon as reschedule_retry
is called, another thread can mark the resync request as having
completed, so md_do_sync can finish, ->stop can be called, and
->conf can be freed.  So using conf after reschedule_retry is not
safe.

Similarly, when finishing a sync_write, calling md_done_sync must be
the last thing we do, as it allows a chain of events which will free
conf and other data structures.

The first of these requires action in raid10.c
The second requires action in raid1.c and raid10.c

Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c  | 3 ++-
 drivers/md/raid10.c | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 01e3cffd03b..e2466425d9c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1237,8 +1237,9 @@ static void end_sync_write(struct bio *bio, int error)
 	update_head_pos(mirror, r1_bio);
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		md_done_sync(mddev, r1_bio->sectors, uptodate);
+		sector_t s = r1_bio->sectors;
 		put_buf(r1_bio);
+		md_done_sync(mddev, s, uptodate);
 	}
 }
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e1feb87afc6..7301631abe0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1236,6 +1236,7 @@ static void end_sync_read(struct bio *bio, int error)
 	/* for reconstruct, we always reschedule after a read.
 	 * for resync, only after all reads
 	 */
+	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
 	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
 	    atomic_dec_and_test(&r10_bio->remaining)) {
 		/* we have read all the blocks,
@@ -1243,7 +1244,6 @@ static void end_sync_read(struct bio *bio, int error)
 		 */
 		reschedule_retry(r10_bio);
 	}
-	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
 }
 
 static void end_sync_write(struct bio *bio, int error)
@@ -1264,11 +1264,13 @@ static void end_sync_write(struct bio *bio, int error)
 
 	update_head_pos(i, r10_bio);
 
+	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 	while (atomic_dec_and_test(&r10_bio->remaining)) {
 		if (r10_bio->master_bio == NULL) {
 			/* the primary of several recovery bios */
-			md_done_sync(mddev, r10_bio->sectors, 1);
+			sector_t s = r10_bio->sectors;
 			put_buf(r10_bio);
+			md_done_sync(mddev, s, 1);
 			break;
 		} else {
 			r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
@@ -1276,7 +1278,6 @@ static void end_sync_write(struct bio *bio, int error)
 			r10_bio = r10_bio2;
 		}
 	}
-	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 }
 
 /*
-- 
cgit v1.2.3


From 5fd3a17ed456637a224cf4ca82b9ad9d005bc8d4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 4 Mar 2009 00:57:25 -0700
Subject: md: fix deadlock when stopping arrays

Resolve a deadlock when stopping redundant arrays, i.e. ones that
require a call to sysfs_remove_group when shutdown.  The deadlock is
summarized below:

Thread1                Thread2
-------                -------
read sysfs attribute   stop array
                       take mddev lock
                       sysfs_remove_group
sysfs_get_active
wait for mddev lock
                       wait for active

Sysrq-w:
--------
mdmon         S 00000017  2212  4163      1
  f1982ea8 00000046 2dcf6b85 00000017 c0b23100 f2f83ed0 c0b23100 f2f8413c
  c0b23100 c0b23100 c0b1fb98 f2f8413c 00000000 f2f8413c c0b23100 f2291ecc
  00000002 c0b23100 00000000 00000017 f2f83ed0 f1982eac 00000046 c044d9dd
Call Trace:
  [<c044d9dd>] ? debug_mutex_add_waiter+0x1d/0x58
  [<c06ef451>] __mutex_lock_common+0x1d9/0x338
  [<c06ef451>] ? __mutex_lock_common+0x1d9/0x338
  [<c06ef5e3>] mutex_lock_interruptible_nested+0x33/0x3a
  [<c0634553>] ? mddev_lock+0x14/0x16
  [<c0634553>] mddev_lock+0x14/0x16
  [<c0634eda>] md_attr_show+0x2a/0x49
  [<c04e9997>] sysfs_read_file+0x93/0xf9
mdadm         D 00000017  2812  4177      1
  f0401d78 00000046 430456f8 00000017 f0401d58 f0401d20 c0b23100 f2da2c4c
  c0b23100 c0b23100 c0b1fb98 f2da2c4c 0a10fc36 00000000 c0b23100 f0401d70
  00000003 c0b23100 00000000 00000017 f2da29e0 00000001 00000002 00000000
Call Trace:
  [<c06eed1b>] schedule_timeout+0x1b/0x95
  [<c06eed1b>] ? schedule_timeout+0x1b/0x95
  [<c06eeb97>] ? wait_for_common+0x34/0xdc
  [<c044fa8a>] ? trace_hardirqs_on_caller+0x18/0x145
  [<c044fbc2>] ? trace_hardirqs_on+0xb/0xd
  [<c06eec03>] wait_for_common+0xa0/0xdc
  [<c0428c7c>] ? default_wake_function+0x0/0x12
  [<c06eeccc>] wait_for_completion+0x17/0x19
  [<c04ea620>] sysfs_addrm_finish+0x19f/0x1d1
  [<c04e920e>] sysfs_hash_and_remove+0x42/0x55
  [<c04eb4db>] sysfs_remove_group+0x57/0x86
  [<c0638086>] do_md_stop+0x13a/0x499

This has been there for a while, but is easier to trigger now that mdmon
is closely watching sysfs.

Cc: <stable@kernel.org>
Reported-by: Jacek Danecki <jacek.danecki@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/md/md.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 03b4cd0a634..a307f87eb90 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -214,12 +214,7 @@ static inline mddev_t *mddev_get(mddev_t *mddev)
 	return mddev;
 }
 
-static void mddev_delayed_delete(struct work_struct *ws)
-{
-	mddev_t *mddev = container_of(ws, mddev_t, del_work);
-	kobject_del(&mddev->kobj);
-	kobject_put(&mddev->kobj);
-}
+static void mddev_delayed_delete(struct work_struct *ws);
 
 static void mddev_put(mddev_t *mddev)
 {
@@ -3542,6 +3537,21 @@ static struct kobj_type md_ktype = {
 
 int mdp_major = 0;
 
+static void mddev_delayed_delete(struct work_struct *ws)
+{
+	mddev_t *mddev = container_of(ws, mddev_t, del_work);
+
+	if (mddev->private == &md_redundancy_group) {
+		sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+		if (mddev->sysfs_action)
+			sysfs_put(mddev->sysfs_action);
+		mddev->sysfs_action = NULL;
+		mddev->private = NULL;
+	}
+	kobject_del(&mddev->kobj);
+	kobject_put(&mddev->kobj);
+}
+
 static int md_alloc(dev_t dev, char *name)
 {
 	static DEFINE_MUTEX(disks_mutex);
@@ -4033,13 +4043,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			mddev->queue->merge_bvec_fn = NULL;
 			mddev->queue->unplug_fn = NULL;
 			mddev->queue->backing_dev_info.congested_fn = NULL;
-			if (mddev->pers->sync_request) {
-				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
-				if (mddev->sysfs_action)
-					sysfs_put(mddev->sysfs_action);
-				mddev->sysfs_action = NULL;
-			}
 			module_put(mddev->pers->owner);
+			if (mddev->pers->sync_request)
+				mddev->private = &md_redundancy_group;
 			mddev->pers = NULL;
 			/* tell userspace to handle 'inactive' */
 			sysfs_notify_dirent(mddev->sysfs_state);
-- 
cgit v1.2.3