From b74fd2826c5acce20e6f691437b2d19372bc2057 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 7 May 2009 12:47:19 +1000
Subject: md: fix loading of out-of-date bitmap.

When md is loading a bitmap which it knows is out of date, it fills
each page with 1s and writes it back out again.  However the
write_page call makes use of bitmap->file_pages and
bitmap->last_page_size which haven't been set correctly yet.  So this
can sometimes fail.

Move the setting of file_pages and last_page_size to before the call
to write_page.

This bug can cause the assembly of an array to fail, thus making the
data inaccessible.  Hence I think it is a suitable candidate for
-stable.

Cc: stable@kernel.org
Reported-by: Vojtech Pavlik <vojtech@suse.cz>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'drivers')
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1fb91edc7de..bc1d64b7b63 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -986,6 +986,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 			oldindex = index;
 			oldpage = page;
 
+			bitmap->filemap[bitmap->file_pages++] = page;
+			bitmap->last_page_size = count;
+
 			if (outofdate) {
 				/*
 				 * if bitmap is out of date, dirty the
@@ -998,15 +1001,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				write_page(bitmap, page, 1);
 
 				ret = -EIO;
-				if (bitmap->flags & BITMAP_WRITE_ERROR) {
-					/* release, page not in filemap yet */
-					put_page(page);
+				if (bitmap->flags & BITMAP_WRITE_ERROR)
 					goto err;
-				}
 			}
-
-			bitmap->filemap[bitmap->file_pages++] = page;
-			bitmap->last_page_size = count;
 		}
 		paddr = kmap_atomic(page, KM_USER0);
 		if (bitmap->flags & BITMAP_HOSTENDIAN)
-- 
cgit v1.2.3


From 18055569127253755d01733f6ecc004ed02f88d0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 7 May 2009 12:48:10 +1000
Subject: md/raid10: don't clear bitmap during recovery if array will still be
 degraded.

If we have a raid10 with multiple missing devices, and we recover just
one of these to a spare, then we risk (depending on the bitmap and
array chunk size) clearing bits of the bitmap for which recovery isn't
complete (because a device is still missing).

This can lead to a subsequent "re-add" being recovered without
any IO happening, which would result in loss of data.

This patch takes the safe approach of not clearing bitmap bits
if the array will still be degraded.

This patch is suitable for all active -stable kernels.

Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 81a54f17417..499620afb44 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1809,17 +1809,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 				r10_bio->sector = sect;
 
 				raid10_find_phys(conf, r10_bio);
-				/* Need to check if this section will still be
+
+				/* Need to check if the array will still be
 				 * degraded
 				 */
-				for (j=0; j<conf->copies;j++) {
-					int d = r10_bio->devs[j].devnum;
-					if (conf->mirrors[d].rdev == NULL ||
-					    test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
+				for (j=0; j<conf->raid_disks; j++)
+					if (conf->mirrors[j].rdev == NULL ||
+					    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
 						still_degraded = 1;
 						break;
 					}
-				}
+
 				must_sync = bitmap_start_sync(mddev->bitmap, sect,
 							      &sync_blocks, still_degraded);
 
-- 
cgit v1.2.3


From db305e507d554430a69ede901a6308e6ecb72349 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 7 May 2009 12:49:06 +1000
Subject: md: fix some (more) errors with bitmaps on devices larger than 2TB.

If a write intent bitmap covers more than 2TB, we sometimes work with
values beyond 32bit, so these need to be sector_t.  This patch
adds the required casts to some unsigned longs that are being shifted
up.

This will affect any raid10 larger than 2TB, or any raid1/4/5/6 with
member devices that are larger than 2TB.

Signed-off-by: NeilBrown <neilb@suse.de>
Reported-by: "Mario 'BitKoenig' Holbe" <Mario.Holbe@TU-Ilmenau.DE>
Cc: stable@kernel.org
---
 drivers/md/bitmap.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index bc1d64b7b63..47c68bc75a1 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1013,9 +1013,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		kunmap_atomic(paddr, KM_USER0);
 		if (b) {
 			/* if the disk bit is set, set the memory bit */
-			bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
-					       ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start)
-				);
+			int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap))
+				      >= start);
+			bitmap_set_memory_bits(bitmap,
+					       (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
+					       needed);
 			bit_cnt++;
 			set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
 		}
@@ -1151,8 +1153,9 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 			spin_lock_irqsave(&bitmap->lock, flags);
 			clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
 		}
-		bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
-					&blocks, 0);
+		bmc = bitmap_get_counter(bitmap,
+					 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+					 &blocks, 0);
 		if (bmc) {
 /*
   if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc);
@@ -1166,7 +1169,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 			} else if (*bmc == 1) {
 				/* we can clear the bit */
 				*bmc = 0;
-				bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
+				bitmap_count_page(bitmap,
+						  (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
 						  -1);
 
 				/* clear the bit */
@@ -1511,7 +1515,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
 	unsigned long chunk;
 
 	for (chunk = s; chunk <= e; chunk++) {
-		sector_t sec = chunk << CHUNK_BLOCK_SHIFT(bitmap);
+		sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
 		bitmap_set_memory_bits(bitmap, sec, 1);
 		bitmap_file_set_bit(bitmap, sec);
 	}
-- 
cgit v1.2.3


From dd71cf6b2773310b01c6fe6c773064c80fd2476b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 7 May 2009 12:49:35 +1000
Subject: md: tidy up status_resync to handle large arrays.

Two problems in status_resync.
1/ It still used Kilobytes as the basic block unit, while most code
   now uses sectors uniformly.
2/ It doesn't allow for the possibility that max_sectors exceeds
   the range of "unsigned long".

So
 - change "max_blocks" to "max_sectors", and store sector numbers
   in there and in 'resync'
 - Make 'rt' a 'sector_t' so it can temporarily hold the number of
   remaining sectors.
 - use sector_div rather than normal division.
 - change the magic '100' used to preserve precision to '32'.
   + making it a power of 2 makes division easier
   + it doesn't need to be as large as it was chosen when we averaged
     speed over the entire run.  Now we average speed over the last 30
     seconds or so.

Reported-by: "Mario 'BitKoenig' Holbe" <Mario.Holbe@TU-Ilmenau.DE>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 45 ++++++++++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 612343fdde9..5eb01a4d27b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5705,37 +5705,38 @@ static void status_unused(struct seq_file *seq)
 
 static void status_resync(struct seq_file *seq, mddev_t * mddev)
 {
-	sector_t max_blocks, resync, res;
-	unsigned long dt, db, rt;
+	sector_t max_sectors, resync, res;
+	unsigned long dt, db;
+	sector_t rt;
 	int scale;
 	unsigned int per_milli;
 
-	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+	resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-		max_blocks = mddev->resync_max_sectors >> 1;
+		max_sectors = mddev->resync_max_sectors;
 	else
-		max_blocks = mddev->dev_sectors / 2;
+		max_sectors = mddev->dev_sectors;
 
 	/*
 	 * Should not happen.
 	 */
-	if (!max_blocks) {
+	if (!max_sectors) {
 		MD_BUG();
 		return;
 	}
 	/* Pick 'scale' such that (resync>>scale)*1000 will fit
-	 * in a sector_t, and (max_blocks>>scale) will fit in a
+	 * in a sector_t, and (max_sectors>>scale) will fit in a
 	 * u32, as those are the requirements for sector_div.
 	 * Thus 'scale' must be at least 10
 	 */
 	scale = 10;
 	if (sizeof(sector_t) > sizeof(unsigned long)) {
-		while ( max_blocks/2 > (1ULL<<(scale+32)))
+		while ( max_sectors/2 > (1ULL<<(scale+32)))
 			scale++;
 	}
 	res = (resync>>scale)*1000;
-	sector_div(res, (u32)((max_blocks>>scale)+1));
+	sector_div(res, (u32)((max_sectors>>scale)+1));
 
 	per_milli = res;
 	{
@@ -5756,25 +5757,35 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
 		      "resync" : "recovery"))),
 		   per_milli/10, per_milli % 10,
-		   (unsigned long long) resync,
-		   (unsigned long long) max_blocks);
+		   (unsigned long long) resync/2,
+		   (unsigned long long) max_sectors/2);
 
 	/*
-	 * We do not want to overflow, so the order of operands and
-	 * the * 100 / 100 trick are important. We do a +1 to be
-	 * safe against division by zero. We only estimate anyway.
-	 *
 	 * dt: time from mark until now
 	 * db: blocks written from mark until now
 	 * rt: remaining time
+	 *
+	 * rt is a sector_t, so could be 32bit or 64bit.
+	 * So we divide before multiply in case it is 32bit and close
+	 * to the limit.
+	 * We scale the divisor (db) by 32 to avoid losing precision
+	 * near the end of resync when the number of remaining sectors
+	 * is close to 'db'.
+	 * We then divide rt by 32 after multiplying by db to compensate.
+	 * The '+1' avoids division by zero if db is very small.
 	 */
 	dt = ((jiffies - mddev->resync_mark) / HZ);
 	if (!dt) dt++;
 	db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
 		- mddev->resync_mark_cnt;
-	rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
 
-	seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+	rt = max_sectors - resync;    /* number of remaining sectors */
+	sector_div(rt, db/32+1);
+	rt *= dt;
+	rt >>= 5;
+
+	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
+		   ((unsigned long)rt % 60)/6);
 
 	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
 }
-- 
cgit v1.2.3


From 110518bccf076726cc93bf604527d8019aae50ba Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Thu, 7 May 2009 12:49:37 +1000
Subject: md: constify VFTs

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5eb01a4d27b..8350bde60d1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5976,7 +5976,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static struct seq_operations md_seq_ops = {
+static const struct seq_operations md_seq_ops = {
 	.start  = md_seq_start,
 	.next   = md_seq_next,
 	.stop   = md_seq_stop,
-- 
cgit v1.2.3


From 5bf295975416f8e97117bbbcfb0191c00bc3e2b4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 7 May 2009 12:50:57 +1000
Subject: md: remove ability to explicitly set an inactive array to 'clean'.

Being able to write 'clean' to an 'array_state' of an inactive array
to activate it in 'clean' mode is both unnecessary and inconvenient.

It is unnecessary because the same can be achieved by writing
'active'.  This activates an array, but it still remains 'clean'
until the first write.

It is inconvenient because writing 'clean' is more often used to
cause an 'active' array to revert to 'clean' mode (thus blocking
any writes until a 'write-pending' is promoted to 'active').

Allowing 'clean' to both activate an array and mark an active array as
clean can lead to races:  One program writes 'clean' to mark the
active array as clean at the same time as another program writes
'inactive' to deactivate (stop) an active array.  Depending on which
writes first, the array could be deactivated and immediately
reactivated which isn't what was desired.

So just disable the use of 'clean' to activate an array.

This avoids a race that can be triggered with mdadm-3.0 and external
metadata, so it is suitable for -stable.

Reported-by: Rafal Marszewski <rafal.marszewski@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8350bde60d1..1dd723d3188 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3066,11 +3066,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 			} else
 				err = -EBUSY;
 			spin_unlock_irq(&mddev->write_lock);
-		} else {
-			mddev->ro = 0;
-			mddev->recovery_cp = MaxSector;
-			err = do_md_run(mddev);
-		}
+		} else
+			err = -EINVAL;
 		break;
 	case active:
 		if (mddev->pers) {
-- 
cgit v1.2.3


From c4647292fda0833bebe45be27f04453b736981fa Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 7 May 2009 12:51:06 +1000
Subject: md: remove rd%d links immediately after stopping an array.

md maintains links in sys/mdXX/md/ to identify which device has
which role in the array. e.g.
   rd2 -> dev-sda

indicates that the device with role '2' in the array is sda.

These links are only present when the array is active.  They are
created immediately after ->run is called, and so should be removed
immediately after ->stop is called.
However they are currently removed a little bit later, and it is
possible for ->run to be called again, thus adding these links, before
they are removed.

So move the removal earlier so they are consistently only present when
the array is active.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1dd723d3188..fccc8343a25 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4294,6 +4294,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 {
 	int err = 0;
 	struct gendisk *disk = mddev->gendisk;
+	mdk_rdev_t *rdev;
 
 	if (atomic_read(&mddev->openers) > is_open) {
 		printk("md: %s still in use.\n",mdname(mddev));
@@ -4336,6 +4337,13 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			/* tell userspace to handle 'inactive' */
 			sysfs_notify_dirent(mddev->sysfs_state);
 
+			list_for_each_entry(rdev, &mddev->disks, same_set)
+				if (rdev->raid_disk >= 0) {
+					char nm[20];
+					sprintf(nm, "rd%d", rdev->raid_disk);
+					sysfs_remove_link(&mddev->kobj, nm);
+				}
+
 			set_capacity(disk, 0);
 			mddev->changed = 1;
 
@@ -4356,7 +4364,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	 * Free resources if final stop
 	 */
 	if (mode == 0) {
-		mdk_rdev_t *rdev;
 
 		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
 
@@ -4368,13 +4375,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		}
 		mddev->bitmap_offset = 0;
 
-		list_for_each_entry(rdev, &mddev->disks, same_set)
-			if (rdev->raid_disk >= 0) {
-				char nm[20];
-				sprintf(nm, "rd%d", rdev->raid_disk);
-				sysfs_remove_link(&mddev->kobj, nm);
-			}
-
 		/* make sure all md_delayed_delete calls have finished */
 		flush_scheduled_work();
 
-- 
cgit v1.2.3