aboutsummaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-09-02 08:17:56 +0200
committerIngo Molnar <mingo@elte.hu>2009-09-02 08:17:56 +0200
commit936e894a976dd3b0f07f1f6f43c17b77b7e6146d (patch)
tree5ed5c1f6735dcd26550594df23c8f7fe2aa21a15 /drivers/md
parent69575d388603365f2afbf4166df93152df59b165 (diff)
parent326ba5010a5429a5a528b268b36a5900d4ab0eba (diff)
Merge commit 'v2.6.31-rc8' into x86/txt
Conflicts: arch/x86/kernel/reboot.c security/Kconfig Merge reason: resolve the conflicts, bump up from rc3 to rc8. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/dm-crypt.c2
-rw-r--r--drivers/md/dm-delay.c4
-rw-r--r--drivers/md/dm-linear.c2
-rw-r--r--drivers/md/dm-log-userspace-transfer.c2
-rw-r--r--drivers/md/dm-mpath.c2
-rw-r--r--drivers/md/dm-raid1.c3
-rw-r--r--drivers/md/dm-stripe.c7
-rw-r--r--drivers/md/dm-table.c15
-rw-r--r--drivers/md/dm.c10
-rw-r--r--drivers/md/dm.h1
-rw-r--r--drivers/md/linear.c2
-rw-r--r--drivers/md/md.c177
-rw-r--r--drivers/md/md.h12
-rw-r--r--drivers/md/multipath.c5
-rw-r--r--drivers/md/raid0.c1
-rw-r--r--drivers/md/raid1.c7
-rw-r--r--drivers/md/raid10.c4
-rw-r--r--drivers/md/raid5.c85
18 files changed, 206 insertions, 135 deletions
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 529e2ba505c..ed103816401 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1318,7 +1318,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
{
struct crypt_config *cc = ti->private;
- return fn(ti, cc->dev, cc->start, data);
+ return fn(ti, cc->dev, cc->start, ti->len, data);
}
static struct target_type crypt_target = {
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 4e5b843cd4d..ebe7381f47c 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -324,12 +324,12 @@ static int delay_iterate_devices(struct dm_target *ti,
struct delay_c *dc = ti->private;
int ret = 0;
- ret = fn(ti, dc->dev_read, dc->start_read, data);
+ ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data);
if (ret)
goto out;
if (dc->dev_write)
- ret = fn(ti, dc->dev_write, dc->start_write, data);
+ ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data);
out:
return ret;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9184b6deb86..82f7d6e6b1e 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -139,7 +139,7 @@ static int linear_iterate_devices(struct dm_target *ti,
{
struct linear_c *lc = ti->private;
- return fn(ti, lc->dev, lc->start, data);
+ return fn(ti, lc->dev, lc->start, ti->len, data);
}
static struct target_type linear_target = {
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 0ca1ee768a1..8ce74d95ae4 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -108,7 +108,7 @@ static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
*(pkg->data_size) = 0;
} else if (tfr->data_size > *(pkg->data_size)) {
DMERR("Insufficient space to receive package [%u] "
- "(%u vs %lu)", tfr->request_type,
+ "(%u vs %zu)", tfr->request_type,
tfr->data_size, *(pkg->data_size));
*(pkg->data_size) = 0;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c70604a2089..6f0d90d4a54 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1453,7 +1453,7 @@ static int multipath_iterate_devices(struct dm_target *ti,
list_for_each_entry(pg, &m->priority_groups, list) {
list_for_each_entry(p, &pg->pgpaths, list) {
- ret = fn(ti, p->path.dev, ti->begin, data);
+ ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
if (ret)
goto out;
}
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ce8868c768c..9726577cde4 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -638,6 +638,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
spin_lock_irq(&ms->lock);
bio_list_merge(&ms->writes, &requeue);
spin_unlock_irq(&ms->lock);
+ delayed_wake(ms);
}
/*
@@ -1292,7 +1293,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
for (i = 0; !ret && i < ms->nr_mirrors; i++)
ret = fn(ti, ms->mirror[i].dev,
- ms->mirror[i].offset, data);
+ ms->mirror[i].offset, ti->len, data);
return ret;
}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b240e85ae39..4e0e5937e42 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -320,10 +320,11 @@ static int stripe_iterate_devices(struct dm_target *ti,
int ret = 0;
unsigned i = 0;
- do
+ do {
ret = fn(ti, sc->stripe[i].dev,
- sc->stripe[i].physical_start, data);
- while (!ret && ++i < sc->stripes);
+ sc->stripe[i].physical_start,
+ sc->stripe_width, data);
+ } while (!ret && ++i < sc->stripes);
return ret;
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2cba557d9e6..d952b344191 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -346,7 +346,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
* If possible, this checks an area of a destination device is valid.
*/
static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, void *data)
+ sector_t start, sector_t len, void *data)
{
struct queue_limits *limits = data;
struct block_device *bdev = dev->bdev;
@@ -359,7 +359,7 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
if (!dev_size)
return 1;
- if ((start >= dev_size) || (start + ti->len > dev_size)) {
+ if ((start >= dev_size) || (start + len > dev_size)) {
DMWARN("%s: %s too small for target",
dm_device_name(ti->table->md), bdevname(bdev, b));
return 0;
@@ -377,11 +377,11 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
return 0;
}
- if (ti->len & (logical_block_size_sectors - 1)) {
+ if (len & (logical_block_size_sectors - 1)) {
DMWARN("%s: len=%llu not aligned to h/w "
"logical block size %hu of %s",
dm_device_name(ti->table->md),
- (unsigned long long)ti->len,
+ (unsigned long long)len,
limits->logical_block_size, bdevname(bdev, b));
return 0;
}
@@ -482,7 +482,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, void *data)
+ sector_t start, sector_t len, void *data)
{
struct queue_limits *limits = data;
struct block_device *bdev = dev->bdev;
@@ -830,11 +830,6 @@ unsigned dm_table_get_type(struct dm_table *t)
return t->type;
}
-bool dm_table_bio_based(struct dm_table *t)
-{
- return dm_table_get_type(t) == DM_TYPE_BIO_BASED;
-}
-
bool dm_table_request_based(struct dm_table *t)
{
return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9acd54a5cff..8a311ea0d44 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2203,16 +2203,6 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
goto out;
}
- /*
- * It is enought that blk_queue_ordered() is called only once when
- * the first bio-based table is bound.
- *
- * This setting should be moved to alloc_dev() when request-based dm
- * supports barrier.
- */
- if (!md->map && dm_table_bio_based(table))
- blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
-
__unbind(md);
r = __bind(md, table, &limits);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 23278ae80f0..a7663eba17e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -61,7 +61,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
int dm_table_any_busy_target(struct dm_table *t);
int dm_table_set_type(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
-bool dm_table_bio_based(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
int dm_table_alloc_md_mempools(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5810fa906af..5fe39c2a3d2 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -220,6 +220,7 @@ static int linear_run (mddev_t *mddev)
mddev->queue->unplug_fn = linear_unplug;
mddev->queue->backing_dev_info.congested_fn = linear_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
+ md_integrity_register(mddev);
return 0;
}
@@ -256,6 +257,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
call_rcu(&oldconf->rcu, free_conf);
return 0;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d4351ff0849..9dd872000ce 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -359,6 +359,7 @@ static mddev_t * mddev_find(dev_t unit)
else
new->md_minor = MINOR(unit) >> MdpMinorShift;
+ mutex_init(&new->open_mutex);
mutex_init(&new->reconfig_mutex);
INIT_LIST_HEAD(&new->disks);
INIT_LIST_HEAD(&new->all_mddevs);
@@ -1308,7 +1309,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
}
if (mddev->level != LEVEL_MULTIPATH) {
int role;
- role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ if (rdev->desc_nr < 0 ||
+ rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
+ role = 0xffff;
+ rdev->desc_nr = -1;
+ } else
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
case 0xffff: /* spare */
break;
@@ -1394,8 +1400,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
if (rdev2->desc_nr+1 > max_dev)
max_dev = rdev2->desc_nr+1;
- if (max_dev > le32_to_cpu(sb->max_dev))
+ if (max_dev > le32_to_cpu(sb->max_dev)) {
+ int bmask;
sb->max_dev = cpu_to_le32(max_dev);
+ rdev->sb_size = max_dev * 2 + 256;
+ bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
+ if (rdev->sb_size & bmask)
+ rdev->sb_size = (rdev->sb_size | bmask) + 1;
+ }
for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1487,37 +1499,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
static LIST_HEAD(pending_raid_disks);
-static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+/*
+ * Try to register data integrity profile for an mddev
+ *
+ * This is called when an array is started and after a disk has been kicked
+ * from the array. It only succeeds if all working and active component devices
+ * are integrity capable with matching profiles.
+ */
+int md_integrity_register(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev, *reference = NULL;
+
+ if (list_empty(&mddev->disks))
+ return 0; /* nothing to do */
+ if (blk_get_integrity(mddev->gendisk))
+ return 0; /* already registered */
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ /* skip spares and non-functional disks */
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ if (rdev->raid_disk < 0)
+ continue;
+ /*
+ * If at least one rdev is not integrity capable, we can not
+ * enable data integrity for the md device.
+ */
+ if (!bdev_get_integrity(rdev->bdev))
+ return -EINVAL;
+ if (!reference) {
+ /* Use the first rdev as the reference */
+ reference = rdev;
+ continue;
+ }
+ /* does this rdev's profile match the reference profile? */
+ if (blk_integrity_compare(reference->bdev->bd_disk,
+ rdev->bdev->bd_disk) < 0)
+ return -EINVAL;
+ }
+ /*
+ * All component devices are integrity capable and have matching
+ * profiles, register the common profile for the md device.
+ */
+ if (blk_integrity_register(mddev->gendisk,
+ bdev_get_integrity(reference->bdev)) != 0) {
+ printk(KERN_ERR "md: failed to register integrity for %s\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ printk(KERN_NOTICE "md: data integrity on %s enabled\n",
+ mdname(mddev));
+ return 0;
+}
+EXPORT_SYMBOL(md_integrity_register);
+
+/* Disable data integrity if non-capable/non-matching disk is being added */
+void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
{
- struct mdk_personality *pers = mddev->pers;
- struct gendisk *disk = mddev->gendisk;
struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
- struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+ struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
- /* Data integrity passthrough not supported on RAID 4, 5 and 6 */
- if (pers && pers->level >= 4 && pers->level <= 6)
+ if (!bi_mddev) /* nothing to do */
return;
-
- /* If rdev is integrity capable, register profile for mddev */
- if (!bi_mddev && bi_rdev) {
- if (blk_integrity_register(disk, bi_rdev))
- printk(KERN_ERR "%s: %s Could not register integrity!\n",
- __func__, disk->disk_name);
- else
- printk(KERN_NOTICE "Enabling data integrity on %s\n",
- disk->disk_name);
+ if (rdev->raid_disk < 0) /* skip spares */
return;
- }
-
- /* Check that mddev and rdev have matching profiles */
- if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
- printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
- disk->disk_name, rdev->bdev->bd_disk->disk_name);
- printk(KERN_NOTICE "Disabling data integrity on %s\n",
- disk->disk_name);
- blk_integrity_unregister(disk);
- }
+ if (bi_rdev && blk_integrity_compare(mddev->gendisk,
+ rdev->bdev->bd_disk) >= 0)
+ return;
+ printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
+ blk_integrity_unregister(mddev->gendisk);
}
+EXPORT_SYMBOL(md_integrity_add_rdev);
static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
@@ -1591,7 +1642,6 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
/* May as well allow recovery to be retried once */
mddev->recovery_disabled = 0;
- md_integrity_check(rdev, mddev);
return 0;
fail:
@@ -1925,17 +1975,14 @@ repeat:
/* otherwise we have to go forward and ... */
mddev->events ++;
if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
- /* .. if the array isn't clean, insist on an odd 'events' */
- if ((mddev->events&1)==0) {
- mddev->events++;
+ /* .. if the array isn't clean, an 'even' event must also go
+ * to spares. */
+ if ((mddev->events&1)==0)
nospares = 0;
- }
} else {
- /* otherwise insist on an even 'events' (for clean states) */
- if ((mddev->events&1)) {
- mddev->events++;
+ /* otherwise an 'odd' event must go to spares */
+ if ((mddev->events&1))
nospares = 0;
- }
}
}
@@ -2657,6 +2704,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
ssize_t rv = len;
struct mdk_personality *pers;
void *priv;
+ mdk_rdev_t *rdev;
if (mddev->pers == NULL) {
if (len == 0)
@@ -2736,6 +2784,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
mddev_suspend(mddev);
mddev->pers->stop(mddev);
module_put(mddev->pers->owner);
+ /* Invalidate devices that are now superfluous */
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk >= mddev->raid_disks) {
+ rdev->raid_disk = -1;
+ clear_bit(In_sync, &rdev->flags);
+ }
mddev->pers = pers;
mddev->private = priv;
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
@@ -3545,6 +3599,7 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
if (max < mddev->resync_min)
return -EINVAL;
if (max < mddev->resync_max &&
+ mddev->ro == 0 &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
@@ -3685,17 +3740,8 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
mddev->array_sectors = sectors;
set_capacity(mddev->gendisk, mddev->array_sectors);
- if (mddev->pers) {
- struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
-
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
- }
+ if (mddev->pers)
+ revalidate_disk(mddev->gendisk);
return len;
}
@@ -4048,10 +4094,6 @@ static int do_md_run(mddev_t * mddev)
}
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
- if (pers->level >= 4 && pers->level <= 6)
- /* Cannot support integrity (yet) */
- blk_integrity_unregister(mddev->gendisk);
-
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
@@ -4189,6 +4231,7 @@ static int do_md_run(mddev_t * mddev)
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+ revalidate_disk(mddev->gendisk);
mddev->changed = 1;
md_new_event(mddev);
sysfs_notify_dirent(mddev->sysfs_state);
@@ -4260,12 +4303,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
struct gendisk *disk = mddev->gendisk;
mdk_rdev_t *rdev;
+ mutex_lock(&mddev->open_mutex);
if (atomic_read(&mddev->openers) > is_open) {
printk("md: %s still in use.\n",mdname(mddev));
- return -EBUSY;
- }
-
- if (mddev->pers) {
+ err = -EBUSY;
+ } else if (mddev->pers) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4322,8 +4364,12 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
if (mode == 1)
set_disk_ro(disk, 1);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ err = 0;
}
-
+out:
+ mutex_unlock(&mddev->open_mutex);
+ if (err)
+ return err;
/*
* Free resources if final stop
*/
@@ -4389,7 +4435,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
blk_integrity_unregister(disk);
md_new_event(mddev);
sysfs_notify_dirent(mddev->sysfs_state);
-out:
return err;
}
@@ -5087,18 +5132,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
return -ENOSPC;
}
rv = mddev->pers->resize(mddev, num_sectors);
- if (!rv) {
- struct block_device *bdev;
-
- bdev = bdget_disk(mddev->gendisk, 0);
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
- }
+ if (!rv)
+ revalidate_disk(mddev->gendisk);
return rv;
}
@@ -5484,12 +5519,12 @@ static int md_open(struct block_device *bdev, fmode_t mode)
}
BUG_ON(mddev != bdev->bd_disk->private_data);
- if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
+ if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
goto out;
err = 0;
atomic_inc(&mddev->openers);
- mddev_unlock(mddev);
+ mutex_unlock(&mddev->open_mutex);
check_disk_change(bdev);
out:
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 9430a110db9..f8fc188bc76 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -223,6 +223,16 @@ struct mddev_s
* so we don't loop trying */
int in_sync; /* know to not need resync */
+ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
+ * that we are never stopping an array while it is open.
+ * 'reconfig_mutex' protects all other reconfiguration.
+ * These locks are separate due to conflicting interactions
+ * with bdev->bd_mutex.
+ * Lock ordering is:
+ * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+ * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open
+ */
+ struct mutex open_mutex;
struct mutex reconfig_mutex;
atomic_t active; /* general refcount */
atomic_t openers; /* number of active opens */
@@ -431,5 +441,7 @@ extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(mddev_t *mddev);
+extern int md_integrity_register(mddev_t *mddev);
+void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 237fe3fd235..7140909f666 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -313,6 +313,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
set_bit(In_sync, &rdev->flags);
rcu_assign_pointer(p->rdev, rdev);
err = 0;
+ md_integrity_add_rdev(rdev, mddev);
break;
}
@@ -345,7 +346,9 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
/* lost the race, try later */
err = -EBUSY;
p->rdev = rdev;
+ goto abort;
}
+ md_integrity_register(mddev);
}
abort:
@@ -519,7 +522,7 @@ static int multipath_run (mddev_t *mddev)
mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
-
+ md_integrity_register(mddev);
return 0;
out_free_conf:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 335f490dcad..898e2bdfee4 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -351,6 +351,7 @@ static int raid0_run(mddev_t *mddev)
blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
dump_zones(mddev);
+ md_integrity_register(mddev);
return 0;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0569efba0c0..8726fd7ebce 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1144,7 +1144,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
-
+ md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
}
@@ -1178,7 +1178,9 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
/* lost the race, try later */
err = -EBUSY;
p->rdev = rdev;
+ goto abort;
}
+ md_integrity_register(mddev);
}
abort:
@@ -2067,7 +2069,7 @@ static int run(mddev_t *mddev)
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
-
+ md_integrity_register(mddev);
return 0;
out_no_mem:
@@ -2132,6 +2134,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
+ revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->dev_sectors;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7298a5e5a18..3d9020cf6f6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1170,6 +1170,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
break;
}
+ md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
}
@@ -1203,7 +1204,9 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
/* lost the race, try later */
err = -EBUSY;
p->rdev = rdev;
+ goto abort;
}
+ md_integrity_register(mddev);
}
abort:
@@ -2225,6 +2228,7 @@ static int run(mddev_t *mddev)
if (conf->near_copies < mddev->raid_disks)
blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
+ md_integrity_register(mddev);
return 0;
out_free_conf:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 37835538b58..b8a2c5dc67b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3785,7 +3785,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
- } else if (mddev->delta_disks > 0 &&
+ } else if (mddev->delta_disks >= 0 &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
sector_div(sector_nr, new_data_disks);
@@ -3999,6 +3999,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
return 0;
}
+ /* Allow raid5_quiesce to complete */
+ wait_event(conf->wait_for_overlap, conf->quiesce != 2);
+
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
@@ -4316,6 +4319,15 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
return sectors * (raid_disks - conf->max_degraded);
}
+static void free_conf(raid5_conf_t *conf)
+{
+ shrink_stripes(conf);
+ safe_put_page(conf->spare_page);
+ kfree(conf->disks);
+ kfree(conf->stripe_hashtbl);
+ kfree(conf);
+}
+
static raid5_conf_t *setup_conf(mddev_t *mddev)
{
raid5_conf_t *conf;
@@ -4447,11 +4459,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
abort:
if (conf) {
- shrink_stripes(conf);
- safe_put_page(conf->spare_page);
- kfree(conf->disks);
- kfree(conf->stripe_hashtbl);
- kfree(conf);
+ free_conf(conf);
return ERR_PTR(-EIO);
} else
return ERR_PTR(-ENOMEM);
@@ -4501,7 +4509,26 @@ static int run(mddev_t *mddev)
(old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
* from */
- if (here_new >= here_old) {
+ if (mddev->delta_disks == 0) {
+ /* We cannot be sure it is safe to start an in-place
+ * reshape. It is only safe if user-space if monitoring
+ * and taking constant backups.
+ * mdadm always starts a situation like this in
+ * readonly mode so it can take control before
+ * allowing any writes. So just check for that.
+ */
+ if ((here_new * mddev->new_chunk_sectors !=
+ here_old * mddev->chunk_sectors) ||
+ mddev->ro == 0) {
+ printk(KERN_ERR "raid5: in-place reshape must be started"
+ " in read-only mode - aborting\n");
+ return -EINVAL;
+ }
+ } else if (mddev->delta_disks < 0
+ ? (here_new * mddev->new_chunk_sectors <=
+ here_old * mddev->chunk_sectors)
+ : (here_new * mddev->new_chunk_sectors >=
+ here_old * mddev->chunk_sectors)) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "raid5: reshape_position too early for "
"auto-recovery - aborting.\n");
@@ -4629,12 +4656,8 @@ abort:
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
if (conf) {
- shrink_stripes(conf);
print_raid5_conf(conf);
- safe_put_page(conf->spare_page);
- kfree(conf->disks);
- kfree(conf->stripe_hashtbl);
- kfree(conf);
+ free_conf(conf);
}
mddev->private = NULL;
printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
@@ -4649,13 +4672,10 @@ static int stop(mddev_t *mddev)
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
- shrink_stripes(conf);
- kfree(conf->stripe_hashtbl);
mddev->queue->backing_dev_info.congested_fn = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
- kfree(conf->disks);
- kfree(conf);
+ free_conf(conf);
mddev->private = NULL;
return 0;
}
@@ -4857,6 +4877,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
+ revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -5002,7 +5023,7 @@ static int raid5_start_reshape(mddev_t *mddev)
spin_unlock_irqrestore(&conf->device_lock, flags);
}
mddev->raid_disks = conf->raid_disks;
- mddev->reshape_position = 0;
+ mddev->reshape_position = conf->reshape_progress;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -5057,7 +5078,6 @@ static void end_reshape(raid5_conf_t *conf)
*/
static void raid5_finish_reshape(mddev_t *mddev)
{
- struct block_device *bdev;
raid5_conf_t *conf = mddev->private;
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -5066,15 +5086,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
-
- bdev = bdget_disk(mddev->gendisk, 0);
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
+ revalidate_disk(mddev->gendisk);
} else {
int d;
mddev->degraded = conf->raid_disks;
@@ -5085,8 +5097,15 @@ static void raid5_finish_reshape(mddev_t *mddev)
mddev->degraded--;
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
- d++)
- raid5_remove_disk(mddev, d);
+ d++) {
+ mdk_rdev_t *rdev = conf->disks[d].rdev;
+ if (rdev && raid5_remove_disk(mddev, d) == 0) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ rdev->raid_disk = -1;
+ }
+ }
}
mddev->layout = conf->algorithm;
mddev->chunk_sectors = conf->chunk_sectors;
@@ -5106,12 +5125,18 @@ static void raid5_quiesce(mddev_t *mddev, int state)
case 1: /* stop all writes */
spin_lock_irq(&conf->device_lock);
- conf->quiesce = 1;
+ /* '2' tells resync/reshape to pause so that all
+ * active stripes can drain
+ */
+ conf->quiesce = 2;
wait_event_lock_irq(conf->wait_for_stripe,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
conf->device_lock, /* nothing */);
+ conf->quiesce = 1;
spin_unlock_irq(&conf->device_lock);
+ /* allow reshape to continue */
+ wake_up(&conf->wait_for_overlap);
break;
case 0: /* re-enable writes */