From 6076905b5ef39e0ea58db32583c9e0036c05e47b Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:52 +0000
Subject: dm: avoid _hash_lock deadlock

Fix a reported deadlock if there are still unprocessed multipath events
on a device that is being removed.

_hash_lock is held during dev_remove while trying to send the
outstanding events.  Sending the events requests the _hash_lock
again in dm_copy_name_and_uuid.

This patch introduces a separate lock around regions that modify the
link to the hash table (dm_set_mdptr) or the name or uuid so that
dm_copy_name_and_uuid no longer needs _hash_lock.

Additionally, dm_copy_name_and_uuid can only be called if md exists
so we can drop the dm_get() and dm_put() which can lead to a BUG()
while md is being freed.

The deadlock:
 #0 [ffff8106298dfb48] schedule at ffffffff80063035
 #1 [ffff8106298dfc20] __down_read at ffffffff8006475d
 #2 [ffff8106298dfc60] dm_copy_name_and_uuid at ffffffff8824f740
 #3 [ffff8106298dfc90] dm_send_uevents at ffffffff88252685
 #4 [ffff8106298dfcd0] event_callback at ffffffff8824c678
 #5 [ffff8106298dfd00] dm_table_event at ffffffff8824dd01
 #6 [ffff8106298dfd10] __hash_remove at ffffffff882507ad
 #7 [ffff8106298dfd30] dev_remove at ffffffff88250865
 #8 [ffff8106298dfd60] ctl_ioctl at ffffffff88250d80
 #9 [ffff8106298dfee0] do_ioctl at ffffffff800418c4
#10 [ffff8106298dff00] vfs_ioctl at ffffffff8002fab9
#11 [ffff8106298dff40] sys_ioctl at ffffffff8004bdaf
#12 [ffff8106298dff80] tracesys at ffffffff8005d28d (via system_call)

Cc: stable@kernel.org
Reported-by: guy keren <choo@actcom.co.il>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c  | 17 +++++++++++++----
 drivers/md/dm-uevent.c |  9 ++++-----
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a6794293158..d19854c9818 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices);
  */
 static DECLARE_RWSEM(_hash_lock);
 
+/*
+ * Protects use of mdptr to obtain hash cell name and uuid from mapped device.
+ */
+static DEFINE_MUTEX(dm_hash_cells_mutex);
+
 static void init_buckets(struct list_head *buckets)
 {
 	unsigned int i;
@@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
 		list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
 	}
 	dm_get(md);
+	mutex_lock(&dm_hash_cells_mutex);
 	dm_set_mdptr(md, cell);
+	mutex_unlock(&dm_hash_cells_mutex);
 	up_write(&_hash_lock);
 
 	return 0;
@@ -224,7 +231,9 @@ static void __hash_remove(struct hash_cell *hc)
 	/* remove from the dev hash */
 	list_del(&hc->uuid_list);
 	list_del(&hc->name_list);
+	mutex_lock(&dm_hash_cells_mutex);
 	dm_set_mdptr(hc->md, NULL);
+	mutex_unlock(&dm_hash_cells_mutex);
 
 	table = dm_get_table(hc->md);
 	if (table) {
@@ -321,7 +330,9 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
 	 */
 	list_del(&hc->name_list);
 	old_name = hc->name;
+	mutex_lock(&dm_hash_cells_mutex);
 	hc->name = new_name;
+	mutex_unlock(&dm_hash_cells_mutex);
 	list_add(&hc->name_list, _name_buckets + hash_str(new_name));
 
 	/*
@@ -1582,8 +1593,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
 	if (!md)
 		return -ENXIO;
 
-	dm_get(md);
-	down_read(&_hash_lock);
+	mutex_lock(&dm_hash_cells_mutex);
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
 		r = -ENXIO;
@@ -1596,8 +1606,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
 		strcpy(uuid, hc->uuid ? : "");
 
 out:
-	up_read(&_hash_lock);
-	dm_put(md);
+	mutex_unlock(&dm_hash_cells_mutex);
 
 	return r;
 }
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 6f65883aef1..c7c555a8c7b 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj)
 		list_del_init(&event->elist);
 
 		/*
-		 * Need to call dm_copy_name_and_uuid from here for now.
-		 * Context of previous var adds and locking used for
-		 * hash_cell not compatable.
+		 * When a device is being removed this copy fails and we
+		 * discard these unsent events.
 		 */
 		if (dm_copy_name_and_uuid(event->md, event->name,
 					  event->uuid)) {
-			DMERR("%s: dm_copy_name_and_uuid() failed",
-			      __func__);
+			DMINFO("%s: skipping sending uevent for lost device",
+			       __func__);
 			goto uevent_free;
 		}
 
-- 
cgit v1.2.3


From 613978f8711c7fd4d0aa856872375d2abd7c92ff Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Thu, 10 Dec 2009 23:51:52 +0000
Subject: dm exception store: free tmp_store on persistent flag error

Error handling code following a kmalloc should free the allocated data.

Cc: stable@kernel.org
Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 7dbe652efb5..205215968ff 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -216,7 +216,8 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 		type = get_type("N");
 	else {
 		ti->error = "Persistent flag is not P or N";
-		return -EINVAL;
+		r = -EINVAL;
+		goto bad_type;
 	}
 
 	if (!type) {
-- 
cgit v1.2.3


From d2bb7df8cac647b92f51fb84ae735771e7adbfa7 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:53 +0000
Subject: dm: sysfs add empty release function to avoid debug warning

This patch just removes an unnecessary warning:
 kobject: 'dm': does not have a release() function,
 it is broken and must be fixed.

The kobject is embedded in mapped device struct, so
code does not need to release memory explicitly here.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-sysfs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 4b045903a4e..b000de38c99 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -79,6 +79,13 @@ static struct sysfs_ops dm_sysfs_ops = {
 	.show	= dm_attr_show,
 };
 
+/*
+ * The sysfs structure is embedded in md struct, nothing to do here
+ */
+static void dm_sysfs_release(struct kobject *kobj)
+{
+}
+
 /*
  * dm kobject is embedded in mapped_device structure
  * no need to define release function here
@@ -86,6 +93,7 @@ static struct sysfs_ops dm_sysfs_ops = {
 static struct kobj_type dm_ktype = {
 	.sysfs_ops	= &dm_sysfs_ops,
 	.default_attrs	= dm_attrs,
+	.release	= dm_sysfs_release
 };
 
 /*
-- 
cgit v1.2.3


From 94e76572b5dd37b1f0f4b3742ee8a565daead932 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:53 +0000
Subject: dm snapshot: only take lock for statustype info not table

Take snapshot lock only for STATUSTYPE_INFO, not STATUSTYPE_TABLE.

Commit 4c6fff445d7aa753957856278d4d93bcad6e2c14
(dm-snapshot-lock-snapshot-while-supplying-status.patch)
introduced this use of the lock, but userspace applications using
libdevmapper have been found to request STATUSTYPE_TABLE while the device
is suspended and the lock is already held, leading to deadlock.  Since
the lock is not necessary in this case, don't try to take it.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 3a3ba46e6d4..d135212958f 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1152,10 +1152,11 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 	unsigned sz = 0;
 	struct dm_snapshot *snap = ti->private;
 
-	down_write(&snap->lock);
-
 	switch (type) {
 	case STATUSTYPE_INFO:
+
+		down_write(&snap->lock);
+
 		if (!snap->valid)
 			DMEMIT("Invalid");
 		else {
@@ -1171,6 +1172,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 			else
 				DMEMIT("Unknown");
 		}
+
+		up_write(&snap->lock);
+
 		break;
 
 	case STATUSTYPE_TABLE:
@@ -1185,8 +1189,6 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 		break;
 	}
 
-	up_write(&snap->lock);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 8e87b9b81b3c370f7e53c1ab6e1c3519ef37a644 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:54 +0000
Subject: dm snapshot: cope with chunk size larger than origin

Under some special conditions the snapshot hash_size is calculated as zero.
This patch instead sets a minimum value of 64, the same as for the
pending exception table.

rounddown_pow_of_two(0) is an undefined operation (it expands to shift
by -1).  init_exception_table with an argument of 0 would fail with -ENOMEM.

The way to trigger the problem is to create a snapshot with a chunk size
that is larger than the origin device.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index d135212958f..8a4a9c838af 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -553,6 +553,8 @@ static int init_hash_tables(struct dm_snapshot *s)
 	hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
 	hash_size = min(hash_size, max_buckets);
 
+	if (hash_size < 64)
+		hash_size = 64;
 	hash_size = rounddown_pow_of_two(hash_size);
 	if (init_exception_table(&s->complete, hash_size,
 				 DM_CHUNK_CONSECUTIVE_BITS))
-- 
cgit v1.2.3


From 0b4309581b5be8749afdd5a9087fd82a2a5c9932 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:55 +0000
Subject: dm crypt: make wipe message also wipe tfm key

The "wipe key" message is used to wipe a volume key from memory
temporarily, for example when suspending to RAM.

There are two instances of the key in memory (inside crypto tfm)
but only one got wiped.  This patch wipes them both.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index e412980763b..f2c139305e1 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -934,14 +934,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
 
 	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
-	return 0;
+	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
 }
 
 static int crypt_wipe_key(struct crypt_config *cc)
 {
 	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 	memset(&cc->key, 0, cc->key_size * sizeof(u8));
-	return 0;
+	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
 }
 
 /*
@@ -983,11 +983,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -ENOMEM;
 	}
 
- 	if (crypt_set_key(cc, argv[1])) {
-		ti->error = "Error decoding key";
-		goto bad_cipher;
-	}
-
 	/* Compatibility mode for old dm-crypt cipher strings */
 	if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
 		chainmode = "cbc";
@@ -1015,6 +1010,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	strcpy(cc->chainmode, chainmode);
 	cc->tfm = tfm;
 
+	if (crypt_set_key(cc, argv[1]) < 0) {
+		ti->error = "Error decoding and setting key";
+		goto bad_ivmode;
+	}
+
 	/*
 	 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
 	 * See comments at iv code
@@ -1085,11 +1085,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_bs;
 	}
 
-	if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
-		ti->error = "Error setting key";
-		goto bad_device;
-	}
-
 	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
 		ti->error = "Invalid iv_offset sector";
 		goto bad_device;
-- 
cgit v1.2.3


From 6047359277517c4e56d8bfd6ea4966d7a3924151 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:55 +0000
Subject: dm crypt: move private iv fields to structs

Define private structures for IV so it's easy to add further attributes
in a following patch which fixes the way key material is wiped from
memory.  Also move ESSIV destructor and remove unnecessary 'status'
operation.

There are no functional changes in this patch.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index f2c139305e1..bec5ac54e23 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -71,10 +71,17 @@ struct crypt_iv_operations {
 	int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
 		   const char *opts);
 	void (*dtr)(struct crypt_config *cc);
-	const char *(*status)(struct crypt_config *cc);
 	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
 };
 
+struct iv_essiv_private {
+	struct crypto_cipher *tfm;
+};
+
+struct iv_benbi_private {
+	int shift;
+};
+
 /*
  * Crypt: maps a linear range of a block device
  * and encrypts / decrypts at the same time.
@@ -102,8 +109,8 @@ struct crypt_config {
 	struct crypt_iv_operations *iv_gen_ops;
 	char *iv_mode;
 	union {
-		struct crypto_cipher *essiv_tfm;
-		int benbi_shift;
+		struct iv_essiv_private essiv;
+		struct iv_benbi_private benbi;
 	} iv_gen_private;
 	sector_t iv_offset;
 	unsigned int iv_size;
@@ -169,6 +176,14 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 	return 0;
 }
 
+static void crypt_iv_essiv_dtr(struct crypt_config *cc)
+{
+	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+
+	crypto_free_cipher(essiv->tfm);
+	essiv->tfm = NULL;
+}
+
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
@@ -236,21 +251,15 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	}
 	kfree(salt);
 
-	cc->iv_gen_private.essiv_tfm = essiv_tfm;
+	cc->iv_gen_private.essiv.tfm = essiv_tfm;
 	return 0;
 }
 
-static void crypt_iv_essiv_dtr(struct crypt_config *cc)
-{
-	crypto_free_cipher(cc->iv_gen_private.essiv_tfm);
-	cc->iv_gen_private.essiv_tfm = NULL;
-}
-
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
 	memset(iv, 0, cc->iv_size);
 	*(u64 *)iv = cpu_to_le64(sector);
-	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv);
+	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
 	return 0;
 }
 
@@ -273,7 +282,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
 		return -EINVAL;
 	}
 
-	cc->iv_gen_private.benbi_shift = 9 - log;
+	cc->iv_gen_private.benbi.shift = 9 - log;
 
 	return 0;
 }
@@ -288,7 +297,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 
 	memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
 
-	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1);
+	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
 	put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
 
 	return 0;
-- 
cgit v1.2.3


From 5861f1be00b3b70f8ab5e5a81392a6cf69666cd2 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:56 +0000
Subject: dm crypt: restructure essiv error path

Use kzfree for salt deallocation because it is derived from the volume
key.  Use a common error path in ESSIV constructor.

Required by a later patch which fixes the way key material is wiped
from memory.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 46 ++++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bec5ac54e23..2301d223f2a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -187,15 +187,15 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
-	struct crypto_cipher *essiv_tfm;
-	struct crypto_hash *hash_tfm;
+	struct crypto_cipher *essiv_tfm = NULL;
+	struct crypto_hash *hash_tfm = NULL;
 	struct hash_desc desc;
 	struct scatterlist sg;
 	unsigned int saltsize;
-	u8 *salt;
+	u8 *salt = NULL;
 	int err;
 
-	if (opts == NULL) {
+	if (!opts) {
 		ti->error = "Digest algorithm missing for ESSIV mode";
 		return -EINVAL;
 	}
@@ -204,15 +204,16 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(hash_tfm)) {
 		ti->error = "Error initializing ESSIV hash";
-		return PTR_ERR(hash_tfm);
+		err = PTR_ERR(hash_tfm);
+		goto bad;
 	}
 
 	saltsize = crypto_hash_digestsize(hash_tfm);
-	salt = kmalloc(saltsize, GFP_KERNEL);
-	if (salt == NULL) {
+	salt = kzalloc(saltsize, GFP_KERNEL);
+	if (!salt) {
 		ti->error = "Error kmallocing salt storage in ESSIV";
-		crypto_free_hash(hash_tfm);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto bad;
 	}
 
 	sg_init_one(&sg, cc->key, cc->key_size);
@@ -220,39 +221,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
 	err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
 	crypto_free_hash(hash_tfm);
+	hash_tfm = NULL;
 
 	if (err) {
 		ti->error = "Error calculating hash in ESSIV";
-		kfree(salt);
-		return err;
+		goto bad;
 	}
 
 	/* Setup the essiv_tfm with the given salt */
 	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(essiv_tfm)) {
 		ti->error = "Error allocating crypto tfm for ESSIV";
-		kfree(salt);
-		return PTR_ERR(essiv_tfm);
+		err = PTR_ERR(essiv_tfm);
+		goto bad;
 	}
 	if (crypto_cipher_blocksize(essiv_tfm) !=
 	    crypto_ablkcipher_ivsize(cc->tfm)) {
 		ti->error = "Block size of ESSIV cipher does "
 			    "not match IV size of block cipher";
-		crypto_free_cipher(essiv_tfm);
-		kfree(salt);
-		return -EINVAL;
+		err = -EINVAL;
+		goto bad;
 	}
 	err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
 	if (err) {
 		ti->error = "Failed to set key for ESSIV cipher";
-		crypto_free_cipher(essiv_tfm);
-		kfree(salt);
-		return err;
+		goto bad;
 	}
-	kfree(salt);
+	kzfree(salt);
 
 	cc->iv_gen_private.essiv.tfm = essiv_tfm;
 	return 0;
+
+bad:
+	if (essiv_tfm && !IS_ERR(essiv_tfm))
+		crypto_free_cipher(essiv_tfm);
+	if (hash_tfm && !IS_ERR(hash_tfm))
+		crypto_free_hash(hash_tfm);
+	kzfree(salt);
+	return err;
 }
 
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
-- 
cgit v1.2.3


From b95bf2d3d5a48b095bffe2a0cd8c40453cf59557 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:56 +0000
Subject: dm crypt: separate essiv allocation from initialisation

This patch separates the construction of IV from its initialisation.
(For ESSIV it is a hash calculation based on volume key.)

Constructor code now preallocates hash tfm and salt array
and saves it in a private IV structure.

The next patch requires this to reinitialise the wiped IV
without reallocating memory when resuming a suspended device.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 69 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 2301d223f2a..446153a071d 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -71,11 +71,14 @@ struct crypt_iv_operations {
 	int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
 		   const char *opts);
 	void (*dtr)(struct crypt_config *cc);
+	int (*init)(struct crypt_config *cc);
 	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
 };
 
 struct iv_essiv_private {
 	struct crypto_cipher *tfm;
+	struct crypto_hash *hash_tfm;
+	u8 *salt;
 };
 
 struct iv_benbi_private {
@@ -176,12 +179,38 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 	return 0;
 }
 
+/* Initialise ESSIV - compute salt but no local memory allocations */
+static int crypt_iv_essiv_init(struct crypt_config *cc)
+{
+	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+	struct hash_desc desc;
+	struct scatterlist sg;
+	int err;
+
+	sg_init_one(&sg, cc->key, cc->key_size);
+	desc.tfm = essiv->hash_tfm;
+	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
+	if (err)
+		return err;
+
+	return crypto_cipher_setkey(essiv->tfm, essiv->salt,
+				    crypto_hash_digestsize(essiv->hash_tfm));
+}
+
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 
 	crypto_free_cipher(essiv->tfm);
 	essiv->tfm = NULL;
+
+	crypto_free_hash(essiv->hash_tfm);
+	essiv->hash_tfm = NULL;
+
+	kzfree(essiv->salt);
+	essiv->salt = NULL;
 }
 
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -189,9 +218,6 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 {
 	struct crypto_cipher *essiv_tfm = NULL;
 	struct crypto_hash *hash_tfm = NULL;
-	struct hash_desc desc;
-	struct scatterlist sg;
-	unsigned int saltsize;
 	u8 *salt = NULL;
 	int err;
 
@@ -200,7 +226,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		return -EINVAL;
 	}
 
-	/* Hash the cipher key with the given hash algorithm */
+	/* Allocate hash algorithm */
 	hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(hash_tfm)) {
 		ti->error = "Error initializing ESSIV hash";
@@ -208,27 +234,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		goto bad;
 	}
 
-	saltsize = crypto_hash_digestsize(hash_tfm);
-	salt = kzalloc(saltsize, GFP_KERNEL);
+	salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
 	if (!salt) {
 		ti->error = "Error kmallocing salt storage in ESSIV";
 		err = -ENOMEM;
 		goto bad;
 	}
 
-	sg_init_one(&sg, cc->key, cc->key_size);
-	desc.tfm = hash_tfm;
-	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-	err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
-	crypto_free_hash(hash_tfm);
-	hash_tfm = NULL;
-
-	if (err) {
-		ti->error = "Error calculating hash in ESSIV";
-		goto bad;
-	}
-
-	/* Setup the essiv_tfm with the given salt */
+	/* Allocate essiv_tfm */
 	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(essiv_tfm)) {
 		ti->error = "Error allocating crypto tfm for ESSIV";
@@ -242,14 +255,11 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		err = -EINVAL;
 		goto bad;
 	}
-	err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
-	if (err) {
-		ti->error = "Failed to set key for ESSIV cipher";
-		goto bad;
-	}
-	kzfree(salt);
 
+	cc->iv_gen_private.essiv.salt = salt;
 	cc->iv_gen_private.essiv.tfm = essiv_tfm;
+	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
+
 	return 0;
 
 bad:
@@ -257,7 +267,7 @@ bad:
 		crypto_free_cipher(essiv_tfm);
 	if (hash_tfm && !IS_ERR(hash_tfm))
 		crypto_free_hash(hash_tfm);
-	kzfree(salt);
+	kfree(salt);
 	return err;
 }
 
@@ -323,6 +333,7 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
 static struct crypt_iv_operations crypt_iv_essiv_ops = {
 	.ctr       = crypt_iv_essiv_ctr,
 	.dtr       = crypt_iv_essiv_dtr,
+	.init      = crypt_iv_essiv_init,
 	.generator = crypt_iv_essiv_gen
 };
 
@@ -1054,6 +1065,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	    cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
 		goto bad_ivmode;
 
+	if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
+	    cc->iv_gen_ops->init(cc) < 0) {
+		ti->error = "Error initialising IV";
+		goto bad_slab_pool;
+	}
+
 	cc->iv_size = crypto_ablkcipher_ivsize(tfm);
 	if (cc->iv_size)
 		/* at least a 64 bit sector number should fit in our buffer */
-- 
cgit v1.2.3


From 542da317668c35036e8471822a564b609d05af66 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:51:57 +0000
Subject: dm crypt: make wipe message also wipe essiv key

The "wipe key" message is used to wipe the volume key from memory
temporarily, for example when suspending to RAM.

But the initialisation vector in ESSIV mode is calculated from the
hashed volume key, so the wipe message should wipe this IV key too and
reinitialise it when the volume key is reinstated.

This patch adds an IV wipe method called from a wipe message callback.
ESSIV is then reinitialised using the init function added by the
last patch.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 446153a071d..91e1bf91769 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
  * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -72,6 +72,7 @@ struct crypt_iv_operations {
 		   const char *opts);
 	void (*dtr)(struct crypt_config *cc);
 	int (*init)(struct crypt_config *cc);
+	int (*wipe)(struct crypt_config *cc);
 	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
 };
 
@@ -199,6 +200,17 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 				    crypto_hash_digestsize(essiv->hash_tfm));
 }
 
+/* Wipe salt and reset key derived from volume key */
+static int crypt_iv_essiv_wipe(struct crypt_config *cc)
+{
+	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+	unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
+
+	memset(essiv->salt, 0, salt_size);
+
+	return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
+}
+
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
@@ -334,6 +346,7 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = {
 	.ctr       = crypt_iv_essiv_ctr,
 	.dtr       = crypt_iv_essiv_dtr,
 	.init      = crypt_iv_essiv_init,
+	.wipe      = crypt_iv_essiv_wipe,
 	.generator = crypt_iv_essiv_gen
 };
 
@@ -1305,6 +1318,7 @@ static void crypt_resume(struct dm_target *ti)
 static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 {
 	struct crypt_config *cc = ti->private;
+	int ret = -EINVAL;
 
 	if (argc < 2)
 		goto error;
@@ -1314,10 +1328,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 			DMWARN("not suspended during key manipulation.");
 			return -EINVAL;
 		}
-		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set")))
-			return crypt_set_key(cc, argv[2]);
-		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe")))
+		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
+			ret = crypt_set_key(cc, argv[2]);
+			if (ret)
+				return ret;
+			if (cc->iv_gen_ops && cc->iv_gen_ops->init)
+				ret = cc->iv_gen_ops->init(cc);
+			return ret;
+		}
+		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
+			if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
+				ret = cc->iv_gen_ops->wipe(cc);
+				if (ret)
+					return ret;
+			}
 			return crypt_wipe_key(cc);
+		}
 	}
 
 error:
-- 
cgit v1.2.3


From 952b355760c196ec014dd0b6878f85a11496e3da Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:57 +0000
Subject: dm io: use slab for struct io

Allocate "struct io" from a slab.

This patch changes dm-io, so that "struct io" is allocated from a slab cache.
It used to be allocated with kmalloc. Allocating from a slab will be needed
for the next patch, because it requires a special alignment of "struct io"
and kmalloc cannot meet this alignment.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-io.c | 21 ++++++++++++++++++++-
 drivers/md/dm.c    |  2 ++
 drivers/md/dm.h    |  3 +++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3a2e6a2f8bd..b0d264e684f 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,6 +5,8 @@
  * This file is released under the GPL.
  */
 
+#include "dm.h"
+
 #include <linux/device-mapper.h>
 
 #include <linux/bio.h>
@@ -30,6 +32,8 @@ struct io {
 	void *context;
 };
 
+static struct kmem_cache *_dm_io_cache;
+
 /*
  * io contexts are only dynamically allocated for asynchronous
  * io.  Since async io is likely to be the majority of io we'll
@@ -53,7 +57,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
+	client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
@@ -472,3 +476,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
 			&dp, io_req->notify.fn, io_req->notify.context);
 }
 EXPORT_SYMBOL(dm_io);
+
+int __init dm_io_init(void)
+{
+	_dm_io_cache = KMEM_CACHE(io, 0);
+	if (!_dm_io_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void dm_io_exit(void)
+{
+	kmem_cache_destroy(_dm_io_cache);
+	_dm_io_cache = NULL;
+}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 724efc63904..473f0c3c019 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -275,6 +275,7 @@ static int (*_inits[])(void) __initdata = {
 	dm_target_init,
 	dm_linear_init,
 	dm_stripe_init,
+	dm_io_init,
 	dm_kcopyd_init,
 	dm_interface_init,
 };
@@ -284,6 +285,7 @@ static void (*_exits[])(void) = {
 	dm_target_exit,
 	dm_linear_exit,
 	dm_stripe_exit,
+	dm_io_exit,
 	dm_kcopyd_exit,
 	dm_interface_exit,
 };
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a7663eba17e..4a95e8fa360 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -118,6 +118,9 @@ int dm_lock_for_deletion(struct mapped_device *md);
 void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 		       unsigned cookie);
 
+int dm_io_init(void);
+void dm_io_exit(void);
+
 int dm_kcopyd_init(void);
 void dm_kcopyd_exit(void);
 
-- 
cgit v1.2.3


From f1e539874655ae9e74c1644fd54133b19f1b14e2 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:58 +0000
Subject: dm io: remove extra bi_io_vec region hack

Remove the hack where we allocate an extra bi_io_vec to store additional
private data.  This hack prevents us from supporting barriers in
dm-raid1 without first making another little block layer change.
Instead of doing that, this patch eliminates the bi_io_vec abuse by
storing the region number directly in the low bits of bi_private.

We need to store two things for each bio, the pointer to the main io
structure and, if parallel writes were requested, an index indicating
which of these writes this bio belongs to.  There can be at most
BITS_PER_LONG regions - 32 or 64.

The index (region number) was stored in the last (hidden) bio vector and
the pointer to struct io was stored in bi_private.

This patch now aligns "struct io" on BITS_PER_LONG bytes and stores the
region number in the low BITS_PER_LONG bits of bi_private.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-io.c | 89 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 55 insertions(+), 34 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b0d264e684f..f6a714c5aab 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -16,12 +16,19 @@
 #include <linux/slab.h>
 #include <linux/dm-io.h>
 
+#define DM_MSG_PREFIX "io"
+
+#define DM_IO_MAX_REGIONS	BITS_PER_LONG
+
 struct dm_io_client {
 	mempool_t *pool;
 	struct bio_set *bios;
 };
 
-/* FIXME: can we shrink this ? */
+/*
+ * Aligning 'struct io' reduces the number of bits required to store
+ * its address.  Refer to store_io_and_region_in_bio() below.
+ */
 struct io {
 	unsigned long error_bits;
 	unsigned long eopnotsupp_bits;
@@ -30,7 +37,7 @@ struct io {
 	struct dm_io_client *client;
 	io_notify_fn callback;
 	void *context;
-};
+} __attribute__((aligned(DM_IO_MAX_REGIONS)));
 
 static struct kmem_cache *_dm_io_cache;
 
@@ -92,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);
 
 /*-----------------------------------------------------------------
  * We need to keep track of which region a bio is doing io for.
- * In order to save a memory allocation we store this the last
- * bvec which we know is unused (blech).
- * XXX This is ugly and can OOPS with some configs... find another way.
+ * To avoid a memory allocation to store just 5 or 6 bits, we
+ * ensure the 'struct io' pointer is aligned so enough low bits are
+ * always zero and then combine it with the region number directly in
+ * bi_private.
  *---------------------------------------------------------------*/
-static inline void bio_set_region(struct bio *bio, unsigned region)
+static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
+				       unsigned region)
 {
-	bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
+	if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
+		DMCRIT("Unaligned struct io pointer %p", io);
+		BUG();
+	}
+
+	bio->bi_private = (void *)((unsigned long)io | region);
 }
 
-static inline unsigned bio_get_region(struct bio *bio)
+static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
+				       unsigned *region)
 {
-	return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
+	unsigned long val = (unsigned long)bio->bi_private;
+
+	*io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
+	*region = val & (DM_IO_MAX_REGIONS - 1);
 }
 
 /*-----------------------------------------------------------------
@@ -144,10 +162,8 @@ static void endio(struct bio *bio, int error)
 	/*
 	 * The bio destructor in bio_put() may use the io object.
 	 */
-	io = bio->bi_private;
-	region = bio_get_region(bio);
+	retrieve_io_and_region_from_bio(bio, &io, &region);
 
-	bio->bi_max_vecs++;
 	bio_put(bio);
 
 	dec_count(io, region, error);
@@ -247,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)
 
 static void dm_bio_destructor(struct bio *bio)
 {
-	struct io *io = bio->bi_private;
+	unsigned region;
+	struct io *io;
+
+	retrieve_io_and_region_from_bio(bio, &io, &region);
 
 	bio_free(bio, io->client->bios);
 }
@@ -292,24 +311,17 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 
 	while (remaining) {
 		/*
-		 * Allocate a suitably sized-bio: we add an extra
-		 * bvec for bio_get/set_region() and decrement bi_max_vecs
-		 * to hide it from bio_add_page().
+		 * Allocate a suitably sized-bio.
 		 */
 		num_bvecs = dm_sector_div_up(remaining,
 					     (PAGE_SIZE >> SECTOR_SHIFT));
-		num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
-				      num_bvecs);
-		if (unlikely(num_bvecs > BIO_MAX_PAGES))
-			num_bvecs = BIO_MAX_PAGES;
+		num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
 		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
 		bio->bi_sector = where->sector + (where->count - remaining);
 		bio->bi_bdev = where->bdev;
 		bio->bi_end_io = endio;
-		bio->bi_private = io;
 		bio->bi_destructor = dm_bio_destructor;
-		bio->bi_max_vecs--;
-		bio_set_region(bio, region);
+		store_io_and_region_in_bio(bio, io, region);
 
 		/*
 		 * Try and add as many pages as possible.
@@ -337,6 +349,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	int i;
 	struct dpages old_pages = *dp;
 
+	BUG_ON(num_regions > DM_IO_MAX_REGIONS);
+
 	if (sync)
 		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 
@@ -361,7 +375,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 		   struct dm_io_region *where, int rw, struct dpages *dp,
 		   unsigned long *error_bits)
 {
-	struct io io;
+	/*
+	 * gcc <= 4.3 can't do the alignment for stack variables, so we must
+	 * align it on our own.
+	 * volatile prevents the optimizer from removing or reusing
+	 * "io_" field from the stack frame (allowed in ANSI C).
+	 */
+	volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
+	struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
 
 	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
 		WARN_ON(1);
@@ -369,33 +390,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 	}
 
 retry:
-	io.error_bits = 0;
-	io.eopnotsupp_bits = 0;
-	atomic_set(&io.count, 1); /* see dispatch_io() */
-	io.sleeper = current;
-	io.client = client;
+	io->error_bits = 0;
+	io->eopnotsupp_bits = 0;
+	atomic_set(&io->count, 1); /* see dispatch_io() */
+	io->sleeper = current;
+	io->client = client;
 
-	dispatch_io(rw, num_regions, where, dp, &io, 1);
+	dispatch_io(rw, num_regions, where, dp, io, 1);
 
 	while (1) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 
-		if (!atomic_read(&io.count))
+		if (!atomic_read(&io->count))
 			break;
 
 		io_schedule();
 	}
 	set_current_state(TASK_RUNNING);
 
-	if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
+	if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
 		rw &= ~(1 << BIO_RW_BARRIER);
 		goto retry;
 	}
 
 	if (error_bits)
-		*error_bits = io.error_bits;
+		*error_bits = io->error_bits;
 
-	return io.error_bits ? -EIO : 0;
+	return io->error_bits ? -EIO : 0;
 }
 
 static int async_io(struct dm_io_client *client, unsigned int num_regions,
-- 
cgit v1.2.3


From 4184153f9e483f9bb63339ed316e059962fe9794 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:59 +0000
Subject: dm raid1: support flush

Flush support for dm-raid1.

When it receives an empty barrier, submit it to all the devices via dm-io.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c       | 13 +++++++++++--
 drivers/md/dm-region-hash.c | 25 +++++++++++++++++++++++--
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index cc9dc79b078..752a29e1855 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -396,6 +396,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
  */
 static sector_t map_sector(struct mirror *m, struct bio *bio)
 {
+	if (unlikely(!bio->bi_size))
+		return 0;
 	return m->offset + (bio->bi_sector - m->ms->ti->begin);
 }
 
@@ -562,7 +564,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct dm_io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
+		.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
 		.mem.type = DM_IO_BVEC,
 		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
 		.notify.fn = write_callback,
@@ -603,6 +605,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
+		if (unlikely(bio_empty_barrier(bio))) {
+			bio_list_add(&sync, bio);
+			continue;
+		}
+
 		region = dm_rh_bio_to_region(ms->rh, bio);
 
 		if (log->type->is_remote_recovering &&
@@ -995,6 +1002,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->private = ms;
 	ti->split_io = dm_rh_get_region_size(ms->rh);
+	ti->num_flush_requests = 1;
 
 	ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
 	if (!ms->kmirrord_wq) {
@@ -1122,7 +1130,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		dm_rh_dec(ms->rh, map_context->ll);
+		if (likely(!bio_empty_barrier(bio)))
+			dm_rh_dec(ms->rh, map_context->ll);
 		return error;
 	}
 
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 36dbe29f2fd..00806b760cc 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -79,6 +79,11 @@ struct dm_region_hash {
 	struct list_head recovered_regions;
 	struct list_head failed_recovered_regions;
 
+	/*
+	 * If there was a barrier failure no regions can be marked clean.
+	 */
+	int barrier_failure;
+
 	void *context;
 	sector_t target_begin;
 
@@ -211,6 +216,7 @@ struct dm_region_hash *dm_region_hash_create(
 	INIT_LIST_HEAD(&rh->quiesced_regions);
 	INIT_LIST_HEAD(&rh->recovered_regions);
 	INIT_LIST_HEAD(&rh->failed_recovered_regions);
+	rh->barrier_failure = 0;
 
 	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
 						      sizeof(struct dm_region));
@@ -395,6 +401,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
 	region_t region = dm_rh_bio_to_region(rh, bio);
 	int recovering = 0;
 
+	if (bio_empty_barrier(bio)) {
+		rh->barrier_failure = 1;
+		return;
+	}
+
 	/* We must inform the log that the sync count has changed. */
 	log->type->set_region_sync(log, region, 0);
 
@@ -515,8 +526,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
 {
 	struct bio *bio;
 
-	for (bio = bios->head; bio; bio = bio->bi_next)
+	for (bio = bios->head; bio; bio = bio->bi_next) {
+		if (bio_empty_barrier(bio))
+			continue;
 		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
+	}
 }
 EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
 
@@ -544,7 +558,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
 		 */
 
 		/* do nothing for DM_RH_NOSYNC */
-		if (reg->state == DM_RH_RECOVERING) {
+		if (unlikely(rh->barrier_failure)) {
+			/*
+			 * If a write barrier failed some time ago, we
+			 * don't know whether or not this write made it
+			 * to the disk, so we must resync the device.
+			 */
+			reg->state = DM_RH_NOSYNC;
+		} else if (reg->state == DM_RH_RECOVERING) {
 			list_add_tail(&reg->list, &rh->quiesced_regions);
 		} else if (reg->state == DM_RH_DIRTY) {
 			reg->state = DM_RH_CLEAN;
-- 
cgit v1.2.3


From b09acf1aa79462bdacfe6744b469a17722a52702 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:51:59 +0000
Subject: dm raid1: split touched state into two

Split the variable "touched" into two, "touched_dirtied" and
"touched_cleaned", set when some region was dirtied or cleaned.

This will be used to optimize flushes.

After a transition from "dirty" to "clean" state we don't have flush hardware
cache on the log device. After a transition from "clean" to "dirty" the cache
must be flushed.

Before a transition from "clean" to "dirty" state we don't have to flush all
the raid legs. Before a transition from "dirty" to "clean" we must flush all
the legs to make sure that they are really in sync.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 9443896ede0..31dc33df95c 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -208,7 +208,8 @@ struct log_header {
 
 struct log_c {
 	struct dm_target *ti;
-	int touched;
+	int touched_dirtied;
+	int touched_cleaned;
 	uint32_t region_size;
 	unsigned int region_count;
 	region_t sync_count;
@@ -253,14 +254,14 @@ static inline void log_set_bit(struct log_c *l,
 			       uint32_t *bs, unsigned bit)
 {
 	ext2_set_bit(bit, (unsigned long *) bs);
-	l->touched = 1;
+	l->touched_cleaned = 1;
 }
 
 static inline void log_clear_bit(struct log_c *l,
 				 uint32_t *bs, unsigned bit)
 {
 	ext2_clear_bit(bit, (unsigned long *) bs);
-	l->touched = 1;
+	l->touched_dirtied = 1;
 }
 
 /*----------------------------------------------------------------
@@ -378,7 +379,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	}
 
 	lc->ti = ti;
-	lc->touched = 0;
+	lc->touched_dirtied = 0;
+	lc->touched_cleaned = 0;
 	lc->region_size = region_size;
 	lc->region_count = region_count;
 	lc->sync = sync;
@@ -660,14 +662,16 @@ static int disk_flush(struct dm_dirty_log *log)
 	struct log_c *lc = (struct log_c *) log->context;
 
 	/* only write if the log has changed */
-	if (!lc->touched)
+	if (!lc->touched_cleaned && !lc->touched_dirtied)
 		return 0;
 
 	r = rw_header(lc, WRITE);
 	if (r)
 		fail_log_device(lc);
-	else
-		lc->touched = 0;
+	else {
+		lc->touched_dirtied = 0;
+		lc->touched_cleaned = 0;
+	}
 
 	return r;
 }
-- 
cgit v1.2.3


From 20a34a8ecc7d03eaa5054f58169ebff12f5f1f8c Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:00 +0000
Subject: dm log: add flush_header function

Introduce flush_header and use it to flush the log device.

Note that we don't have to flush if all the regions transition
from "dirty" to "clean" state.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 31dc33df95c..6b23631db5b 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -288,6 +288,19 @@ static int rw_header(struct log_c *lc, int rw)
 	return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
 }
 
+static int flush_header(struct log_c *lc)
+{
+	struct dm_io_region null_location = {
+		.bdev = lc->header_location.bdev,
+		.sector = 0,
+		.count = 0,
+	};
+
+	lc->io_req.bi_rw = WRITE_BARRIER;
+
+	return dm_io(&lc->io_req, 1, &null_location, NULL);
+}
+
 static int read_header(struct log_c *log)
 {
 	int r;
@@ -616,6 +629,8 @@ static int disk_resume(struct dm_dirty_log *log)
 
 	/* write the new header */
 	r = rw_header(lc, WRITE);
+	if (!r)
+		r = flush_header(lc);
 	if (r) {
 		DMWARN("%s: Failed to write header on dirty region log device",
 		       lc->log_dev->name);
@@ -669,7 +684,13 @@ static int disk_flush(struct dm_dirty_log *log)
 	if (r)
 		fail_log_device(lc);
 	else {
-		lc->touched_dirtied = 0;
+		if (lc->touched_dirtied) {
+			r = flush_header(lc);
+			if (r)
+				fail_log_device(lc);
+			else
+				lc->touched_dirtied = 0;
+		}
 		lc->touched_cleaned = 0;
 	}
 
-- 
cgit v1.2.3


From 5adc78d0d231b030405b31759f125f13404fdb64 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:00 +0000
Subject: dm log: introduce flush_failed variable

Introduce "flush failed" variable.  When a flush before clearing a bit
in the log fails, we don't know anything about which which regions are
in-sync and which not.

So we need to set all regions as not-in-sync and set the variable
"flush_failed" to prevent setting the in-sync bit in the future.

A target reload is the only way to get out of this situation.

The variable will be set in following patches.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6b23631db5b..d779f8c915d 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -210,6 +210,7 @@ struct log_c {
 	struct dm_target *ti;
 	int touched_dirtied;
 	int touched_cleaned;
+	int flush_failed;
 	uint32_t region_size;
 	unsigned int region_count;
 	region_t sync_count;
@@ -394,6 +395,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	lc->ti = ti;
 	lc->touched_dirtied = 0;
 	lc->touched_cleaned = 0;
+	lc->flush_failed = 0;
 	lc->region_size = region_size;
 	lc->region_count = region_count;
 	lc->sync = sync;
@@ -706,7 +708,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region)
 static void core_clear_region(struct dm_dirty_log *log, region_t region)
 {
 	struct log_c *lc = (struct log_c *) log->context;
-	log_set_bit(lc, lc->clean_bits, region);
+	if (likely(!lc->flush_failed))
+		log_set_bit(lc, lc->clean_bits, region);
 }
 
 static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
-- 
cgit v1.2.3


From 87a8f240e9bcf025ba45e4563c842b0d59c5e8ef Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:01 +0000
Subject: dm log: add flush callback fn

Introduce a callback pointer from the log to dm-raid1 layer.

Before some region is set as "in-sync", we need to flush hardware cache on
all the disks. But the log module doesn't have access to the mirror_set
structure. So it will use this callback.

So far the callback is unused, it will be used in further patches.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c          | 6 ++++--
 drivers/md/dm-raid1.c        | 2 +-
 include/linux/dm-dirty-log.h | 6 ++++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index d779f8c915d..666a80e3602 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
 EXPORT_SYMBOL(dm_dirty_log_type_unregister);
 
 struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
-					 struct dm_target *ti,
-					 unsigned int argc, char **argv)
+			struct dm_target *ti,
+			int (*flush_callback_fn)(struct dm_target *ti),
+			unsigned int argc, char **argv)
 {
 	struct dm_dirty_log_type *type;
 	struct dm_dirty_log *log;
@@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
 		return NULL;
 	}
 
+	log->flush_callback_fn = flush_callback_fn;
 	log->type = type;
 	if (type->ctr(log, ti, argc, argv)) {
 		kfree(log);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 752a29e1855..d44bc497dad 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -896,7 +896,7 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2);
+	dl = dm_dirty_log_create(argv[0], ti, NULL, param_count, argv + 2);
 	if (!dl) {
 		ti->error = "Error creating mirror dirty log";
 		return NULL;
diff --git a/include/linux/dm-dirty-log.h b/include/linux/dm-dirty-log.h
index 5e8b11d88f6..7084503c340 100644
--- a/include/linux/dm-dirty-log.h
+++ b/include/linux/dm-dirty-log.h
@@ -21,6 +21,7 @@ struct dm_dirty_log_type;
 
 struct dm_dirty_log {
 	struct dm_dirty_log_type *type;
+	int (*flush_callback_fn)(struct dm_target *ti);
 	void *context;
 };
 
@@ -136,8 +137,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type);
  * type->constructor/destructor() directly.
  */
 struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
-					 struct dm_target *ti,
-					 unsigned argc, char **argv);
+			struct dm_target *ti,
+			int (*flush_callback_fn)(struct dm_target *ti),
+			unsigned argc, char **argv);
 void dm_dirty_log_destroy(struct dm_dirty_log *log);
 
 #endif	/* __KERNEL__ */
-- 
cgit v1.2.3


From 076010e2e6ea5b66dfd1f81a6133fb014c9b291d Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:01 +0000
Subject: dm log: use flush callback fn

Call the flush callback from the log.

If flush failed, we have no alternative but to mark the whole log as dirty.
Also we set the variable flush_failed to prevent any bits ever being marked as
clean again.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 666a80e3602..315e36a96b6 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -677,13 +677,26 @@ static int core_flush(struct dm_dirty_log *log)
 
 static int disk_flush(struct dm_dirty_log *log)
 {
-	int r;
-	struct log_c *lc = (struct log_c *) log->context;
+	int r, i;
+	struct log_c *lc = log->context;
 
 	/* only write if the log has changed */
 	if (!lc->touched_cleaned && !lc->touched_dirtied)
 		return 0;
 
+	if (lc->touched_cleaned && log->flush_callback_fn &&
+	    log->flush_callback_fn(lc->ti)) {
+		/*
+		 * At this point it is impossible to determine which
+		 * regions are clean and which are dirty (without
+		 * re-reading the log off disk). So mark all of them
+		 * dirty.
+		 */
+		lc->flush_failed = 1;
+		for (i = 0; i < lc->region_count; i++)
+			log_clear_bit(lc, lc->clean_bits, i);
+	}
+
 	r = rw_header(lc, WRITE);
 	if (r)
 		fail_log_device(lc);
-- 
cgit v1.2.3


From c0da3748b9a894b9f9b561ecc2d090a913988a0f Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:02 +0000
Subject: dm raid1: implement mirror_flush

Implement flush callee. It uses dm_io to send zero-size barrier synchronously
and concurrently to all the mirror legs.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d44bc497dad..751660b0c57 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -237,6 +237,40 @@ out:
 	schedule_work(&ms->trigger_event);
 }
 
+static int mirror_flush(struct dm_target *ti)
+{
+	struct mirror_set *ms = ti->private;
+	unsigned long error_bits;
+
+	unsigned int i;
+	struct dm_io_region io[ms->nr_mirrors];
+	struct mirror *m;
+	struct dm_io_request io_req = {
+		.bi_rw = WRITE_BARRIER,
+		.mem.type = DM_IO_KMEM,
+		.mem.ptr.bvec = NULL,
+		.client = ms->io_client,
+	};
+
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
+		io[i].bdev = m->dev->bdev;
+		io[i].sector = 0;
+		io[i].count = 0;
+	}
+
+	error_bits = -1;
+	dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
+	if (unlikely(error_bits != 0)) {
+		for (i = 0; i < ms->nr_mirrors; i++)
+			if (test_bit(i, &error_bits))
+				fail_mirror(ms->mirror + i,
+					    DM_RAID1_WRITE_ERROR);
+		return -EIO;
+	}
+
+	return 0;
+}
+
 /*-----------------------------------------------------------------
  * Recovery.
  *
@@ -896,7 +930,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	dl = dm_dirty_log_create(argv[0], ti, NULL, param_count, argv + 2);
+	dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
+				 argv + 2);
 	if (!dl) {
 		ti->error = "Error creating mirror dirty log";
 		return NULL;
-- 
cgit v1.2.3


From 64b30c46e866bbff8a9e17883a18636adc358455 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:02 +0000
Subject: dm raid1: report flush errors separately in status

Report flush errors as 'F' instead of 'D' for log and mirror devices.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c   | 16 ++++++++++++----
 drivers/md/dm-raid1.c |  6 ++++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 315e36a96b6..7035582786f 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -237,6 +237,7 @@ struct log_c {
 	 * Disk log fields
 	 */
 	int log_dev_failed;
+	int log_dev_flush_failed;
 	struct dm_dev *log_dev;
 	struct log_header header;
 
@@ -425,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	} else {
 		lc->log_dev = dev;
 		lc->log_dev_failed = 0;
+		lc->log_dev_flush_failed = 0;
 		lc->header_location.bdev = lc->log_dev->bdev;
 		lc->header_location.sector = 0;
 
@@ -633,8 +635,11 @@ static int disk_resume(struct dm_dirty_log *log)
 
 	/* write the new header */
 	r = rw_header(lc, WRITE);
-	if (!r)
+	if (!r) {
 		r = flush_header(lc);
+		if (r)
+			lc->log_dev_flush_failed = 1;
+	}
 	if (r) {
 		DMWARN("%s: Failed to write header on dirty region log device",
 		       lc->log_dev->name);
@@ -703,9 +708,10 @@ static int disk_flush(struct dm_dirty_log *log)
 	else {
 		if (lc->touched_dirtied) {
 			r = flush_header(lc);
-			if (r)
+			if (r) {
+				lc->log_dev_flush_failed = 1;
 				fail_log_device(lc);
-			else
+			} else
 				lc->touched_dirtied = 0;
 		}
 		lc->touched_cleaned = 0;
@@ -805,7 +811,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status,
 	switch(status) {
 	case STATUSTYPE_INFO:
 		DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
-		       lc->log_dev_failed ? 'D' : 'A');
+		       lc->log_dev_flush_failed ? 'F' :
+		       lc->log_dev_failed ? 'D' :
+		       'A');
 		break;
 
 	case STATUSTYPE_TABLE:
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 751660b0c57..85c8704c67b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
  *---------------------------------------------------------------*/
 enum dm_raid1_error {
 	DM_RAID1_WRITE_ERROR,
+	DM_RAID1_FLUSH_ERROR,
 	DM_RAID1_SYNC_ERROR,
 	DM_RAID1_READ_ERROR
 };
@@ -264,7 +265,7 @@ static int mirror_flush(struct dm_target *ti)
 		for (i = 0; i < ms->nr_mirrors; i++)
 			if (test_bit(i, &error_bits))
 				fail_mirror(ms->mirror + i,
-					    DM_RAID1_WRITE_ERROR);
+					    DM_RAID1_FLUSH_ERROR);
 		return -EIO;
 	}
 
@@ -1288,7 +1289,8 @@ static char device_status_char(struct mirror *m)
 	if (!atomic_read(&(m->error_count)))
 		return 'A';
 
-	return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
+	return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
+		(test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
 		(test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
 		(test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
 }
-- 
cgit v1.2.3


From 04788507686d184d8166918b70ef52311bc36dcb Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:03 +0000
Subject: dm raid1: add framework to hold bios during suspend

Add framework to delay bios until a suspend and then resubmit them with
either DM_ENDIO_REQUEUE (if the suspend was noflush) or complete them
with -EIO.  I/O barrier support will use this.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 85c8704c67b..895cce75eee 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -58,6 +58,7 @@ struct mirror_set {
 	struct bio_list reads;
 	struct bio_list writes;
 	struct bio_list failures;
+	struct bio_list holds;	/* bios are waiting until suspend */
 
 	struct dm_region_hash *rh;
 	struct dm_kcopyd_client *kcopyd_client;
@@ -450,6 +451,27 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
 	io->count = bio->bi_size >> 9;
 }
 
+static void hold_bio(struct mirror_set *ms, struct bio *bio)
+{
+	/*
+	 * If device is suspended, complete the bio.
+	 */
+	if (atomic_read(&ms->suspend)) {
+		if (dm_noflush_suspending(ms->ti))
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+		else
+			bio_endio(bio, -EIO);
+		return;
+	}
+
+	/*
+	 * Hold bio until the suspend is complete.
+	 */
+	spin_lock_irq(&ms->lock);
+	bio_list_add(&ms->holds, bio);
+	spin_unlock_irq(&ms->lock);
+}
+
 /*-----------------------------------------------------------------
  * Reads
  *---------------------------------------------------------------*/
@@ -1225,6 +1247,9 @@ static void mirror_presuspend(struct dm_target *ti)
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 
+	struct bio_list holds;
+	struct bio *bio;
+
 	atomic_set(&ms->suspend, 1);
 
 	/*
@@ -1247,6 +1272,22 @@ static void mirror_presuspend(struct dm_target *ti)
 	 * we know that all of our I/O has been pushed.
 	 */
 	flush_workqueue(ms->kmirrord_wq);
+
+	/*
+	 * Now set ms->suspend is set and the workqueue flushed, no more
+	 * entries can be added to ms->hold list, so process it.
+	 *
+	 * Bios can still arrive concurrently with or after this
+	 * presuspend function, but they cannot join the hold list
+	 * because ms->suspend is set.
+	 */
+	spin_lock_irq(&ms->lock);
+	holds = ms->holds;
+	bio_list_init(&ms->holds);
+	spin_unlock_irq(&ms->lock);
+
+	while ((bio = bio_list_pop(&holds)))
+		hold_bio(ms, bio);
 }
 
 static void mirror_postsuspend(struct dm_target *ti)
-- 
cgit v1.2.3


From 0f398a8403e31c737b429fddc3850093d0bf58d0 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:04 +0000
Subject: dm raid1: use hold framework in do_failures

Use the hold framework in do_failures.

This patch doesn't change the bio processing logic, it just simplifies
failure handling and avoids periodically polling the failures list.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 34 +++++++++-------------------------
 1 file changed, 9 insertions(+), 25 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 895cce75eee..0253130fa13 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -745,20 +745,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 {
 	struct bio *bio;
 
-	if (!failures->head)
+	if (likely(!failures->head))
 		return;
 
-	if (!ms->log_failure) {
-		while ((bio = bio_list_pop(failures))) {
-			ms->in_sync = 0;
-			dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
-		}
-		return;
-	}
-
 	/*
 	 * If the log has failed, unattempted writes are being
-	 * put on the failures list.  We can't issue those writes
+	 * put on the holds list.  We can't issue those writes
 	 * until a log has been marked, so we must store them.
 	 *
 	 * If a 'noflush' suspend is in progress, we can requeue
@@ -773,23 +765,15 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 	 * for us to treat them the same and requeue them
 	 * as well.
 	 */
-	if (dm_noflush_suspending(ms->ti)) {
-		while ((bio = bio_list_pop(failures)))
-			bio_endio(bio, DM_ENDIO_REQUEUE);
-		return;
-	}
 
-	if (atomic_read(&ms->suspend)) {
-		while ((bio = bio_list_pop(failures)))
-			bio_endio(bio, -EIO);
-		return;
+	while ((bio = bio_list_pop(failures))) {
+		if (ms->log_failure)
+			hold_bio(ms, bio);
+		else {
+			ms->in_sync = 0;
+			dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
+		}
 	}
-
-	spin_lock_irq(&ms->lock);
-	bio_list_merge(&ms->failures, failures);
-	spin_unlock_irq(&ms->lock);
-
-	delayed_wake(ms);
 }
 
 static void trigger_event(struct work_struct *work)
-- 
cgit v1.2.3


From 87968ddd2f3be1c21b932cac30157a83a1c4f935 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:04 +0000
Subject: dm raid1: abstract get_valid_mirror function

Move the logic to get a valid mirror leg into a function for re-use
in a later patch.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 0253130fa13..d1a7f1a4789 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -181,6 +181,17 @@ static void set_default_mirror(struct mirror *m)
 	atomic_set(&ms->default_mirror, m - m0);
 }
 
+static struct mirror *get_valid_mirror(struct mirror_set *ms)
+{
+	struct mirror *m;
+
+	for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
+		if (!atomic_read(&m->error_count))
+			return m;
+
+	return NULL;
+}
+
 /* fail_mirror
  * @m: mirror device to fail
  * @error_type: one of the enum's, DM_RAID1_*_ERROR
@@ -226,13 +237,10 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 		goto out;
 	}
 
-	for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
-		if (!atomic_read(&new->error_count)) {
-			set_default_mirror(new);
-			break;
-		}
-
-	if (unlikely(new == ms->mirror + ms->nr_mirrors))
+	new = get_valid_mirror(ms);
+	if (new)
+		set_default_mirror(new);
+	else
 		DMWARN("All sides of mirror have failed.");
 
 out:
-- 
cgit v1.2.3


From c58098be979509a54021e837a47fcad08db31f94 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:05 +0000
Subject: dm raid1: remove bio_endio from dm_rh_mark_nosync

Move bio completion out of dm_rh_mark_nosync in preparation for the
next patch.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c          | 3 ++-
 drivers/md/dm-region-hash.c    | 6 +-----
 include/linux/dm-region-hash.h | 3 +--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d1a7f1a4789..4f466ad7568 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -779,7 +779,8 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 			hold_bio(ms, bio);
 		else {
 			ms->in_sync = 0;
-			dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
+			dm_rh_mark_nosync(ms->rh, bio);
+			bio_endio(bio, 0);
 		}
 	}
 }
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 00806b760cc..5f19ceb6fe9 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -383,8 +383,6 @@ static void complete_resync_work(struct dm_region *reg, int success)
 /* dm_rh_mark_nosync
  * @ms
  * @bio
- * @done
- * @error
  *
  * The bio was written on some mirror(s) but failed on other mirror(s).
  * We can successfully endio the bio but should avoid the region being
@@ -392,8 +390,7 @@ static void complete_resync_work(struct dm_region *reg, int success)
  *
  * This function is _not_ safe in interrupt context!
  */
-void dm_rh_mark_nosync(struct dm_region_hash *rh,
-		       struct bio *bio, unsigned done, int error)
+void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
 {
 	unsigned long flags;
 	struct dm_dirty_log *log = rh->log;
@@ -430,7 +427,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
 	BUG_ON(!list_empty(&reg->list));
 	spin_unlock_irqrestore(&rh->region_lock, flags);
 
-	bio_endio(bio, error);
 	if (recovering)
 		complete_resync_work(reg, 0);
 }
diff --git a/include/linux/dm-region-hash.h b/include/linux/dm-region-hash.h
index a9e652a4137..9e2a7a401df 100644
--- a/include/linux/dm-region-hash.h
+++ b/include/linux/dm-region-hash.h
@@ -78,8 +78,7 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region);
 /* Delay bios on regions. */
 void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio);
 
-void dm_rh_mark_nosync(struct dm_region_hash *rh,
-		       struct bio *bio, unsigned done, int error);
+void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio);
 
 /*
  * Region recovery control.
-- 
cgit v1.2.3


From 60f355ead31e2be8d06ac8acb163df91a1c64e3b Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:05 +0000
Subject: dm raid1: hold write bios when errors are handled

Hold all write bios when errors are handled.

Previously the failures list was used only when handling errors with
a userspace daemon such as dmeventd.  Now, it is always used for all bios.
The regions where some writes failed must be marked as nosync. This can only
be done in process context (i.e. in raid1 workqueue), not in the
write_callback function.

Previously the write would succeed if writing to at least one leg
succeeded.  This is wrong because data from the failed leg may be
replicated to the correct leg.  Now, if using a userspace daemon, the
write with some failures will be held until the daemon has done its job
and reconfigured the array.  If not using a daemon, the write still
succeeds if at least one leg succeeds. This is bad, but it is consistent
with current behavior.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 63 ++++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 4f466ad7568..e363335e8d8 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -578,7 +578,6 @@ static void write_callback(unsigned long error, void *context)
 	unsigned i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
-	int uptodate = 0;
 	int should_wake = 0;
 	unsigned long flags;
 
@@ -591,36 +590,27 @@ static void write_callback(unsigned long error, void *context)
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
-	if (likely(!error))
-		goto out;
+	if (likely(!error)) {
+		bio_endio(bio, ret);
+		return;
+	}
 
 	for (i = 0; i < ms->nr_mirrors; i++)
 		if (test_bit(i, &error))
 			fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
-		else
-			uptodate = 1;
 
-	if (unlikely(!uptodate)) {
-		DMERR("All replicated volumes dead, failing I/O");
-		/* None of the writes succeeded, fail the I/O. */
-		ret = -EIO;
-	} else if (errors_handled(ms)) {
-		/*
-		 * Need to raise event.  Since raising
-		 * events can block, we need to do it in
-		 * the main thread.
-		 */
-		spin_lock_irqsave(&ms->lock, flags);
-		if (!ms->failures.head)
-			should_wake = 1;
-		bio_list_add(&ms->failures, bio);
-		spin_unlock_irqrestore(&ms->lock, flags);
-		if (should_wake)
-			wakeup_mirrord(ms);
-		return;
-	}
-out:
-	bio_endio(bio, ret);
+	/*
+	 * Need to raise event.  Since raising
+	 * events can block, we need to do it in
+	 * the main thread.
+	 */
+	spin_lock_irqsave(&ms->lock, flags);
+	if (!ms->failures.head)
+		should_wake = 1;
+	bio_list_add(&ms->failures, bio);
+	spin_unlock_irqrestore(&ms->lock, flags);
+	if (should_wake)
+		wakeup_mirrord(ms);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -773,15 +763,26 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 	 * for us to treat them the same and requeue them
 	 * as well.
 	 */
-
 	while ((bio = bio_list_pop(failures))) {
-		if (ms->log_failure)
-			hold_bio(ms, bio);
-		else {
+		if (!ms->log_failure) {
 			ms->in_sync = 0;
 			dm_rh_mark_nosync(ms->rh, bio);
-			bio_endio(bio, 0);
 		}
+
+		/*
+		 * If all the legs are dead, fail the I/O.
+		 * If we have been told to handle errors, hold the bio
+		 * and wait for userspace to deal with the problem.
+		 * Otherwise pretend that the I/O succeeded. (This would
+		 * be wrong if the failed leg returned after reboot and
+		 * got replicated back to the good legs.)
+		 */
+		if (!get_valid_mirror(ms))
+			bio_endio(bio, -EIO);
+		else if (errors_handled(ms))
+			hold_bio(ms, bio);
+		else
+			bio_endio(bio, 0);
 	}
 }
 
-- 
cgit v1.2.3


From 929be8fcb4b4b65d038e73d3bb34715851a95ca2 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:06 +0000
Subject: dm raid1: hold all write bios when leg fails

Hold all write bios when leg fails and errors are handled

When using a userspace daemon such as dmeventd to handle errors, we must
delay completing  bios until it has done its job.
This patch prevents the following race:
  - primary leg fails
  - write "1" fail, the write is held, secondary leg is set default
  - write "2" goes straight to the secondary leg

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index e363335e8d8..f8d7b3aa46c 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -69,6 +69,7 @@ struct mirror_set {
 	region_t nr_regions;
 	int in_sync;
 	int log_failure;
+	int leg_failure;
 	atomic_t suspend;
 
 	atomic_t default_mirror;	/* Default mirror */
@@ -211,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	struct mirror_set *ms = m->ms;
 	struct mirror *new;
 
+	ms->leg_failure = 1;
+
 	/*
 	 * error_count is used for nothing more than a
 	 * simple way to tell if a device has encountered
@@ -734,8 +737,12 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		dm_rh_delay(ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(get_default_mirror(ms), bio);
-		generic_make_request(bio);
+		if (unlikely(ms->leg_failure) && errors_handled(ms))
+			hold_bio(ms, bio);
+		else {
+			map_bio(get_default_mirror(ms), bio);
+			generic_make_request(bio);
+		}
 	}
 }
 
@@ -848,6 +855,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
 	ms->log_failure = 0;
+	ms->leg_failure = 0;
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
-- 
cgit v1.2.3


From 5339fc2d47d1d720e027b9b832bf5aae8fba2ac0 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:06 +0000
Subject: dm raid1: explicitly initialise bio_lists

Explicitly initialize bio lists instead of relying on kzalloc.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Takahiro Yasui <tyasui@redhat.com>
Tested-by: Takahiro Yasui <tyasui@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index f8d7b3aa46c..ad779bd13ae 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -849,6 +849,10 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	}
 
 	spin_lock_init(&ms->lock);
+	bio_list_init(&ms->reads);
+	bio_list_init(&ms->writes);
+	bio_list_init(&ms->failures);
+	bio_list_init(&ms->holds);
 
 	ms->ti = ti;
 	ms->nr_mirrors = nr_mirrors;
-- 
cgit v1.2.3


From a518b86d0b1b6a474f154697dc6f33e0a317ae72 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 10 Dec 2009 23:52:07 +0000
Subject: dm ioctl: prefer strlcpy over strncpy

strlcpy() will always null terminate the string.

    The code should already guarantee this as the last bytes are already
    NULs and the string lengths were restricted before being stored in
    hc.  Removing the '-1' becomes necessary so strlcpy() doesn't
    lose the last character of a maximum-length string.
	- agk

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index d19854c9818..99de0e4ce83 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -645,9 +645,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
 		 * Sneakily write in both the name and the uuid
 		 * while we have the cell.
 		 */
-		strncpy(param->name, hc->name, sizeof(param->name));
+		strlcpy(param->name, hc->name, sizeof(param->name));
 		if (hc->uuid)
-			strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1);
+			strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
 		else
 			param->uuid[0] = '\0';
 
-- 
cgit v1.2.3


From f5acc834287dd4f6cf712093ce0e779e38caad70 Mon Sep 17 00:00:00 2001
From: Jon Brassow <jbrassow@redhat.com>
Date: Thu, 10 Dec 2009 23:52:07 +0000
Subject: dm snapshot: avoid else clause in persistent_read_metadata

Minor code touch-up.  We don't need the 'else'.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap-persistent.c | 46 +++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 0c746420c00..7e855fbeb22 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -552,35 +552,31 @@ static int persistent_read_metadata(struct dm_exception_store *store,
 		ps->current_area = 0;
 		zero_memory_area(ps);
 		r = zero_disk_area(ps, 0);
-		if (r) {
+		if (r)
 			DMWARN("zero_disk_area(0) failed");
-			return r;
-		}
-	} else {
-		/*
-		 * Sanity checks.
-		 */
-		if (ps->version != SNAPSHOT_DISK_VERSION) {
-			DMWARN("unable to handle snapshot disk version %d",
-			       ps->version);
-			return -EINVAL;
-		}
+		return r;
+	}
+	/*
+	 * Sanity checks.
+	 */
+	if (ps->version != SNAPSHOT_DISK_VERSION) {
+		DMWARN("unable to handle snapshot disk version %d",
+		       ps->version);
+		return -EINVAL;
+	}
 
-		/*
-		 * Metadata are valid, but snapshot is invalidated
-		 */
-		if (!ps->valid)
-			return 1;
+	/*
+	 * Metadata are valid, but snapshot is invalidated
+	 */
+	if (!ps->valid)
+		return 1;
 
-		/*
-		 * Read the metadata.
-		 */
-		r = read_exceptions(ps, callback, callback_context);
-		if (r)
-			return r;
-	}
+	/*
+	 * Read the metadata.
+	 */
+	r = read_exceptions(ps, callback, callback_context);
 
-	return 0;
+	return r;
 }
 
 static int persistent_prepare_exception(struct dm_exception_store *store,
-- 
cgit v1.2.3


From 102c6ddb1d081a6a1fede38c43a42c9811313ec7 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:08 +0000
Subject: dm snapshot: simplify sector_to_chunk expression

Removed unnecessary 'and' masking: The right shift discards the lower
bits so there is no need to clear them.

(A later patch needs this change to support a 32-bit chunk_mask.)

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 8a223a48802..5f9315b32a4 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -162,7 +162,7 @@ static inline sector_t get_dev_size(struct block_device *bdev)
 static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
 				      sector_t sector)
 {
-	return (sector & ~store->chunk_mask) >> store->chunk_shift;
+	return sector >> store->chunk_shift;
 }
 
 int dm_exception_store_type_register(struct dm_exception_store_type *type);
-- 
cgit v1.2.3


From 7e201b35132a1f02c931a0a06760766c846bb49b Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:08 +0000
Subject: dm snapshot: abstract minimum_chunk_size fn

The origin needs to find minimum chunksize of all snapshots.  This logic is
moved to a separate function because it will be used at another place in
the snapshot merge patches.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 8a4a9c838af..48978ab42ae 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -523,6 +523,25 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 	return 0;
 }
 
+#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
+
+/*
+ * Return a minimum chunk size of all snapshots that have the specified origin.
+ * Return zero if the origin has no snapshots.
+ */
+static sector_t __minimum_chunk_size(struct origin *o)
+{
+	struct dm_snapshot *snap;
+	unsigned chunk_size = 0;
+
+	if (o)
+		list_for_each_entry(snap, &o->snapshots, list)
+			chunk_size = min_not_zero(chunk_size,
+						  snap->store->chunk_size);
+
+	return chunk_size;
+}
+
 /*
  * Hard coded magic.
  */
@@ -1395,8 +1414,6 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
 	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
 }
 
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
-
 /*
  * Set the target "split_io" field to the minimum of all the snapshots'
  * chunk sizes.
@@ -1404,19 +1421,12 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
 static void origin_resume(struct dm_target *ti)
 {
 	struct dm_dev *dev = ti->private;
-	struct dm_snapshot *snap;
-	struct origin *o;
-	unsigned chunk_size = 0;
 
 	down_read(&_origins_lock);
-	o = __lookup_origin(dev->bdev);
-	if (o)
-		list_for_each_entry (snap, &o->snapshots, list)
-			chunk_size = min_not_zero(chunk_size,
-						  snap->store->chunk_size);
-	up_read(&_origins_lock);
 
-	ti->split_io = chunk_size;
+	ti->split_io = __minimum_chunk_size(__lookup_origin(dev->bdev));
+
+	up_read(&_origins_lock);
 }
 
 static int origin_status(struct dm_target *ti, status_type_t type, char *result,
-- 
cgit v1.2.3


From d32a6ea65fbc33621f9c790da3dff10201640b2a Mon Sep 17 00:00:00 2001
From: Jon Brassow <jbrassow@redhat.com>
Date: Thu, 10 Dec 2009 23:52:09 +0000
Subject: dm snapshot: consolidate insert exception functions

Consolidate the insert_*exception functions.  'insert_completed_exception'
already contains all the logic to handle 'insert_exception' (via
check for a hash_shift of 0), so remove redundant function.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 48978ab42ae..9135498213e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -390,13 +390,6 @@ static uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
 	return (chunk >> et->hash_shift) & et->hash_mask;
 }
 
-static void insert_exception(struct exception_table *eh,
-			     struct dm_snap_exception *e)
-{
-	struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
-	list_add(&e->hash_list, l);
-}
-
 static void remove_exception(struct dm_snap_exception *e)
 {
 	list_del(&e->hash_list);
@@ -457,10 +450,9 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 	atomic_dec(&s->pending_exceptions_count);
 }
 
-static void insert_completed_exception(struct dm_snapshot *s,
-				       struct dm_snap_exception *new_e)
+static void insert_exception(struct exception_table *eh,
+			     struct dm_snap_exception *new_e)
 {
-	struct exception_table *eh = &s->complete;
 	struct list_head *l;
 	struct dm_snap_exception *e = NULL;
 
@@ -518,7 +510,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 	/* Consecutive_count is implicitly initialised to zero */
 	e->new_chunk = new;
 
-	insert_completed_exception(s, e);
+	insert_exception(&s->complete, e);
 
 	return 0;
 }
@@ -925,7 +917,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 	 * Add a proper exception, and remove the
 	 * in-flight exception from the list.
 	 */
-	insert_completed_exception(s, e);
+	insert_exception(&s->complete, e);
 
  out:
 	remove_exception(&pe->e);
-- 
cgit v1.2.3


From 1d4989c858093bda0426be536fc7f9c415857836 Mon Sep 17 00:00:00 2001
From: Jon Brassow <jbrassow@redhat.com>
Date: Thu, 10 Dec 2009 23:52:10 +0000
Subject: dm snapshot: rename dm_snap_exception to dm_exception

The exception structure is not necessarily just a snapshot
element (especially after we pull it out of dm-snap.c).

Renaming appropriately.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.h | 14 +++++++-------
 drivers/md/dm-snap-persistent.c |  4 ++--
 drivers/md/dm-snap-transient.c  |  4 ++--
 drivers/md/dm-snap.c            | 34 +++++++++++++++++-----------------
 4 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 5f9315b32a4..92749696e35 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -26,7 +26,7 @@ typedef sector_t chunk_t;
  * of chunks that follow contiguously.  Remaining bits hold the number of the
  * chunk within the device.
  */
-struct dm_snap_exception {
+struct dm_exception {
 	struct list_head hash_list;
 
 	chunk_t old_chunk;
@@ -64,13 +64,13 @@ struct dm_exception_store_type {
 	 * Find somewhere to store the next exception.
 	 */
 	int (*prepare_exception) (struct dm_exception_store *store,
-				  struct dm_snap_exception *e);
+				  struct dm_exception *e);
 
 	/*
 	 * Update the metadata with this exception.
 	 */
 	void (*commit_exception) (struct dm_exception_store *store,
-				  struct dm_snap_exception *e,
+				  struct dm_exception *e,
 				  void (*callback) (void *, int success),
 				  void *callback_context);
 
@@ -120,12 +120,12 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
 	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
 }
 
-static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
 {
 	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
 }
 
-static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
 {
 	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
 
@@ -140,12 +140,12 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
 	return chunk;
 }
 
-static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
 {
 	return 0;
 }
 
-static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
 {
 }
 
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 7e855fbeb22..24b8acd1be8 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -580,7 +580,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
 }
 
 static int persistent_prepare_exception(struct dm_exception_store *store,
-					struct dm_snap_exception *e)
+					struct dm_exception *e)
 {
 	struct pstore *ps = get_info(store);
 	uint32_t stride;
@@ -607,7 +607,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
 }
 
 static void persistent_commit_exception(struct dm_exception_store *store,
-					struct dm_snap_exception *e,
+					struct dm_exception *e,
 					void (*callback) (void *, int success),
 					void *callback_context)
 {
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index cde5aa558e6..267801b34ff 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -36,7 +36,7 @@ static int transient_read_metadata(struct dm_exception_store *store,
 }
 
 static int transient_prepare_exception(struct dm_exception_store *store,
-				       struct dm_snap_exception *e)
+				       struct dm_exception *e)
 {
 	struct transient_c *tc = store->context;
 	sector_t size = get_dev_size(store->cow->bdev);
@@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store,
 }
 
 static void transient_commit_exception(struct dm_exception_store *store,
-				       struct dm_snap_exception *e,
+				       struct dm_exception *e,
 				       void (*callback) (void *, int success),
 				       void *callback_context)
 {
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9135498213e..a7d60f64406 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -116,7 +116,7 @@ static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
 }
 
 struct dm_snap_pending_exception {
-	struct dm_snap_exception e;
+	struct dm_exception e;
 
 	/*
 	 * Origin buffers waiting for this to complete are held
@@ -371,7 +371,7 @@ static int init_exception_table(struct exception_table *et, uint32_t size,
 static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem)
 {
 	struct list_head *slot;
-	struct dm_snap_exception *ex, *next;
+	struct dm_exception *ex, *next;
 	int i, size;
 
 	size = et->hash_mask + 1;
@@ -390,7 +390,7 @@ static uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
 	return (chunk >> et->hash_shift) & et->hash_mask;
 }
 
-static void remove_exception(struct dm_snap_exception *e)
+static void remove_exception(struct dm_exception *e)
 {
 	list_del(&e->hash_list);
 }
@@ -399,11 +399,11 @@ static void remove_exception(struct dm_snap_exception *e)
  * Return the exception data for a sector, or NULL if not
  * remapped.
  */
-static struct dm_snap_exception *lookup_exception(struct exception_table *et,
+static struct dm_exception *lookup_exception(struct exception_table *et,
 						  chunk_t chunk)
 {
 	struct list_head *slot;
-	struct dm_snap_exception *e;
+	struct dm_exception *e;
 
 	slot = &et->table[exception_hash(et, chunk)];
 	list_for_each_entry (e, slot, hash_list)
@@ -414,9 +414,9 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et,
 	return NULL;
 }
 
-static struct dm_snap_exception *alloc_exception(void)
+static struct dm_exception *alloc_exception(void)
 {
-	struct dm_snap_exception *e;
+	struct dm_exception *e;
 
 	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
 	if (!e)
@@ -425,7 +425,7 @@ static struct dm_snap_exception *alloc_exception(void)
 	return e;
 }
 
-static void free_exception(struct dm_snap_exception *e)
+static void free_exception(struct dm_exception *e)
 {
 	kmem_cache_free(exception_cache, e);
 }
@@ -451,10 +451,10 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 }
 
 static void insert_exception(struct exception_table *eh,
-			     struct dm_snap_exception *new_e)
+			     struct dm_exception *new_e)
 {
 	struct list_head *l;
-	struct dm_snap_exception *e = NULL;
+	struct dm_exception *e = NULL;
 
 	l = &eh->table[exception_hash(eh, new_e->old_chunk)];
 
@@ -499,7 +499,7 @@ out:
 static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 {
 	struct dm_snapshot *s = context;
-	struct dm_snap_exception *e;
+	struct dm_exception *e;
 
 	e = alloc_exception();
 	if (!e)
@@ -876,7 +876,7 @@ static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
 
 static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 {
-	struct dm_snap_exception *e;
+	struct dm_exception *e;
 	struct dm_snapshot *s = pe->snap;
 	struct bio *origin_bios = NULL;
 	struct bio *snapshot_bios = NULL;
@@ -988,7 +988,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 static struct dm_snap_pending_exception *
 __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
 {
-	struct dm_snap_exception *e = lookup_exception(&s->pending, chunk);
+	struct dm_exception *e = lookup_exception(&s->pending, chunk);
 
 	if (!e)
 		return NULL;
@@ -1034,7 +1034,7 @@ __find_pending_exception(struct dm_snapshot *s,
 	return pe;
 }
 
-static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
+static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
 			    struct bio *bio, chunk_t chunk)
 {
 	bio->bi_bdev = s->store->cow->bdev;
@@ -1048,7 +1048,7 @@ static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
 static int snapshot_map(struct dm_target *ti, struct bio *bio,
 			union map_info *map_context)
 {
-	struct dm_snap_exception *e;
+	struct dm_exception *e;
 	struct dm_snapshot *s = ti->private;
 	int r = DM_MAPIO_REMAPPED;
 	chunk_t chunk;
@@ -1221,7 +1221,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
 {
 	int r = DM_MAPIO_REMAPPED, first = 0;
 	struct dm_snapshot *snap;
-	struct dm_snap_exception *e;
+	struct dm_exception *e;
 	struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL;
 	chunk_t chunk;
 	LIST_HEAD(pe_queue);
@@ -1500,7 +1500,7 @@ static int __init dm_snapshot_init(void)
 		goto bad2;
 	}
 
-	exception_cache = KMEM_CACHE(dm_snap_exception, 0);
+	exception_cache = KMEM_CACHE(dm_exception, 0);
 	if (!exception_cache) {
 		DMERR("Couldn't create exception cache.");
 		r = -ENOMEM;
-- 
cgit v1.2.3


From 191437a53c8269df3a2c6199206781e742c57bb5 Mon Sep 17 00:00:00 2001
From: Jon Brassow <jbrassow@redhat.com>
Date: Thu, 10 Dec 2009 23:52:10 +0000
Subject: dm snapshot: rename exception_table to dm_exception_table

Rename exception_table for broader use outside dm-snap.c

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index a7d60f64406..f40331cb1f6 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -49,7 +49,7 @@
 #define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
 					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
 
-struct exception_table {
+struct dm_exception_table {
 	uint32_t hash_mask;
 	unsigned hash_shift;
 	struct list_head *table;
@@ -73,8 +73,8 @@ struct dm_snapshot {
 
 	atomic_t pending_exceptions_count;
 
-	struct exception_table pending;
-	struct exception_table complete;
+	struct dm_exception_table pending;
+	struct dm_exception_table complete;
 
 	/*
 	 * pe_lock protects all pending_exception operations and access
@@ -351,7 +351,7 @@ static void unregister_snapshot(struct dm_snapshot *s)
  * The lowest hash_shift bits of the chunk number are ignored, allowing
  * some consecutive chunks to be grouped together.
  */
-static int init_exception_table(struct exception_table *et, uint32_t size,
+static int init_exception_table(struct dm_exception_table *et, uint32_t size,
 				unsigned hash_shift)
 {
 	unsigned int i;
@@ -368,7 +368,8 @@ static int init_exception_table(struct exception_table *et, uint32_t size,
 	return 0;
 }
 
-static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem)
+static void exit_exception_table(struct dm_exception_table *et,
+				 struct kmem_cache *mem)
 {
 	struct list_head *slot;
 	struct dm_exception *ex, *next;
@@ -385,7 +386,7 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache *
 	vfree(et->table);
 }
 
-static uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
+static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
 {
 	return (chunk >> et->hash_shift) & et->hash_mask;
 }
@@ -399,7 +400,7 @@ static void remove_exception(struct dm_exception *e)
  * Return the exception data for a sector, or NULL if not
  * remapped.
  */
-static struct dm_exception *lookup_exception(struct exception_table *et,
+static struct dm_exception *lookup_exception(struct dm_exception_table *et,
 						  chunk_t chunk)
 {
 	struct list_head *slot;
@@ -450,7 +451,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 	atomic_dec(&s->pending_exceptions_count);
 }
 
-static void insert_exception(struct exception_table *eh,
+static void insert_exception(struct dm_exception_table *eh,
 			     struct dm_exception *new_e)
 {
 	struct list_head *l;
-- 
cgit v1.2.3


From 3510cb94ff7b04b016bd22bfee913e2c1c05c066 Mon Sep 17 00:00:00 2001
From: Jon Brassow <jbrassow@redhat.com>
Date: Thu, 10 Dec 2009 23:52:11 +0000
Subject: dm snapshot: rename exception functions

Rename exception functions.  Preparing to pull them out of
dm-snap.c for broader use.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 66 ++++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f40331cb1f6..cb4c2c3a43f 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -351,8 +351,8 @@ static void unregister_snapshot(struct dm_snapshot *s)
  * The lowest hash_shift bits of the chunk number are ignored, allowing
  * some consecutive chunks to be grouped together.
  */
-static int init_exception_table(struct dm_exception_table *et, uint32_t size,
-				unsigned hash_shift)
+static int dm_exception_table_init(struct dm_exception_table *et,
+				   uint32_t size, unsigned hash_shift)
 {
 	unsigned int i;
 
@@ -368,8 +368,8 @@ static int init_exception_table(struct dm_exception_table *et, uint32_t size,
 	return 0;
 }
 
-static void exit_exception_table(struct dm_exception_table *et,
-				 struct kmem_cache *mem)
+static void dm_exception_table_exit(struct dm_exception_table *et,
+				    struct kmem_cache *mem)
 {
 	struct list_head *slot;
 	struct dm_exception *ex, *next;
@@ -391,7 +391,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
 	return (chunk >> et->hash_shift) & et->hash_mask;
 }
 
-static void remove_exception(struct dm_exception *e)
+static void dm_remove_exception(struct dm_exception *e)
 {
 	list_del(&e->hash_list);
 }
@@ -400,8 +400,8 @@ static void remove_exception(struct dm_exception *e)
  * Return the exception data for a sector, or NULL if not
  * remapped.
  */
-static struct dm_exception *lookup_exception(struct dm_exception_table *et,
-						  chunk_t chunk)
+static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
+						chunk_t chunk)
 {
 	struct list_head *slot;
 	struct dm_exception *e;
@@ -415,7 +415,7 @@ static struct dm_exception *lookup_exception(struct dm_exception_table *et,
 	return NULL;
 }
 
-static struct dm_exception *alloc_exception(void)
+static struct dm_exception *alloc_completed_exception(void)
 {
 	struct dm_exception *e;
 
@@ -426,7 +426,7 @@ static struct dm_exception *alloc_exception(void)
 	return e;
 }
 
-static void free_exception(struct dm_exception *e)
+static void free_completed_exception(struct dm_exception *e)
 {
 	kmem_cache_free(exception_cache, e);
 }
@@ -451,8 +451,8 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 	atomic_dec(&s->pending_exceptions_count);
 }
 
-static void insert_exception(struct dm_exception_table *eh,
-			     struct dm_exception *new_e)
+static void dm_insert_exception(struct dm_exception_table *eh,
+				struct dm_exception *new_e)
 {
 	struct list_head *l;
 	struct dm_exception *e = NULL;
@@ -471,7 +471,7 @@ static void insert_exception(struct dm_exception_table *eh,
 		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
 					 dm_consecutive_chunk_count(e) + 1)) {
 			dm_consecutive_chunk_count_inc(e);
-			free_exception(new_e);
+			free_completed_exception(new_e);
 			return;
 		}
 
@@ -481,7 +481,7 @@ static void insert_exception(struct dm_exception_table *eh,
 			dm_consecutive_chunk_count_inc(e);
 			e->old_chunk--;
 			e->new_chunk--;
-			free_exception(new_e);
+			free_completed_exception(new_e);
 			return;
 		}
 
@@ -502,7 +502,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 	struct dm_snapshot *s = context;
 	struct dm_exception *e;
 
-	e = alloc_exception();
+	e = alloc_completed_exception();
 	if (!e)
 		return -ENOMEM;
 
@@ -511,7 +511,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 	/* Consecutive_count is implicitly initialised to zero */
 	e->new_chunk = new;
 
-	insert_exception(&s->complete, e);
+	dm_insert_exception(&s->complete, e);
 
 	return 0;
 }
@@ -568,8 +568,8 @@ static int init_hash_tables(struct dm_snapshot *s)
 	if (hash_size < 64)
 		hash_size = 64;
 	hash_size = rounddown_pow_of_two(hash_size);
-	if (init_exception_table(&s->complete, hash_size,
-				 DM_CHUNK_CONSECUTIVE_BITS))
+	if (dm_exception_table_init(&s->complete, hash_size,
+				    DM_CHUNK_CONSECUTIVE_BITS))
 		return -ENOMEM;
 
 	/*
@@ -580,8 +580,8 @@ static int init_hash_tables(struct dm_snapshot *s)
 	if (hash_size < 64)
 		hash_size = 64;
 
-	if (init_exception_table(&s->pending, hash_size, 0)) {
-		exit_exception_table(&s->complete, exception_cache);
+	if (dm_exception_table_init(&s->pending, hash_size, 0)) {
+		dm_exception_table_exit(&s->complete, exception_cache);
 		return -ENOMEM;
 	}
 
@@ -716,8 +716,8 @@ bad_pending_pool:
 	dm_kcopyd_client_destroy(s->kcopyd_client);
 
 bad_kcopyd:
-	exit_exception_table(&s->pending, pending_cache);
-	exit_exception_table(&s->complete, exception_cache);
+	dm_exception_table_exit(&s->pending, pending_cache);
+	dm_exception_table_exit(&s->complete, exception_cache);
 
 bad_hash_tables:
 	dm_put_device(ti, s->origin);
@@ -737,8 +737,8 @@ static void __free_exceptions(struct dm_snapshot *s)
 	dm_kcopyd_client_destroy(s->kcopyd_client);
 	s->kcopyd_client = NULL;
 
-	exit_exception_table(&s->pending, pending_cache);
-	exit_exception_table(&s->complete, exception_cache);
+	dm_exception_table_exit(&s->pending, pending_cache);
+	dm_exception_table_exit(&s->complete, exception_cache);
 }
 
 static void snapshot_dtr(struct dm_target *ti)
@@ -891,7 +891,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 		goto out;
 	}
 
-	e = alloc_exception();
+	e = alloc_completed_exception();
 	if (!e) {
 		down_write(&s->lock);
 		__invalidate_snapshot(s, -ENOMEM);
@@ -902,7 +902,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 
 	down_write(&s->lock);
 	if (!s->valid) {
-		free_exception(e);
+		free_completed_exception(e);
 		error = 1;
 		goto out;
 	}
@@ -918,10 +918,10 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 	 * Add a proper exception, and remove the
 	 * in-flight exception from the list.
 	 */
-	insert_exception(&s->complete, e);
+	dm_insert_exception(&s->complete, e);
 
  out:
-	remove_exception(&pe->e);
+	dm_remove_exception(&pe->e);
 	snapshot_bios = bio_list_get(&pe->snapshot_bios);
 	origin_bios = put_pending_exception(pe);
 
@@ -989,7 +989,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 static struct dm_snap_pending_exception *
 __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
 {
-	struct dm_exception *e = lookup_exception(&s->pending, chunk);
+	struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
 
 	if (!e)
 		return NULL;
@@ -1030,7 +1030,7 @@ __find_pending_exception(struct dm_snapshot *s,
 	}
 
 	get_pending_exception(pe);
-	insert_exception(&s->pending, &pe->e);
+	dm_insert_exception(&s->pending, &pe->e);
 
 	return pe;
 }
@@ -1077,7 +1077,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	}
 
 	/* If the block is already remapped - use that, else remap it */
-	e = lookup_exception(&s->complete, chunk);
+	e = dm_lookup_exception(&s->complete, chunk);
 	if (e) {
 		remap_exception(s, e, bio, chunk);
 		goto out_unlock;
@@ -1101,7 +1101,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 				goto out_unlock;
 			}
 
-			e = lookup_exception(&s->complete, chunk);
+			e = dm_lookup_exception(&s->complete, chunk);
 			if (e) {
 				free_pending_exception(pe);
 				remap_exception(s, e, bio, chunk);
@@ -1254,7 +1254,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
 		 * ref_count is initialised to 1 so pending_complete()
 		 * won't destroy the primary_pe while we're inside this loop.
 		 */
-		e = lookup_exception(&snap->complete, chunk);
+		e = dm_lookup_exception(&snap->complete, chunk);
 		if (e)
 			goto next_snapshot;
 
@@ -1269,7 +1269,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
 				goto next_snapshot;
 			}
 
-			e = lookup_exception(&snap->complete, chunk);
+			e = dm_lookup_exception(&snap->complete, chunk);
 			if (e) {
 				free_pending_exception(pe);
 				goto next_snapshot;
-- 
cgit v1.2.3


From 985903bb3a6d98623360ab6c855417f638840029 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 10 Dec 2009 23:52:11 +0000
Subject: dm snapshot: add allocated metadata to snapshot status

Add number of sectors used by metadata to the end of the snapshot's status
line.

Renamed dm_exception_store_type's 'fraction_full' to 'usage'.  Renamed
arguments to be clearer about what is being returned.  Also added
'metadata_sectors'.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.h |  6 +++---
 drivers/md/dm-snap-persistent.c | 23 +++++++++++++++++------
 drivers/md/dm-snap-transient.c  | 15 +++++++++------
 drivers/md/dm-snap.c            | 21 ++++++++++++---------
 4 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 92749696e35..366c8b1fca3 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -86,9 +86,9 @@ struct dm_exception_store_type {
 	/*
 	 * Return how full the snapshot is.
 	 */
-	void (*fraction_full) (struct dm_exception_store *store,
-			       sector_t *numerator,
-			       sector_t *denominator);
+	void (*usage) (struct dm_exception_store *store,
+		       sector_t *total_sectors, sector_t *sectors_allocated,
+		       sector_t *metadata_sectors);
 
 	/* For internal device-mapper use only. */
 	struct list_head list;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 24b8acd1be8..767065f6c5f 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -489,11 +489,22 @@ static struct pstore *get_info(struct dm_exception_store *store)
 	return (struct pstore *) store->context;
 }
 
-static void persistent_fraction_full(struct dm_exception_store *store,
-				     sector_t *numerator, sector_t *denominator)
+static void persistent_usage(struct dm_exception_store *store,
+			     sector_t *total_sectors,
+			     sector_t *sectors_allocated,
+			     sector_t *metadata_sectors)
 {
-	*numerator = get_info(store)->next_free * store->chunk_size;
-	*denominator = get_dev_size(store->cow->bdev);
+	struct pstore *ps = get_info(store);
+
+	*sectors_allocated = ps->next_free * store->chunk_size;
+	*total_sectors = get_dev_size(store->cow->bdev);
+
+	/*
+	 * First chunk is the fixed header.
+	 * Then there are (ps->current_area + 1) metadata chunks, each one
+	 * separated from the next by ps->exceptions_per_area data chunks.
+	 */
+	*metadata_sectors = (ps->current_area + 2) * store->chunk_size;
 }
 
 static void persistent_dtr(struct dm_exception_store *store)
@@ -738,7 +749,7 @@ static struct dm_exception_store_type _persistent_type = {
 	.prepare_exception = persistent_prepare_exception,
 	.commit_exception = persistent_commit_exception,
 	.drop_snapshot = persistent_drop_snapshot,
-	.fraction_full = persistent_fraction_full,
+	.usage = persistent_usage,
 	.status = persistent_status,
 };
 
@@ -751,7 +762,7 @@ static struct dm_exception_store_type _persistent_compat_type = {
 	.prepare_exception = persistent_prepare_exception,
 	.commit_exception = persistent_commit_exception,
 	.drop_snapshot = persistent_drop_snapshot,
-	.fraction_full = persistent_fraction_full,
+	.usage = persistent_usage,
 	.status = persistent_status,
 };
 
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 267801b34ff..245a50c7337 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store,
 	callback(callback_context, 1);
 }
 
-static void transient_fraction_full(struct dm_exception_store *store,
-				    sector_t *numerator, sector_t *denominator)
+static void transient_usage(struct dm_exception_store *store,
+			    sector_t *total_sectors,
+			    sector_t *sectors_allocated,
+			    sector_t *metadata_sectors)
 {
-	*numerator = ((struct transient_c *) store->context)->next_free;
-	*denominator = get_dev_size(store->cow->bdev);
+	*sectors_allocated = ((struct transient_c *) store->context)->next_free;
+	*total_sectors = get_dev_size(store->cow->bdev);
+	*metadata_sectors = 0;
 }
 
 static int transient_ctr(struct dm_exception_store *store,
@@ -106,7 +109,7 @@ static struct dm_exception_store_type _transient_type = {
 	.read_metadata = transient_read_metadata,
 	.prepare_exception = transient_prepare_exception,
 	.commit_exception = transient_commit_exception,
-	.fraction_full = transient_fraction_full,
+	.usage = transient_usage,
 	.status = transient_status,
 };
 
@@ -118,7 +121,7 @@ static struct dm_exception_store_type _transient_compat_type = {
 	.read_metadata = transient_read_metadata,
 	.prepare_exception = transient_prepare_exception,
 	.commit_exception = transient_commit_exception,
-	.fraction_full = transient_fraction_full,
+	.usage = transient_usage,
 	.status = transient_status,
 };
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index cb4c2c3a43f..8bd77cbd7e4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1174,14 +1174,17 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 		if (!snap->valid)
 			DMEMIT("Invalid");
 		else {
-			if (snap->store->type->fraction_full) {
-				sector_t numerator, denominator;
-				snap->store->type->fraction_full(snap->store,
-								 &numerator,
-								 &denominator);
-				DMEMIT("%llu/%llu",
-				       (unsigned long long)numerator,
-				       (unsigned long long)denominator);
+			if (snap->store->type->usage) {
+				sector_t total_sectors, sectors_allocated,
+					 metadata_sectors;
+				snap->store->type->usage(snap->store,
+							 &total_sectors,
+							 &sectors_allocated,
+							 &metadata_sectors);
+				DMEMIT("%llu/%llu %llu",
+				       (unsigned long long)sectors_allocated,
+				       (unsigned long long)total_sectors,
+				       (unsigned long long)metadata_sectors);
 			}
 			else
 				DMEMIT("Unknown");
@@ -1462,7 +1465,7 @@ static struct target_type origin_target = {
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 7, 0},
+	.version = {1, 8, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
-- 
cgit v1.2.3


From fc56f6fbcca3672c63c93c65f45105faacfc13cb Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 10 Dec 2009 23:52:12 +0000
Subject: dm snapshot: move cow ref from exception store to snap core

Store the reference to the snapshot cow device in the core snapshot
code instead of each exception store.  It can be accessed through the
new function dm_snap_cow().  Exception stores should each now maintain a
reference to their parent snapshot struct.

This is cleaner and makes part of the forthcoming snapshot merge code simpler.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
---
 drivers/md/dm-exception-store.c | 30 ++++++-----------
 drivers/md/dm-exception-store.h | 12 +++++--
 drivers/md/dm-snap-persistent.c | 12 +++----
 drivers/md/dm-snap-transient.c  |  7 ++--
 drivers/md/dm-snap.c            | 74 +++++++++++++++++++++++++++--------------
 5 files changed, 78 insertions(+), 57 deletions(-)

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 205215968ff..2b7907b6dd0 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
 	}
 
 	/* Validate the chunk size against the device block size */
-	if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
+	if (chunk_size %
+	    (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
 		*error = "Chunk size is not a multiple of device blocksize";
 		return -EINVAL;
 	}
@@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
 }
 
 int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+			      struct dm_snapshot *snap,
 			      unsigned *args_used,
 			      struct dm_exception_store **store)
 {
@@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 	struct dm_exception_store *tmp_store;
 	char persistent;
 
-	if (argc < 3) {
+	if (argc < 2) {
 		ti->error = "Insufficient exception store arguments";
 		return -EINVAL;
 	}
@@ -209,7 +211,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 		return -ENOMEM;
 	}
 
-	persistent = toupper(*argv[1]);
+	persistent = toupper(*argv[0]);
 	if (persistent == 'P')
 		type = get_type("P");
 	else if (persistent == 'N')
@@ -227,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 	}
 
 	tmp_store->type = type;
-	tmp_store->ti = ti;
+	tmp_store->snap = snap;
 
-	r = dm_get_device(ti, argv[0], 0, 0,
-			  FMODE_READ | FMODE_WRITE, &tmp_store->cow);
-	if (r) {
-		ti->error = "Cannot get COW device";
-		goto bad_cow;
-	}
-
-	r = set_chunk_size(tmp_store, argv[2], &ti->error);
+	r = set_chunk_size(tmp_store, argv[1], &ti->error);
 	if (r)
-		goto bad_ctr;
+		goto bad;
 
 	r = type->ctr(tmp_store, 0, NULL);
 	if (r) {
 		ti->error = "Exception store type constructor failed";
-		goto bad_ctr;
+		goto bad;
 	}
 
-	*args_used = 3;
+	*args_used = 2;
 	*store = tmp_store;
 	return 0;
 
-bad_ctr:
-	dm_put_device(ti, tmp_store->cow);
-bad_cow:
+bad:
 	put_type(type);
 bad_type:
 	kfree(tmp_store);
@@ -263,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
 void dm_exception_store_destroy(struct dm_exception_store *store)
 {
 	store->type->dtr(store);
-	dm_put_device(store->ti, store->cow);
 	put_type(store->type);
 	kfree(store);
 }
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 366c8b1fca3..bb8874653de 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -94,11 +94,11 @@ struct dm_exception_store_type {
 	struct list_head list;
 };
 
+struct dm_snapshot;
+
 struct dm_exception_store {
 	struct dm_exception_store_type *type;
-	struct dm_target *ti;
-
-	struct dm_dev *cow;
+	struct dm_snapshot *snap;
 
 	/* Size of data blocks saved - must be a power of 2 */
 	unsigned chunk_size;
@@ -108,6 +108,11 @@ struct dm_exception_store {
 	void *context;
 };
 
+/*
+ * Obtain the cow device used by a given snapshot.
+ */
+struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
+
 /*
  * Funtions to manipulate consecutive chunks
  */
@@ -173,6 +178,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
 				      char **error);
 
 int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+			      struct dm_snapshot *snap,
 			      unsigned *args_used,
 			      struct dm_exception_store **store);
 void dm_exception_store_destroy(struct dm_exception_store *store);
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 767065f6c5f..157999ebd23 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -214,7 +214,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
 		    int metadata)
 {
 	struct dm_io_region where = {
-		.bdev = ps->store->cow->bdev,
+		.bdev = dm_snap_cow(ps->store->snap)->bdev,
 		.sector = ps->store->chunk_size * chunk,
 		.count = ps->store->chunk_size,
 	};
@@ -294,7 +294,8 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	 */
 	if (!ps->store->chunk_size) {
 		ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
-		    bdev_logical_block_size(ps->store->cow->bdev) >> 9);
+		    bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
+					    bdev) >> 9);
 		ps->store->chunk_mask = ps->store->chunk_size - 1;
 		ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
 		chunk_size_supplied = 0;
@@ -497,7 +498,7 @@ static void persistent_usage(struct dm_exception_store *store,
 	struct pstore *ps = get_info(store);
 
 	*sectors_allocated = ps->next_free * store->chunk_size;
-	*total_sectors = get_dev_size(store->cow->bdev);
+	*total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
 
 	/*
 	 * First chunk is the fixed header.
@@ -596,7 +597,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
 	struct pstore *ps = get_info(store);
 	uint32_t stride;
 	chunk_t next_free;
-	sector_t size = get_dev_size(store->cow->bdev);
+	sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
 
 	/* Is there enough room ? */
 	if (size < ((ps->next_free + 1) * store->chunk_size))
@@ -733,8 +734,7 @@ static unsigned persistent_status(struct dm_exception_store *store,
 	case STATUSTYPE_INFO:
 		break;
 	case STATUSTYPE_TABLE:
-		DMEMIT(" %s P %llu", store->cow->name,
-		       (unsigned long long)store->chunk_size);
+		DMEMIT(" P %llu", (unsigned long long)store->chunk_size);
 	}
 
 	return sz;
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 245a50c7337..a0898a66a2f 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -39,7 +39,7 @@ static int transient_prepare_exception(struct dm_exception_store *store,
 				       struct dm_exception *e)
 {
 	struct transient_c *tc = store->context;
-	sector_t size = get_dev_size(store->cow->bdev);
+	sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
 
 	if (size < (tc->next_free + store->chunk_size))
 		return -1;
@@ -65,7 +65,7 @@ static void transient_usage(struct dm_exception_store *store,
 			    sector_t *metadata_sectors)
 {
 	*sectors_allocated = ((struct transient_c *) store->context)->next_free;
-	*total_sectors = get_dev_size(store->cow->bdev);
+	*total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
 	*metadata_sectors = 0;
 }
 
@@ -94,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store,
 	case STATUSTYPE_INFO:
 		break;
 	case STATUSTYPE_TABLE:
-		DMEMIT(" %s N %llu", store->cow->name,
-		       (unsigned long long)store->chunk_size);
+		DMEMIT(" N %llu", (unsigned long long)store->chunk_size);
 	}
 
 	return sz;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 8bd77cbd7e4..dc500a6f623 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -59,6 +59,9 @@ struct dm_snapshot {
 	struct rw_semaphore lock;
 
 	struct dm_dev *origin;
+	struct dm_dev *cow;
+
+	struct dm_target *ti;
 
 	/* List of snapshots per Origin */
 	struct list_head list;
@@ -97,6 +100,12 @@ struct dm_snapshot {
 	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
 };
 
+struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
+{
+	return s->cow;
+}
+EXPORT_SYMBOL(dm_snap_cow);
+
 static struct workqueue_struct *ksnapd;
 static void flush_queued_bios(struct work_struct *work);
 
@@ -558,7 +567,7 @@ static int init_hash_tables(struct dm_snapshot *s)
 	 * Calculate based on the size of the original volume or
 	 * the COW volume...
 	 */
-	cow_dev_size = get_dev_size(s->store->cow->bdev);
+	cow_dev_size = get_dev_size(s->cow->bdev);
 	origin_dev_size = get_dev_size(s->origin->bdev);
 	max_buckets = calc_max_buckets();
 
@@ -596,45 +605,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	struct dm_snapshot *s;
 	int i;
 	int r = -EINVAL;
-	char *origin_path;
-	struct dm_exception_store *store;
+	char *origin_path, *cow_path;
 	unsigned args_used;
 
 	if (argc != 4) {
 		ti->error = "requires exactly 4 arguments";
 		r = -EINVAL;
-		goto bad_args;
+		goto bad;
 	}
 
 	origin_path = argv[0];
 	argv++;
 	argc--;
 
-	r = dm_exception_store_create(ti, argc, argv, &args_used, &store);
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		ti->error = "Cannot allocate snapshot context private "
+		    "structure";
+		r = -ENOMEM;
+		goto bad;
+	}
+
+	cow_path = argv[0];
+	argv++;
+	argc--;
+
+	r = dm_get_device(ti, cow_path, 0, 0,
+			  FMODE_READ | FMODE_WRITE, &s->cow);
+	if (r) {
+		ti->error = "Cannot get COW device";
+		goto bad_cow;
+	}
+
+	r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
 	if (r) {
 		ti->error = "Couldn't create exception store";
 		r = -EINVAL;
-		goto bad_args;
+		goto bad_store;
 	}
 
 	argv += args_used;
 	argc -= args_used;
 
-	s = kmalloc(sizeof(*s), GFP_KERNEL);
-	if (!s) {
-		ti->error = "Cannot allocate snapshot context private "
-		    "structure";
-		r = -ENOMEM;
-		goto bad_snap;
-	}
-
 	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
 	if (r) {
 		ti->error = "Cannot get origin device";
 		goto bad_origin;
 	}
 
-	s->store = store;
+	s->ti = ti;
 	s->valid = 1;
 	s->active = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
@@ -723,12 +742,15 @@ bad_hash_tables:
 	dm_put_device(ti, s->origin);
 
 bad_origin:
-	kfree(s);
+	dm_exception_store_destroy(s->store);
 
-bad_snap:
-	dm_exception_store_destroy(store);
+bad_store:
+	dm_put_device(ti, s->cow);
 
-bad_args:
+bad_cow:
+	kfree(s);
+
+bad:
 	return r;
 }
 
@@ -777,6 +799,8 @@ static void snapshot_dtr(struct dm_target *ti)
 
 	dm_exception_store_destroy(s->store);
 
+	dm_put_device(ti, s->cow);
+
 	kfree(s);
 }
 
@@ -839,7 +863,7 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
 
 	s->valid = 0;
 
-	dm_table_event(s->store->ti->table);
+	dm_table_event(s->ti->table);
 }
 
 static void get_pending_exception(struct dm_snap_pending_exception *pe)
@@ -977,7 +1001,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 	src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
 	src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
 
-	dest.bdev = s->store->cow->bdev;
+	dest.bdev = s->cow->bdev;
 	dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
 	dest.count = src.count;
 
@@ -1038,7 +1062,7 @@ __find_pending_exception(struct dm_snapshot *s,
 static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
 			    struct bio *bio, chunk_t chunk)
 {
-	bio->bi_bdev = s->store->cow->bdev;
+	bio->bi_bdev = s->cow->bdev;
 	bio->bi_sector = chunk_to_sector(s->store,
 					 dm_chunk_number(e->new_chunk) +
 					 (chunk - e->old_chunk)) +
@@ -1056,7 +1080,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	struct dm_snap_pending_exception *pe = NULL;
 
 	if (unlikely(bio_empty_barrier(bio))) {
-		bio->bi_bdev = s->store->cow->bdev;
+		bio->bi_bdev = s->cow->bdev;
 		return DM_MAPIO_REMAPPED;
 	}
 
@@ -1200,7 +1224,7 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 		 * to make private copies if the output is to
 		 * make sense.
 		 */
-		DMEMIT("%s", snap->origin->name);
+		DMEMIT("%s %s", snap->origin->name, snap->cow->name);
 		snap->store->type->status(snap->store, type, result + sz,
 					  maxlen - sz);
 		break;
@@ -1240,7 +1264,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
 			goto next_snapshot;
 
 		/* Nothing to do if writing beyond end of snapshot */
-		if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table))
+		if (bio->bi_sector >= dm_table_get_size(snap->ti->table))
 			goto next_snapshot;
 
 		/*
-- 
cgit v1.2.3


From c26655ca3ca7550740a63820ee981e5c7c797523 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 10 Dec 2009 23:52:12 +0000
Subject: dm snapshot: track suspended state in target

Keep track of whether or not the device is suspended within the snapshot
target module, the same as we do in dm-raid1.

We will use this later to enforce the correct sequence of ioctls to
transfer the in-core exceptions from a snapshot target instance in
one table to a replacement one capable of merging them back
into the origin.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index dc500a6f623..fd04caa9034 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -72,6 +72,9 @@ struct dm_snapshot {
 	/* Origin writes don't trigger exceptions until this is set */
 	int active;
 
+	/* Whether or not owning mapped_device is suspended */
+	int suspended;
+
 	mempool_t *pending_pool;
 
 	atomic_t pending_exceptions_count;
@@ -656,6 +659,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	s->ti = ti;
 	s->valid = 1;
 	s->active = 0;
+	s->suspended = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
 	init_rwsem(&s->lock);
 	spin_lock_init(&s->pe_lock);
@@ -1175,12 +1179,22 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
 	return 0;
 }
 
+static void snapshot_postsuspend(struct dm_target *ti)
+{
+	struct dm_snapshot *s = ti->private;
+
+	down_write(&s->lock);
+	s->suspended = 1;
+	up_write(&s->lock);
+}
+
 static void snapshot_resume(struct dm_target *ti)
 {
 	struct dm_snapshot *s = ti->private;
 
 	down_write(&s->lock);
 	s->active = 1;
+	s->suspended = 0;
 	up_write(&s->lock);
 }
 
@@ -1489,12 +1503,13 @@ static struct target_type origin_target = {
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 8, 0},
+	.version = {1, 9, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
 	.map     = snapshot_map,
 	.end_io  = snapshot_end_io,
+	.postsuspend = snapshot_postsuspend,
 	.resume  = snapshot_resume,
 	.status  = snapshot_status,
 	.iterate_devices = snapshot_iterate_devices,
-- 
cgit v1.2.3


From 9ca170a3c0cbb0d5251cef6f5a3300fa436ba8ec Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:13 +0000
Subject: dm kcopyd: accept zero size jobs

dm-kcopyd: accept zero-size jobs

This patch changes dm-kcopyd so that it accepts zero-size jobs and completes
them immediatelly via its completion thread.

It is needed for multisnapshots snapshot resizing. When we are writing to
a chunk beyond origin end, no copying is done. To simplify the code, we submit
an empty request to kcopyd and let kcopyd complete it. If we didn't submit
a request to kcopyd and called the completion routine immediatelly, it would
violate the principle that completion is called only from one thread and
it would need additional locking.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 3e3fc06cb86..addf8347504 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job)
 {
 	struct dm_kcopyd_client *kc = job->kc;
 	atomic_inc(&kc->nr_jobs);
-	push(&kc->pages_jobs, job);
+	if (unlikely(!job->source.count))
+		push(&kc->complete_jobs, job);
+	else
+		push(&kc->pages_jobs, job);
 	wake(kc);
 }
 
-- 
cgit v1.2.3


From 90abb8c4cec8f0aa4ce58790542e3cf13071601a Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:13 +0000
Subject: dm: abstract dm_in_flight function

This patch adds md_in_flight() to get the number of in_flight I/Os.
No functional change.

This patch is a preparation for a later patch in this series, which
changes I/O counter to md->pending from q->in_flight in request-based dm.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 473f0c3c019..73b89afd656 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -452,6 +452,12 @@ static void free_bio_info(struct dm_rq_clone_bio_info *info)
 	mempool_free(info, info->tio->md->io_pool);
 }
 
+static int md_in_flight(struct mapped_device *md)
+{
+	return atomic_read(&md->pending[READ]) +
+	       atomic_read(&md->pending[WRITE]);
+}
+
 static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
@@ -2100,8 +2106,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 				break;
 			}
 			spin_unlock_irqrestore(q->queue_lock, flags);
-		} else if (!atomic_read(&md->pending[0]) &&
-					!atomic_read(&md->pending[1]))
+		} else if (!md_in_flight(md))
 			break;
 
 		if (interruptible == TASK_INTERRUPTIBLE &&
-- 
cgit v1.2.3


From 598de40947909e6b948569710383661ecc0ddc8e Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:14 +0000
Subject: dm: use clone in map_request function

This patch changes the argument of map_request() to clone request
from original request.  No functional change.

This patch is a preparation for PATCH 9, which needs to use
map_request() for clones sharing an original barrier request.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 73b89afd656..cf0b455b21e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1495,11 +1495,10 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 	return BLKPREP_OK;
 }
 
-static void map_request(struct dm_target *ti, struct request *rq,
+static void map_request(struct dm_target *ti, struct request *clone,
 			struct mapped_device *md)
 {
 	int r;
-	struct request *clone = rq->special;
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
 	/*
@@ -1576,7 +1575,7 @@ static void dm_request_fn(struct request_queue *q)
 
 		blk_start_request(rq);
 		spin_unlock(q->queue_lock);
-		map_request(ti, rq, md);
+		map_request(ti, rq->special, md);
 		spin_lock_irq(q->queue_lock);
 	}
 
-- 
cgit v1.2.3


From 0888564393a1277ce2d0564d819e1bcff1120343 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:15 +0000
Subject: dm: pass gfp_mask to alloc_rq_tio

This patch adds the gfp_mask argument to alloc_rq_tio().
No functional change.

This patch is a preparation for a later patch in this series which needs to
allocate tio (for barrier I/O) with different allocation flag (GFP_NOIO) from
the one in the normal I/O code path.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index cf0b455b21e..a42dfb7a718 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -432,9 +432,10 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 	mempool_free(tio, md->tio_pool);
 }
 
-static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
+static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
+					    gfp_t gfp_mask)
 {
-	return mempool_alloc(md->tio_pool, GFP_ATOMIC);
+	return mempool_alloc(md->tio_pool, gfp_mask);
 }
 
 static void free_rq_tio(struct dm_rq_target_io *tio)
@@ -1471,7 +1472,7 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 		return BLKPREP_KILL;
 	}
 
-	tio = alloc_rq_tio(md); /* Only one for each original request */
+	tio = alloc_rq_tio(md, GFP_ATOMIC);
 	if (!tio)
 		/* -ENOMEM */
 		return BLKPREP_DEFER;
-- 
cgit v1.2.3


From 6facdaff229f2b25d0de82be9be99b9f562e72ba Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:15 +0000
Subject: dm: abstract clone_rq

This patch factors out the request cloning code in dm_prep_fn()
as clone_rq().  No functional change.

This patch is a preparation for a later patch in this series which needs to
make clones from an original barrier request.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 45 ++++++++++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a42dfb7a718..30f5dc8e52b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1445,6 +1445,32 @@ static int setup_clone(struct request *clone, struct request *rq,
 	return 0;
 }
 
+static struct request *clone_rq(struct request *rq, struct mapped_device *md,
+				gfp_t gfp_mask)
+{
+	struct request *clone;
+	struct dm_rq_target_io *tio;
+
+	tio = alloc_rq_tio(md, gfp_mask);
+	if (!tio)
+		return NULL;
+
+	tio->md = md;
+	tio->ti = NULL;
+	tio->orig = rq;
+	tio->error = 0;
+	memset(&tio->info, 0, sizeof(tio->info));
+
+	clone = &tio->clone;
+	if (setup_clone(clone, rq, tio)) {
+		/* -ENOMEM */
+		free_rq_tio(tio);
+		return NULL;
+	}
+
+	return clone;
+}
+
 static int dm_rq_flush_suspending(struct mapped_device *md)
 {
 	return !md->suspend_rq.special;
@@ -1456,7 +1482,6 @@ static int dm_rq_flush_suspending(struct mapped_device *md)
 static int dm_prep_fn(struct request_queue *q, struct request *rq)
 {
 	struct mapped_device *md = q->queuedata;
-	struct dm_rq_target_io *tio;
 	struct request *clone;
 
 	if (unlikely(rq == &md->suspend_rq)) {
@@ -1472,24 +1497,10 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 		return BLKPREP_KILL;
 	}
 
-	tio = alloc_rq_tio(md, GFP_ATOMIC);
-	if (!tio)
-		/* -ENOMEM */
+	clone = clone_rq(rq, md, GFP_ATOMIC);
+	if (!clone)
 		return BLKPREP_DEFER;
 
-	tio->md = md;
-	tio->ti = NULL;
-	tio->orig = rq;
-	tio->error = 0;
-	memset(&tio->info, 0, sizeof(tio->info));
-
-	clone = &tio->clone;
-	if (setup_clone(clone, rq, tio)) {
-		/* -ENOMEM */
-		free_rq_tio(tio);
-		return BLKPREP_DEFER;
-	}
-
 	rq->special = clone;
 	rq->cmd_flags |= REQ_DONTPREP;
 
-- 
cgit v1.2.3


From 9f518b27cf682dd5155a4c1679d52cd4b5be82f2 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:16 +0000
Subject: dm: simplify request based suspend

The semantics of bio-based dm were changed recently in the case of
suspend with "--nolockfs" but without "--noflush".
Before 2.6.30, I/Os submitted before the suspend invocation were always
flushed.  From 2.6.30 onwards, I/Os submitted before the suspend
invocation might not be flushed.  (For details, see
http://marc.info/?t=123994433400003&r=1&w=2)

This patch brings the behaviour of request-based dm into line with
bio-based dm, simplifying the code and preparing for a subsequent patch
that will wait for all in_flight I/Os to complete without stopping
request_queue and use dm_wait_for_completion() for it.

This change in semantics simplifies the suspend code as follows:
  o Suspend is implemented as stopping request_queue
    in request-based dm, and all I/Os are queued in the request_queue
    even after suspend is invoked.
  o In the old semantics, we had to track whether I/Os were
    queued before or after the suspend invocation, so a special
    barrier-like request called 'suspend marker' was introduced.
  o With the new semantics, we don't need to flush any I/O
    so we can remove the marker and the code related to the marker
    handling and I/O flushing.

After removing this codes, the suspend sequence is now:
  1. Flush all I/Os by lock_fs() if needed.
  2. Stop dispatching any I/O by stopping the request_queue.
  3. Wait for all in-flight I/Os to be completed or requeued.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 158 +++++---------------------------------------------------
 1 file changed, 14 insertions(+), 144 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 30f5dc8e52b..634b1daab2d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -178,9 +178,6 @@ struct mapped_device {
 	/* forced geometry settings */
 	struct hd_geometry geometry;
 
-	/* marker of flush suspend for request-based dm */
-	struct request suspend_rq;
-
 	/* For saving the address of __make_request for request based dm */
 	make_request_fn *saved_make_request_fn;
 
@@ -1471,11 +1468,6 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 	return clone;
 }
 
-static int dm_rq_flush_suspending(struct mapped_device *md)
-{
-	return !md->suspend_rq.special;
-}
-
 /*
  * Called with the queue lock held.
  */
@@ -1484,14 +1476,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 	struct mapped_device *md = q->queuedata;
 	struct request *clone;
 
-	if (unlikely(rq == &md->suspend_rq)) {
-		if (dm_rq_flush_suspending(md))
-			return BLKPREP_OK;
-		else
-			/* The flush suspend was interrupted */
-			return BLKPREP_KILL;
-	}
-
 	if (unlikely(rq->special)) {
 		DMWARN("Already has something in rq->special.");
 		return BLKPREP_KILL;
@@ -1560,27 +1544,15 @@ static void dm_request_fn(struct request_queue *q)
 	struct request *rq;
 
 	/*
-	 * For noflush suspend, check blk_queue_stopped() to immediately
-	 * quit I/O dispatching.
+	 * For suspend, check blk_queue_stopped() and don't increment
+	 * the number of in-flight I/Os after the queue is stopped
+	 * in dm_suspend().
 	 */
 	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
 		rq = blk_peek_request(q);
 		if (!rq)
 			goto plug_and_out;
 
-		if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */
-			if (queue_in_flight(q))
-				/* Not quiet yet.  Wait more */
-				goto plug_and_out;
-
-			/* This device should be quiet now */
-			__stop_queue(q);
-			blk_start_request(rq);
-			__blk_end_request_all(rq, 0);
-			wake_up(&md->wait);
-			goto out;
-		}
-
 		ti = dm_table_find_target(map, blk_rq_pos(rq));
 		if (ti->type->busy && ti->type->busy(ti))
 			goto plug_and_out;
@@ -2112,7 +2084,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 		smp_mb();
 		if (dm_request_based(md)) {
 			spin_lock_irqsave(q->queue_lock, flags);
-			if (!queue_in_flight(q) && blk_queue_stopped(q)) {
+			if (!queue_in_flight(q)) {
 				spin_unlock_irqrestore(q->queue_lock, flags);
 				break;
 			}
@@ -2245,67 +2217,6 @@ out:
 	return r;
 }
 
-static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
-{
-	md->suspend_rq.special = (void *)0x1;
-}
-
-static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
-{
-	struct request_queue *q = md->queue;
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (!noflush)
-		dm_rq_invalidate_suspend_marker(md);
-	__start_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
-{
-	struct request *rq = &md->suspend_rq;
-	struct request_queue *q = md->queue;
-
-	if (noflush)
-		stop_queue(q);
-	else {
-		blk_rq_init(q, rq);
-		blk_insert_request(q, rq, 0, NULL);
-	}
-}
-
-static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
-{
-	int r = 1;
-	struct request *rq = &md->suspend_rq;
-	struct request_queue *q = md->queue;
-	unsigned long flags;
-
-	if (noflush)
-		return r;
-
-	/* The marker must be protected by queue lock if it is in use */
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (unlikely(rq->ref_count)) {
-		/*
-		 * This can happen, when the previous flush suspend was
-		 * interrupted, the marker is still in the queue and
-		 * this flush suspend has been invoked, because we don't
-		 * remove the marker at the time of suspend interruption.
-		 * We have only one marker per mapped_device, so we can't
-		 * start another flush suspend while it is in use.
-		 */
-		BUG_ON(!rq->special); /* The marker should be invalidated */
-		DMWARN("Invalidating the previous flush suspend is still in"
-		       " progress.  Please retry later.");
-		r = 0;
-	}
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	return r;
-}
-
 /*
  * Functions to lock and unlock any filesystem running on the
  * device.
@@ -2348,49 +2259,11 @@ static void unlock_fs(struct mapped_device *md)
 /*
  * Suspend mechanism in request-based dm.
  *
- * After the suspend starts, further incoming requests are kept in
- * the request_queue and deferred.
- * Remaining requests in the request_queue at the start of suspend are flushed
- * if it is flush suspend.
- * The suspend completes when the following conditions have been satisfied,
- * so wait for it:
- *    1. q->in_flight is 0 (which means no in_flight request)
- *    2. queue has been stopped (which means no request dispatching)
- *
- *
- * Noflush suspend
- * ---------------
- * Noflush suspend doesn't need to dispatch remaining requests.
- * So stop the queue immediately.  Then, wait for all in_flight requests
- * to be completed or requeued.
- *
- * To abort noflush suspend, start the queue.
+ * 1. Flush all I/Os by lock_fs() if needed.
+ * 2. Stop dispatching any I/O by stopping the request_queue.
+ * 3. Wait for all in-flight I/Os to be completed or requeued.
  *
- *
- * Flush suspend
- * -------------
- * Flush suspend needs to dispatch remaining requests.  So stop the queue
- * after the remaining requests are completed. (Requeued request must be also
- * re-dispatched and completed.  Until then, we can't stop the queue.)
- *
- * During flushing the remaining requests, further incoming requests are also
- * inserted to the same queue.  To distinguish which requests are to be
- * flushed, we insert a marker request to the queue at the time of starting
- * flush suspend, like a barrier.
- * The dispatching is blocked when the marker is found on the top of the queue.
- * And the queue is stopped when all in_flight requests are completed, since
- * that means the remaining requests are completely flushed.
- * Then, the marker is removed from the queue.
- *
- * To abort flush suspend, we also need to take care of the marker, not only
- * starting the queue.
- * We don't remove the marker forcibly from the queue since it's against
- * the block-layer manner.  Instead, we put a invalidated mark on the marker.
- * When the invalidated marker is found on the top of the queue, it is
- * immediately removed from the queue, so it doesn't block dispatching.
- * Because we have only one marker per mapped_device, we can't start another
- * flush suspend until the invalidated marker is removed from the queue.
- * So fail and return with -EBUSY in such a case.
+ * To abort suspend, start the request_queue.
  */
 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
@@ -2406,11 +2279,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		goto out_unlock;
 	}
 
-	if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
-		r = -EBUSY;
-		goto out_unlock;
-	}
-
 	map = dm_get_table(md);
 
 	/*
@@ -2424,8 +2292,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	dm_table_presuspend_targets(map);
 
 	/*
-	 * Flush I/O to the device. noflush supersedes do_lockfs,
-	 * because lock_fs() needs to flush I/Os.
+	 * Flush I/O to the device.
+	 * Any I/O submitted after lock_fs() may not be flushed.
+	 * noflush takes precedence over do_lockfs.
+	 * (lock_fs() flushes I/Os and waits for them to complete.)
 	 */
 	if (!noflush && do_lockfs) {
 		r = lock_fs(md);
@@ -2457,7 +2327,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	flush_workqueue(md->wq);
 
 	if (dm_request_based(md))
-		dm_rq_start_suspend(md, noflush);
+		stop_queue(md->queue);
 
 	/*
 	 * At this point no more requests are entering target request routines.
@@ -2476,7 +2346,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		dm_queue_flush(md);
 
 		if (dm_request_based(md))
-			dm_rq_abort_suspend(md, noflush);
+			start_queue(md->queue);
 
 		unlock_fs(md);
 		goto out; /* pushback list is already flushed, so skip flush */
-- 
cgit v1.2.3


From b4324feeae304ae39e631a254d238a7d63be004b Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:16 +0000
Subject: dm: use md pending for in flight IO counting

This patch changes the counter for the number of in_flight I/Os
to md->pending from q->in_flight in preparation for a later patch.
No functional change.

Request-based dm used q->in_flight to count the number of in-flight
clones assuming the counter is always incremented for an in-flight
original request and original:clone is 1:1 relationship.
However, it this no longer true for barrier requests.
So use md->pending to count the number of in-flight clones.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 46 ++++++++++++++++++----------------------------
 1 file changed, 18 insertions(+), 28 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 634b1daab2d..01d741a0c07 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -727,23 +727,16 @@ static void end_clone_bio(struct bio *clone, int error)
  * the md may be freed in dm_put() at the end of this function.
  * Or do dm_get() before calling this function and dm_put() later.
  */
-static void rq_completed(struct mapped_device *md, int run_queue)
+static void rq_completed(struct mapped_device *md, int rw, int run_queue)
 {
-	int wakeup_waiters = 0;
-	struct request_queue *q = md->queue;
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (!queue_in_flight(q))
-		wakeup_waiters = 1;
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	atomic_dec(&md->pending[rw]);
 
 	/* nudge anyone waiting on suspend queue */
-	if (wakeup_waiters)
+	if (!md_in_flight(md))
 		wake_up(&md->wait);
 
 	if (run_queue)
-		blk_run_queue(q);
+		blk_run_queue(md->queue);
 
 	/*
 	 * dm_put() must be at the end of this function. See the comment above
@@ -774,6 +767,7 @@ static void dm_unprep_request(struct request *rq)
  */
 void dm_requeue_unmapped_request(struct request *clone)
 {
+	int rw = rq_data_dir(clone);
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
@@ -788,7 +782,7 @@ void dm_requeue_unmapped_request(struct request *clone)
 	blk_requeue_request(q, rq);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
-	rq_completed(md, 0);
+	rq_completed(md, rw, 0);
 }
 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
 
@@ -827,6 +821,7 @@ static void start_queue(struct request_queue *q)
  */
 static void dm_end_request(struct request *clone, int error)
 {
+	int rw = rq_data_dir(clone);
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
@@ -848,7 +843,7 @@ static void dm_end_request(struct request *clone, int error)
 
 	blk_end_request_all(rq, error);
 
-	rq_completed(md, 1);
+	rq_completed(md, rw, 1);
 }
 
 /*
@@ -1541,12 +1536,13 @@ static void dm_request_fn(struct request_queue *q)
 	struct mapped_device *md = q->queuedata;
 	struct dm_table *map = dm_get_table(md);
 	struct dm_target *ti;
-	struct request *rq;
+	struct request *rq, *clone;
 
 	/*
-	 * For suspend, check blk_queue_stopped() and don't increment
-	 * the number of in-flight I/Os after the queue is stopped
-	 * in dm_suspend().
+	 * For suspend, check blk_queue_stopped() and increment
+	 * ->pending within a single queue_lock not to increment the
+	 * number of in-flight I/Os after the queue is stopped in
+	 * dm_suspend().
 	 */
 	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
 		rq = blk_peek_request(q);
@@ -1558,8 +1554,11 @@ static void dm_request_fn(struct request_queue *q)
 			goto plug_and_out;
 
 		blk_start_request(rq);
+		clone = rq->special;
+		atomic_inc(&md->pending[rq_data_dir(clone)]);
+
 		spin_unlock(q->queue_lock);
-		map_request(ti, rq->special, md);
+		map_request(ti, clone, md);
 		spin_lock_irq(q->queue_lock);
 	}
 
@@ -2071,8 +2070,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 {
 	int r = 0;
 	DECLARE_WAITQUEUE(wait, current);
-	struct request_queue *q = md->queue;
-	unsigned long flags;
 
 	dm_unplug_all(md->queue);
 
@@ -2082,14 +2079,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 		set_current_state(interruptible);
 
 		smp_mb();
-		if (dm_request_based(md)) {
-			spin_lock_irqsave(q->queue_lock, flags);
-			if (!queue_in_flight(q)) {
-				spin_unlock_irqrestore(q->queue_lock, flags);
-				break;
-			}
-			spin_unlock_irqrestore(q->queue_lock, flags);
-		} else if (!md_in_flight(md))
+		if (!md_in_flight(md))
 			break;
 
 		if (interruptible == TASK_INTERRUPTIBLE &&
-- 
cgit v1.2.3


From 11a68244e16b0c35e122dd55b4e7c595e0fb67a1 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:17 +0000
Subject: dm: refactor request based completion functions

This patch factors out the clone completion code, dm_done(),
from dm_softirq_done() in preparation for a subsequent patch.
No functional change.

dm_done() will be used in barrier completion, which can't use and
doesn't need softirq.  The softirq_done callback needs to get a clone
from an original request but it can't in the case of barrier, where
an original request is shared by multiple clones.  On the other hand,
the completion of barrier clones doesn't involve re-submitting requests,
which was the primary reason of the need for softirq.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 01d741a0c07..c65be45a4c4 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -846,34 +846,45 @@ static void dm_end_request(struct request *clone, int error)
 	rq_completed(md, rw, 1);
 }
 
-/*
- * Request completion handler for request-based dm
- */
-static void dm_softirq_done(struct request *rq)
+static void dm_done(struct request *clone, int error, bool mapped)
 {
-	struct request *clone = rq->completion_data;
+	int r = error;
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
-	int error = tio->error;
 
-	if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
-		error = rq_end_io(tio->ti, clone, error, &tio->info);
+	if (mapped && rq_end_io)
+		r = rq_end_io(tio->ti, clone, error, &tio->info);
 
-	if (error <= 0)
+	if (r <= 0)
 		/* The target wants to complete the I/O */
-		dm_end_request(clone, error);
-	else if (error == DM_ENDIO_INCOMPLETE)
+		dm_end_request(clone, r);
+	else if (r == DM_ENDIO_INCOMPLETE)
 		/* The target will handle the I/O */
 		return;
-	else if (error == DM_ENDIO_REQUEUE)
+	else if (r == DM_ENDIO_REQUEUE)
 		/* The target wants to requeue the I/O */
 		dm_requeue_unmapped_request(clone);
 	else {
-		DMWARN("unimplemented target endio return value: %d", error);
+		DMWARN("unimplemented target endio return value: %d", r);
 		BUG();
 	}
 }
 
+/*
+ * Request completion handler for request-based dm
+ */
+static void dm_softirq_done(struct request *rq)
+{
+	bool mapped = true;
+	struct request *clone = rq->completion_data;
+	struct dm_rq_target_io *tio = clone->end_io_data;
+
+	if (rq->cmd_flags & REQ_FAILED)
+		mapped = false;
+
+	dm_done(clone, tio->error, mapped);
+}
+
 /*
  * Complete the clone and the original request with the error status
  * through softirq context.
-- 
cgit v1.2.3


From 980691e5f3a1b5ebbb2d34014e028fd7f1c6e4fb Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:17 +0000
Subject: dm: move dm_end_request

This patch moves dm_end_request() to make the next patch more readable.
No functional change.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 62 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c65be45a4c4..821a5dd6a8d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -752,6 +752,37 @@ static void free_rq_clone(struct request *clone)
 	free_rq_tio(tio);
 }
 
+/*
+ * Complete the clone and the original request.
+ * Must be called without queue lock.
+ */
+static void dm_end_request(struct request *clone, int error)
+{
+	int rw = rq_data_dir(clone);
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
+	struct request *rq = tio->orig;
+
+	if (blk_pc_request(rq)) {
+		rq->errors = clone->errors;
+		rq->resid_len = clone->resid_len;
+
+		if (rq->sense)
+			/*
+			 * We are using the sense buffer of the original
+			 * request.
+			 * So setting the length of the sense data is enough.
+			 */
+			rq->sense_len = clone->sense_len;
+	}
+
+	free_rq_clone(clone);
+
+	blk_end_request_all(rq, error);
+
+	rq_completed(md, rw, 1);
+}
+
 static void dm_unprep_request(struct request *rq)
 {
 	struct request *clone = rq->special;
@@ -815,37 +846,6 @@ static void start_queue(struct request_queue *q)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-/*
- * Complete the clone and the original request.
- * Must be called without queue lock.
- */
-static void dm_end_request(struct request *clone, int error)
-{
-	int rw = rq_data_dir(clone);
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	struct mapped_device *md = tio->md;
-	struct request *rq = tio->orig;
-
-	if (blk_pc_request(rq)) {
-		rq->errors = clone->errors;
-		rq->resid_len = clone->resid_len;
-
-		if (rq->sense)
-			/*
-			 * We are using the sense buffer of the original
-			 * request.
-			 * So setting the length of the sense data is enough.
-			 */
-			rq->sense_len = clone->sense_len;
-	}
-
-	free_rq_clone(clone);
-
-	blk_end_request_all(rq, error);
-
-	rq_completed(md, rw, 1);
-}
-
 static void dm_done(struct request *clone, int error, bool mapped)
 {
 	int r = error;
-- 
cgit v1.2.3


From d0bcb8786532b01206f04258eb6b7d4ac858436a Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:18 +0000
Subject: dm: add request based barrier support

This patch adds barrier support for request-based dm.

CORE DESIGN

The design is basically same as bio-based dm, which emulates barrier
by mapping empty barrier bios before/after a barrier I/O.
But request-based dm has been using struct request_queue for I/O
queueing, so the block-layer's barrier mechanism can be used.

o Summary of the block-layer's behavior (which is depended by dm-core)
  Request-based dm uses QUEUE_ORDERED_DRAIN_FLUSH ordered mode for
  I/O barrier.  It means that when an I/O requiring barrier is found
  in the request_queue, the block-layer makes pre-flush request and
  post-flush request just before and just after the I/O respectively.

  After the ordered sequence starts, the block-layer waits for all
  in-flight I/Os to complete, then gives drivers the pre-flush request,
  the barrier I/O and the post-flush request one by one.
  It means that the request_queue is stopped automatically by
  the block-layer until drivers complete each sequence.

o dm-core
  For the barrier I/O, treats it as a normal I/O, so no additional
  code is needed.

  For the pre/post-flush request, flushes caches by the followings:
    1. Make the number of empty barrier requests required by target's
       num_flush_requests, and map them (dm_rq_barrier()).
    2. Waits for the mapped barriers to complete (dm_rq_barrier()).
       If error has occurred, save the error value to md->barrier_error
       (dm_end_request()).
       (*) Basically, the first reported error is taken.
           But -EOPNOTSUPP supersedes any error and DM_ENDIO_REQUEUE
           follows.
    3. Requeue the pre/post-flush request if the error value is
       DM_ENDIO_REQUEUE.  Otherwise, completes with the error value
       (dm_rq_barrier_work()).
  The pre/post-flush work above is done in the kernel thread (kdmflush)
  context, since memory allocation which might sleep is needed in
  dm_rq_barrier() but sleep is not allowed in dm_request_fn(), which is
  an irq-disabled context.
  Also, clones of the pre/post-flush request share an original, so
  such clones can't be completed using the softirq context.
  Instead, complete them in the context of underlying device drivers.
  It should be safe since there is no I/O dispatching during
  the completion of such clones.

  For suspend, the workqueue of kdmflush needs to be flushed after
  the request_queue has been stopped.  Otherwise, the next flush work
  can be kicked even after the suspend completes.

TARGET INTERFACE

No new interface is added.
Just use the existing num_flush_requests in struct target_type
as same as bio-based dm.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 196 insertions(+), 18 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 821a5dd6a8d..3de8d6d5b0b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -142,10 +142,20 @@ struct mapped_device {
 	 */
 	int barrier_error;
 
+	/*
+	 * Protect barrier_error from concurrent endio processing
+	 * in request-based dm.
+	 */
+	spinlock_t barrier_error_lock;
+
 	/*
 	 * Processing queue (flush/barriers)
 	 */
 	struct workqueue_struct *wq;
+	struct work_struct barrier_work;
+
+	/* A pointer to the currently processing pre/post flush request */
+	struct request *flush_request;
 
 	/*
 	 * The current mapping.
@@ -722,6 +732,23 @@ static void end_clone_bio(struct bio *clone, int error)
 	blk_update_request(tio->orig, 0, nr_bytes);
 }
 
+static void store_barrier_error(struct mapped_device *md, int error)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&md->barrier_error_lock, flags);
+	/*
+	 * Basically, the first error is taken, but:
+	 *   -EOPNOTSUPP supersedes any I/O error.
+	 *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
+	 */
+	if (!md->barrier_error || error == -EOPNOTSUPP ||
+	    (md->barrier_error != -EOPNOTSUPP &&
+	     error == DM_ENDIO_REQUEUE))
+		md->barrier_error = error;
+	spin_unlock_irqrestore(&md->barrier_error_lock, flags);
+}
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -759,11 +786,13 @@ static void free_rq_clone(struct request *clone)
 static void dm_end_request(struct request *clone, int error)
 {
 	int rw = rq_data_dir(clone);
+	int run_queue = 1;
+	bool is_barrier = blk_barrier_rq(clone);
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 
-	if (blk_pc_request(rq)) {
+	if (blk_pc_request(rq) && !is_barrier) {
 		rq->errors = clone->errors;
 		rq->resid_len = clone->resid_len;
 
@@ -778,9 +807,14 @@ static void dm_end_request(struct request *clone, int error)
 
 	free_rq_clone(clone);
 
-	blk_end_request_all(rq, error);
+	if (unlikely(is_barrier)) {
+		if (unlikely(error))
+			store_barrier_error(md, error);
+		run_queue = 0;
+	} else
+		blk_end_request_all(rq, error);
 
-	rq_completed(md, rw, 1);
+	rq_completed(md, rw, run_queue);
 }
 
 static void dm_unprep_request(struct request *rq)
@@ -805,6 +839,16 @@ void dm_requeue_unmapped_request(struct request *clone)
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
+	if (unlikely(blk_barrier_rq(clone))) {
+		/*
+		 * Barrier clones share an original request.
+		 * Leave it to dm_end_request(), which handles this special
+		 * case.
+		 */
+		dm_end_request(clone, DM_ENDIO_REQUEUE);
+		return;
+	}
+
 	dm_unprep_request(rq);
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -894,6 +938,19 @@ static void dm_complete_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
+	if (unlikely(blk_barrier_rq(clone))) {
+		/*
+		 * Barrier clones share an original request.  So can't use
+		 * softirq_done with the original.
+		 * Pass the clone to dm_done() directly in this special case.
+		 * It is safe (even if clone->q->queue_lock is held here)
+		 * because there is no I/O dispatching during the completion
+		 * of barrier clone.
+		 */
+		dm_done(clone, error, true);
+		return;
+	}
+
 	tio->error = error;
 	rq->completion_data = clone;
 	blk_complete_request(rq);
@@ -910,6 +967,17 @@ void dm_kill_unmapped_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
+	if (unlikely(blk_barrier_rq(clone))) {
+		/*
+		 * Barrier clones share an original request.
+		 * Leave it to dm_end_request(), which handles this special
+		 * case.
+		 */
+		BUG_ON(error > 0);
+		dm_end_request(clone, error);
+		return;
+	}
+
 	rq->cmd_flags |= REQ_FAILED;
 	dm_complete_request(clone, error);
 }
@@ -1364,11 +1432,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct mapped_device *md = q->queuedata;
 
-	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
-	}
-
 	return md->saved_make_request_fn(q, bio); /* call __make_request() */
 }
 
@@ -1387,6 +1450,25 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	return _dm_request(q, bio);
 }
 
+/*
+ * Mark this request as flush request, so that dm_request_fn() can
+ * recognize.
+ */
+static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq)
+{
+	rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
+	rq->cmd[0] = REQ_LB_OP_FLUSH;
+}
+
+static bool dm_rq_is_flush_request(struct request *rq)
+{
+	if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+	    rq->cmd[0] == REQ_LB_OP_FLUSH)
+		return true;
+	else
+		return false;
+}
+
 void dm_dispatch_request(struct request *rq)
 {
 	int r;
@@ -1432,16 +1514,24 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
 static int setup_clone(struct request *clone, struct request *rq,
 		       struct dm_rq_target_io *tio)
 {
-	int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
-				  dm_rq_bio_constructor, tio);
+	int r;
 
-	if (r)
-		return r;
+	if (dm_rq_is_flush_request(rq)) {
+		blk_rq_init(NULL, clone);
+		clone->cmd_type = REQ_TYPE_FS;
+		clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
+	} else {
+		r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+				      dm_rq_bio_constructor, tio);
+		if (r)
+			return r;
+
+		clone->cmd = rq->cmd;
+		clone->cmd_len = rq->cmd_len;
+		clone->sense = rq->sense;
+		clone->buffer = rq->buffer;
+	}
 
-	clone->cmd = rq->cmd;
-	clone->cmd_len = rq->cmd_len;
-	clone->sense = rq->sense;
-	clone->buffer = rq->buffer;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
 
@@ -1482,6 +1572,9 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 	struct mapped_device *md = q->queuedata;
 	struct request *clone;
 
+	if (unlikely(dm_rq_is_flush_request(rq)))
+		return BLKPREP_OK;
+
 	if (unlikely(rq->special)) {
 		DMWARN("Already has something in rq->special.");
 		return BLKPREP_KILL;
@@ -1560,6 +1653,14 @@ static void dm_request_fn(struct request_queue *q)
 		if (!rq)
 			goto plug_and_out;
 
+		if (unlikely(dm_rq_is_flush_request(rq))) {
+			BUG_ON(md->flush_request);
+			md->flush_request = rq;
+			blk_start_request(rq);
+			queue_work(md->wq, &md->barrier_work);
+			goto out;
+		}
+
 		ti = dm_table_find_target(map, blk_rq_pos(rq));
 		if (ti->type->busy && ti->type->busy(ti))
 			goto plug_and_out;
@@ -1726,6 +1827,7 @@ out:
 static const struct block_device_operations dm_blk_dops;
 
 static void dm_wq_work(struct work_struct *work);
+static void dm_rq_barrier_work(struct work_struct *work);
 
 /*
  * Allocate and initialise a blank device with a given minor.
@@ -1755,6 +1857,7 @@ static struct mapped_device *alloc_dev(int minor)
 	init_rwsem(&md->io_lock);
 	mutex_init(&md->suspend_lock);
 	spin_lock_init(&md->deferred_lock);
+	spin_lock_init(&md->barrier_error_lock);
 	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
@@ -1789,6 +1892,8 @@ static struct mapped_device *alloc_dev(int minor)
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
+	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
+			  dm_rq_prepare_flush);
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
@@ -1798,6 +1903,7 @@ static struct mapped_device *alloc_dev(int minor)
 	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
+	INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
 	init_waitqueue_head(&md->eventq);
 
 	md->disk->major = _major;
@@ -2185,6 +2291,73 @@ static void dm_queue_flush(struct mapped_device *md)
 	queue_work(md->wq, &md->work);
 }
 
+static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+
+	tio->info.flush_request = flush_nr;
+}
+
+/* Issue barrier requests to targets and wait for their completion. */
+static int dm_rq_barrier(struct mapped_device *md)
+{
+	int i, j;
+	struct dm_table *map = dm_get_table(md);
+	unsigned num_targets = dm_table_get_num_targets(map);
+	struct dm_target *ti;
+	struct request *clone;
+
+	md->barrier_error = 0;
+
+	for (i = 0; i < num_targets; i++) {
+		ti = dm_table_get_target(map, i);
+		for (j = 0; j < ti->num_flush_requests; j++) {
+			clone = clone_rq(md->flush_request, md, GFP_NOIO);
+			dm_rq_set_flush_nr(clone, j);
+			atomic_inc(&md->pending[rq_data_dir(clone)]);
+			map_request(ti, clone, md);
+		}
+	}
+
+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+	dm_table_put(map);
+
+	return md->barrier_error;
+}
+
+static void dm_rq_barrier_work(struct work_struct *work)
+{
+	int error;
+	struct mapped_device *md = container_of(work, struct mapped_device,
+						barrier_work);
+	struct request_queue *q = md->queue;
+	struct request *rq;
+	unsigned long flags;
+
+	/*
+	 * Hold the md reference here and leave it at the last part so that
+	 * the md can't be deleted by device opener when the barrier request
+	 * completes.
+	 */
+	dm_get(md);
+
+	error = dm_rq_barrier(md);
+
+	rq = md->flush_request;
+	md->flush_request = NULL;
+
+	if (error == DM_ENDIO_REQUEUE) {
+		spin_lock_irqsave(q->queue_lock, flags);
+		blk_requeue_request(q, rq);
+		spin_unlock_irqrestore(q->queue_lock, flags);
+	} else
+		blk_end_request_all(rq, error);
+
+	blk_run_queue(q);
+
+	dm_put(md);
+}
+
 /*
  * Swap in a new table (destroying old one).
  */
@@ -2325,11 +2498,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
 	up_write(&md->io_lock);
 
-	flush_workqueue(md->wq);
-
+	/*
+	 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
+	 * can be kicked until md->queue is stopped.  So stop md->queue before
+	 * flushing md->wq.
+	 */
 	if (dm_request_based(md))
 		stop_queue(md->queue);
 
+	flush_workqueue(md->wq);
+
 	/*
 	 * At this point no more requests are entering target request routines.
 	 * We call dm_wait_for_completion to wait for all existing requests
-- 
cgit v1.2.3


From 7c6664114b7342a36f14a6836564e865669b3cea Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Thu, 10 Dec 2009 23:52:19 +0000
Subject: dm: rename dm_get_table to dm_get_live_table

Rename dm_get_table to dm_get_live_table.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c         | 14 +++++++-------
 drivers/md/dm.c               | 24 ++++++++++++------------
 include/linux/device-mapper.h |  2 +-
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 99de0e4ce83..bf3d19a0fb7 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -235,7 +235,7 @@ static void __hash_remove(struct hash_cell *hc)
 	dm_set_mdptr(hc->md, NULL);
 	mutex_unlock(&dm_hash_cells_mutex);
 
-	table = dm_get_table(hc->md);
+	table = dm_get_live_table(hc->md);
 	if (table) {
 		dm_table_event(table);
 		dm_table_put(table);
@@ -338,7 +338,7 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
 	/*
 	 * Wake up any dm event waiters.
 	 */
-	table = dm_get_table(hc->md);
+	table = dm_get_live_table(hc->md);
 	if (table) {
 		dm_table_event(table);
 		dm_table_put(table);
@@ -564,7 +564,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 
 	param->event_nr = dm_get_event_nr(md);
 
-	table = dm_get_table(md);
+	table = dm_get_live_table(md);
 	if (table) {
 		param->flags |= DM_ACTIVE_PRESENT_FLAG;
 		param->target_count = dm_table_get_num_targets(table);
@@ -993,7 +993,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
 	if (r)
 		goto out;
 
-	table = dm_get_table(md);
+	table = dm_get_live_table(md);
 	if (table) {
 		retrieve_status(table, param, param_size);
 		dm_table_put(table);
@@ -1226,7 +1226,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
 	if (r)
 		goto out;
 
-	table = dm_get_table(md);
+	table = dm_get_live_table(md);
 	if (table) {
 		retrieve_deps(table, param, param_size);
 		dm_table_put(table);
@@ -1255,7 +1255,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 	if (r)
 		goto out;
 
-	table = dm_get_table(md);
+	table = dm_get_live_table(md);
 	if (table) {
 		retrieve_status(table, param, param_size);
 		dm_table_put(table);
@@ -1299,7 +1299,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 		goto out;
 	}
 
-	table = dm_get_table(md);
+	table = dm_get_live_table(md);
 	if (!table)
 		goto out_argv;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3de8d6d5b0b..233a2e9156a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -397,7 +397,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long arg)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
-	struct dm_table *map = dm_get_table(md);
+	struct dm_table *map = dm_get_live_table(md);
 	struct dm_target *tgt;
 	int r = -ENOTTY;
 
@@ -528,7 +528,7 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
  * function to access the md->map field, and make sure they call
  * dm_table_put() when finished.
  */
-struct dm_table *dm_get_table(struct mapped_device *md)
+struct dm_table *dm_get_live_table(struct mapped_device *md)
 {
 	struct dm_table *t;
 	unsigned long flags;
@@ -1294,7 +1294,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	struct clone_info ci;
 	int error = 0;
 
-	ci.map = dm_get_table(md);
+	ci.map = dm_get_live_table(md);
 	if (unlikely(!ci.map)) {
 		if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
 			bio_io_error(bio);
@@ -1335,7 +1335,7 @@ static int dm_merge_bvec(struct request_queue *q,
 			 struct bio_vec *biovec)
 {
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_table(md);
+	struct dm_table *map = dm_get_live_table(md);
 	struct dm_target *ti;
 	sector_t max_sectors;
 	int max_size = 0;
@@ -1638,7 +1638,7 @@ static void map_request(struct dm_target *ti, struct request *clone,
 static void dm_request_fn(struct request_queue *q)
 {
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_table(md);
+	struct dm_table *map = dm_get_live_table(md);
 	struct dm_target *ti;
 	struct request *rq, *clone;
 
@@ -1697,7 +1697,7 @@ static int dm_lld_busy(struct request_queue *q)
 {
 	int r;
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_table(md);
+	struct dm_table *map = dm_get_live_table(md);
 
 	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
 		r = 1;
@@ -1712,7 +1712,7 @@ static int dm_lld_busy(struct request_queue *q)
 static void dm_unplug_all(struct request_queue *q)
 {
 	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_table(md);
+	struct dm_table *map = dm_get_live_table(md);
 
 	if (map) {
 		if (dm_request_based(md))
@@ -1730,7 +1730,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 	struct dm_table *map;
 
 	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
-		map = dm_get_table(md);
+		map = dm_get_live_table(md);
 		if (map) {
 			/*
 			 * Request-based dm cares about only own queue for
@@ -2166,7 +2166,7 @@ void dm_put(struct mapped_device *md)
 	BUG_ON(test_bit(DMF_FREEING, &md->flags));
 
 	if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
-		map = dm_get_table(md);
+		map = dm_get_live_table(md);
 		idr_replace(&_minor_idr, MINOR_ALLOCED,
 			    MINOR(disk_devt(dm_disk(md))));
 		set_bit(DMF_FREEING, &md->flags);
@@ -2302,7 +2302,7 @@ static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
 static int dm_rq_barrier(struct mapped_device *md)
 {
 	int i, j;
-	struct dm_table *map = dm_get_table(md);
+	struct dm_table *map = dm_get_live_table(md);
 	unsigned num_targets = dm_table_get_num_targets(map);
 	struct dm_target *ti;
 	struct request *clone;
@@ -2453,7 +2453,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		goto out_unlock;
 	}
 
-	map = dm_get_table(md);
+	map = dm_get_live_table(md);
 
 	/*
 	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2558,7 +2558,7 @@ int dm_resume(struct mapped_device *md)
 	if (!dm_suspended(md))
 		goto out;
 
-	map = dm_get_table(md);
+	map = dm_get_live_table(md);
 	if (!map || !dm_table_get_size(map))
 		goto out;
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index df7607e6dce..fc16a351ef7 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -276,7 +276,7 @@ void dm_table_unplug_all(struct dm_table *t);
 /*
  * Table reference counting.
  */
-struct dm_table *dm_get_table(struct mapped_device *md);
+struct dm_table *dm_get_live_table(struct mapped_device *md);
 void dm_table_get(struct dm_table *t);
 void dm_table_put(struct dm_table *t);
 
-- 
cgit v1.2.3


From 6df400ab64c68fc4072a13019fc20fd7e3d51303 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:19 +0000
Subject: dm mpath: flush workqueues before suspend completes

This patch stops the remaining dm-mpath activity during the suspend
sequence by flushing workqueues in postsuspend function.

The current dm-mpath target may not be quiet even after suspend completes
because some workqueues (e.g. device_handler's work, event handling)
are not flushed during the suspend sequence, even though suspended
devices/targets are supposed to be quiet in this state.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index dce971dbdfa..5b23f2df9bd 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -885,13 +885,18 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	return r;
 }
 
-static void multipath_dtr(struct dm_target *ti)
+static void flush_multipath_work(void)
 {
-	struct multipath *m = (struct multipath *) ti->private;
-
 	flush_workqueue(kmpath_handlerd);
 	flush_workqueue(kmultipathd);
 	flush_scheduled_work();
+}
+
+static void multipath_dtr(struct dm_target *ti)
+{
+	struct multipath *m = ti->private;
+
+	flush_multipath_work();
 	free_multipath(m);
 }
 
@@ -1261,6 +1266,11 @@ static void multipath_presuspend(struct dm_target *ti)
 	queue_if_no_path(m, 0, 1);
 }
 
+static void multipath_postsuspend(struct dm_target *ti)
+{
+	flush_multipath_work();
+}
+
 /*
  * Restore the queue_if_no_path setting.
  */
@@ -1567,13 +1577,14 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 1, 0},
+	.version = {1, 1, 1},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
 	.map_rq = multipath_map,
 	.rq_end_io = multipath_end_io,
 	.presuspend = multipath_presuspend,
+	.postsuspend = multipath_postsuspend,
 	.resume = multipath_resume,
 	.status = multipath_status,
 	.message = multipath_message,
-- 
cgit v1.2.3


From 432a212c0dd0f4ca386cf37c5b740ac9dbda4479 Mon Sep 17 00:00:00 2001
From: Mike Anderson <andmike@linux.vnet.ibm.com>
Date: Thu, 10 Dec 2009 23:52:20 +0000
Subject: dm: add dm_deleting_md function

Add dm_deleting_md to check whether or not a given mapped
device is currently being deleted.

Signed-off-by: Mike Anderson <andmike@linux.vnet.ibm.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 9 +++++++--
 drivers/md/dm.h | 5 +++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 233a2e9156a..16f759fe04f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -329,6 +329,11 @@ static void __exit dm_exit(void)
 /*
  * Block device functions
  */
+int dm_deleting_md(struct mapped_device *md)
+{
+	return test_bit(DMF_DELETING, &md->flags);
+}
+
 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 {
 	struct mapped_device *md;
@@ -340,7 +345,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 		goto out;
 
 	if (test_bit(DMF_FREEING, &md->flags) ||
-	    test_bit(DMF_DELETING, &md->flags)) {
+	    dm_deleting_md(md)) {
 		md = NULL;
 		goto out;
 	}
@@ -2659,7 +2664,7 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
 		return NULL;
 
 	if (test_bit(DMF_FREEING, &md->flags) ||
-	    test_bit(DMF_DELETING, &md->flags))
+	    dm_deleting_md(md))
 		return NULL;
 
 	dm_get(md);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 4a95e8fa360..604a5f2a238 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -88,6 +88,11 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt,
 
 int dm_split_args(int *argc, char ***argvp, char *input);
 
+/*
+ * Is this mapped_device being deleted?
+ */
+int dm_deleting_md(struct mapped_device *md);
+
 /*
  * The device-mapper can be driven through one of two interfaces;
  * ioctl or filesystem, depending which patch you have applied.
-- 
cgit v1.2.3


From c50abeb38026ea721a812cf8a9b2fac5d3b7684b Mon Sep 17 00:00:00 2001
From: Mike Anderson <andmike@linux.vnet.ibm.com>
Date: Thu, 10 Dec 2009 23:52:20 +0000
Subject: dm ioctl: forbid messages to devices being deleted

Once we begin deleting a device, prevent any further messages being sent
to targets of its table (to avoid races).

Signed-off-by: Mike Anderson <andmike@linux.vnet.ibm.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index bf3d19a0fb7..d06dd39856f 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1303,6 +1303,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 	if (!table)
 		goto out_argv;
 
+	if (dm_deleting_md(md)) {
+		r = -ENXIO;
+		goto out_table;
+	}
+
 	ti = dm_table_find_target(table, tmsg->sector);
 	if (!dm_target_is_valid(ti)) {
 		DMWARN("Target message sector outside device.");
@@ -1314,6 +1319,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 		r = -EINVAL;
 	}
 
+ out_table:
 	dm_table_put(table);
  out_argv:
 	kfree(argv);
-- 
cgit v1.2.3


From 6380f26f0424034345461cabaab9a7030d905b59 Mon Sep 17 00:00:00 2001
From: Mike Anderson <andmike@linux.vnet.ibm.com>
Date: Thu, 10 Dec 2009 23:52:21 +0000
Subject: dm mpath: add mutex to synchronize adding and flushing work

Add a mutex to allow possible creators of new work to synchronize with
flushing work queues.

Signed-off-by: Mike Anderson <andmike@linux.vnet.ibm.com>
Acked-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 59 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 21 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 5b23f2df9bd..700154e2148 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -93,6 +93,8 @@ struct multipath {
 	 * can resubmit bios on error.
 	 */
 	mempool_t *mpio_pool;
+
+	struct mutex work_mutex;
 };
 
 /*
@@ -198,6 +200,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		m->queue_io = 1;
 		INIT_WORK(&m->process_queued_ios, process_queued_ios);
 		INIT_WORK(&m->trigger_event, trigger_event);
+		mutex_init(&m->work_mutex);
 		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
@@ -1268,7 +1271,11 @@ static void multipath_presuspend(struct dm_target *ti)
 
 static void multipath_postsuspend(struct dm_target *ti)
 {
+	struct multipath *m = ti->private;
+
+	mutex_lock(&m->work_mutex);
 	flush_multipath_work();
+	mutex_unlock(&m->work_mutex);
 }
 
 /*
@@ -1407,51 +1414,61 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
 
 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 {
-	int r;
+	int r = -EINVAL;
 	struct dm_dev *dev;
 	struct multipath *m = (struct multipath *) ti->private;
 	action_fn action;
 
+	mutex_lock(&m->work_mutex);
+
 	if (argc == 1) {
-		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path")))
-			return queue_if_no_path(m, 1, 0);
-		else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path")))
-			return queue_if_no_path(m, 0, 0);
+		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
+			r = queue_if_no_path(m, 1, 0);
+			goto out;
+		} else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
+			r = queue_if_no_path(m, 0, 0);
+			goto out;
+		}
 	}
 
-	if (argc != 2)
-		goto error;
+	if (argc != 2) {
+		DMWARN("Unrecognised multipath message received.");
+		goto out;
+	}
 
-	if (!strnicmp(argv[0], MESG_STR("disable_group")))
-		return bypass_pg_num(m, argv[1], 1);
-	else if (!strnicmp(argv[0], MESG_STR("enable_group")))
-		return bypass_pg_num(m, argv[1], 0);
-	else if (!strnicmp(argv[0], MESG_STR("switch_group")))
-		return switch_pg_num(m, argv[1]);
-	else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
+	if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
+		r = bypass_pg_num(m, argv[1], 1);
+		goto out;
+	} else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
+		r = bypass_pg_num(m, argv[1], 0);
+		goto out;
+	} else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
+		r = switch_pg_num(m, argv[1]);
+		goto out;
+	} else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
 		action = reinstate_path;
 	else if (!strnicmp(argv[0], MESG_STR("fail_path")))
 		action = fail_path;
-	else
-		goto error;
+	else {
+		DMWARN("Unrecognised multipath message received.");
+		goto out;
+	}
 
 	r = dm_get_device(ti, argv[1], ti->begin, ti->len,
 			  dm_table_get_mode(ti->table), &dev);
 	if (r) {
 		DMWARN("message: error getting device %s",
 		       argv[1]);
-		return -EINVAL;
+		goto out;
 	}
 
 	r = action_dev(m, dev, action);
 
 	dm_put_device(ti, dev);
 
+out:
+	mutex_unlock(&m->work_mutex);
 	return r;
-
-error:
-	DMWARN("Unrecognised multipath message received.");
-	return -EINVAL;
 }
 
 static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
-- 
cgit v1.2.3


From 67a46dad25ccc8910995d8671f7ec17a6bc823cb Mon Sep 17 00:00:00 2001
From: Mike Anderson <andmike@linux.vnet.ibm.com>
Date: Thu, 10 Dec 2009 23:52:21 +0000
Subject: dm mpath: prevent io from work queue while suspended

Reject messages that can generate I/O while the device itself
is suspended.

Signed-off-by: Mike Anderson <andmike@linux.vnet.ibm.com>
Acked-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 700154e2148..45d9bf14cc4 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -95,6 +95,8 @@ struct multipath {
 	mempool_t *mpio_pool;
 
 	struct mutex work_mutex;
+
+	unsigned suspended;	/* Don't create new I/O internally when set. */
 };
 
 /*
@@ -1274,6 +1276,7 @@ static void multipath_postsuspend(struct dm_target *ti)
 	struct multipath *m = ti->private;
 
 	mutex_lock(&m->work_mutex);
+	m->suspended = 1;
 	flush_multipath_work();
 	mutex_unlock(&m->work_mutex);
 }
@@ -1286,6 +1289,10 @@ static void multipath_resume(struct dm_target *ti)
 	struct multipath *m = (struct multipath *) ti->private;
 	unsigned long flags;
 
+	mutex_lock(&m->work_mutex);
+	m->suspended = 0;
+	mutex_unlock(&m->work_mutex);
+
 	spin_lock_irqsave(&m->lock, flags);
 	m->queue_if_no_path = m->saved_queue_if_no_path;
 	spin_unlock_irqrestore(&m->lock, flags);
@@ -1421,6 +1428,11 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 
 	mutex_lock(&m->work_mutex);
 
+	if (m->suspended) {
+		r = -EBUSY;
+		goto out;
+	}
+
 	if (argc == 1) {
 		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
 			r = queue_if_no_path(m, 1, 0);
-- 
cgit v1.2.3


From 12fc0f49dc994d8d90dcf3df13f5b1ee5441288d Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:22 +0000
Subject: dm io: handle empty barriers

Accept empty barriers in dm-io.

dm-io will process empty write barrier requests just like the other
read/write requests.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-io.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index f6a714c5aab..10f457ca6af 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -309,7 +309,11 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 	unsigned num_bvecs;
 	sector_t remaining = where->count;
 
-	while (remaining) {
+	/*
+	 * where->count may be zero if rw holds a write barrier and we
+	 * need to send a zero-sized barrier.
+	 */
+	do {
 		/*
 		 * Allocate a suitably sized-bio.
 		 */
@@ -339,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 
 		atomic_inc(&io->count);
 		submit_bio(rw, bio);
-	}
+	} while (remaining);
 }
 
 static void dispatch_io(int rw, unsigned int num_regions,
@@ -360,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	 */
 	for (i = 0; i < num_regions; i++) {
 		*dp = old_pages;
-		if (where[i].count)
+		if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
 			do_region(rw, i, where + i, dp, io);
 	}
 
-- 
cgit v1.2.3


From 1d0f3ce83200edc5d43723c77c62b09ad6560294 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 10 Dec 2009 23:52:22 +0000
Subject: dm ioctl: retrieve status from inactive table

Add the flag DM_QUERY_INACTIVE_TABLE_FLAG to the ioctls to return
infomation about the loaded-but-not-yet-active table instead of the live
table.  Prior to this patch it was impossible to obtain this information
until the device had been 'resumed'.

Userspace dmsetup and libdevmapper support the flag as of version 1.02.40.
e.g. dmsetup info --inactive vg1-lv1

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c    | 70 +++++++++++++++++++++++++++++++++++++++---------
 include/linux/dm-ioctl.h | 13 ++++++---
 2 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index d06dd39856f..a3d20265ffc 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -523,8 +523,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size)
 	return 0;
 }
 
-
-
 static int check_name(const char *name)
 {
 	if (strchr(name, '/')) {
@@ -535,6 +533,40 @@ static int check_name(const char *name)
 	return 0;
 }
 
+/*
+ * On successful return, the caller must not attempt to acquire
+ * _hash_lock without first calling dm_table_put, because dm_table_destroy
+ * waits for this dm_table_put and could be called under this lock.
+ */
+static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
+{
+	struct hash_cell *hc;
+	struct dm_table *table = NULL;
+
+	down_read(&_hash_lock);
+	hc = dm_get_mdptr(md);
+	if (!hc || hc->md != md) {
+		DMWARN("device has been removed from the dev hash table.");
+		goto out;
+	}
+
+	table = hc->new_map;
+	if (table)
+		dm_table_get(table);
+
+out:
+	up_read(&_hash_lock);
+
+	return table;
+}
+
+static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
+						      struct dm_ioctl *param)
+{
+	return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
+		dm_get_inactive_table(md) : dm_get_live_table(md);
+}
+
 /*
  * Fills in a dm_ioctl structure, ready for sending back to
  * userland.
@@ -559,18 +591,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 	 */
 	param->open_count = dm_open_count(md);
 
-	if (get_disk_ro(disk))
-		param->flags |= DM_READONLY_FLAG;
-
 	param->event_nr = dm_get_event_nr(md);
+	param->target_count = 0;
 
 	table = dm_get_live_table(md);
 	if (table) {
-		param->flags |= DM_ACTIVE_PRESENT_FLAG;
-		param->target_count = dm_table_get_num_targets(table);
+		if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
+			if (get_disk_ro(disk))
+				param->flags |= DM_READONLY_FLAG;
+			param->target_count = dm_table_get_num_targets(table);
+		}
 		dm_table_put(table);
-	} else
-		param->target_count = 0;
+
+		param->flags |= DM_ACTIVE_PRESENT_FLAG;
+	}
+
+	if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
+		table = dm_get_inactive_table(md);
+		if (table) {
+			if (!(dm_table_get_mode(table) & FMODE_WRITE))
+				param->flags |= DM_READONLY_FLAG;
+			param->target_count = dm_table_get_num_targets(table);
+			dm_table_put(table);
+		}
+	}
 
 	return 0;
 }
@@ -993,7 +1037,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
 	if (r)
 		goto out;
 
-	table = dm_get_live_table(md);
+	table = dm_get_live_or_inactive_table(md, param);
 	if (table) {
 		retrieve_status(table, param, param_size);
 		dm_table_put(table);
@@ -1226,7 +1270,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
 	if (r)
 		goto out;
 
-	table = dm_get_live_table(md);
+	table = dm_get_live_or_inactive_table(md, param);
 	if (table) {
 		retrieve_deps(table, param, param_size);
 		dm_table_put(table);
@@ -1255,13 +1299,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 	if (r)
 		goto out;
 
-	table = dm_get_live_table(md);
+	table = dm_get_live_or_inactive_table(md, param);
 	if (table) {
 		retrieve_status(table, param, param_size);
 		dm_table_put(table);
 	}
 
- out:
+out:
 	dm_put(md);
 	return r;
 }
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 2ab84c83c31..aa95508d2f9 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
- * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004 - 2009 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the LGPL.
  */
@@ -266,9 +266,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	15
+#define DM_VERSION_MINOR	16
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2009-04-01)"
+#define DM_VERSION_EXTRA	"-ioctl (2009-11-05)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
@@ -309,4 +309,11 @@ enum {
  */
 #define DM_NOFLUSH_FLAG		(1 << 11) /* In */
 
+/*
+ * If set, any table information returned will relate to the inactive
+ * table instead of the live one.  Always check DM_INACTIVE_PRESENT_FLAG
+ * is set before using the data returned.
+ */
+#define DM_QUERY_INACTIVE_TABLE_FLAG	(1 << 12) /* In */
+
 #endif				/* _LINUX_DM_IOCTL_H */
-- 
cgit v1.2.3


From a794015597a2d9b437470c7692aac77e5fc08cd2 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Thu, 10 Dec 2009 23:52:23 +0000
Subject: dm: bind new table before destroying old

When replacing a mapped device's table during a 'resume', delay the
destruction of the old table until the new one is successfully in place.

This will make it easier for a later patch to transfer internal state
information from the old table to the new one (something we do not currently
support) while giving us more options for reversion if a later part
of the operation fails.

Devices are always in the suspended state during dm_swap_table().
This patch reinforces the requirement that all I/O must have been
flushed from the table targets while in this state (including any in
workqueues).  In the case of 'noflush' suspending, unprocessed
I/O should have been 'pushed back' to the dm core prior to this point,
for resubmission after the new table is in place.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c |  3 +++
 drivers/md/dm.c       | 16 +++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 1a6cb3c7822..3162b9c3c3f 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -237,6 +237,9 @@ void dm_table_destroy(struct dm_table *t)
 {
 	unsigned int i;
 
+	if (!t)
+		return;
+
 	while (atomic_read(&t->holders))
 		msleep(1);
 	smp_mb();
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 16f759fe04f..255b75c6154 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2077,19 +2077,23 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
 	return 0;
 }
 
-static void __unbind(struct mapped_device *md)
+/*
+ * Returns unbound table for the caller to free.
+ */
+static struct dm_table *__unbind(struct mapped_device *md)
 {
 	struct dm_table *map = md->map;
 	unsigned long flags;
 
 	if (!map)
-		return;
+		return NULL;
 
 	dm_table_event_callback(map, NULL, NULL);
 	write_lock_irqsave(&md->map_lock, flags);
 	md->map = NULL;
 	write_unlock_irqrestore(&md->map_lock, flags);
-	dm_table_destroy(map);
+
+	return map;
 }
 
 /*
@@ -2182,7 +2186,7 @@ void dm_put(struct mapped_device *md)
 		}
 		dm_sysfs_exit(md);
 		dm_table_put(map);
-		__unbind(md);
+		dm_table_destroy(__unbind(md));
 		free_dev(md);
 	}
 }
@@ -2368,6 +2372,7 @@ static void dm_rq_barrier_work(struct work_struct *work)
  */
 int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 {
+	struct dm_table *map;
 	struct queue_limits limits;
 	int r = -EINVAL;
 
@@ -2388,8 +2393,9 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 		goto out;
 	}
 
-	__unbind(md);
+	map = __unbind(md);
 	r = __bind(md, table, &limits);
+	dm_table_destroy(map);
 
 out:
 	mutex_unlock(&md->suspend_lock);
-- 
cgit v1.2.3


From 042d2a9bcd80fe12d4b0871706aa9dd2231e8238 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Thu, 10 Dec 2009 23:52:24 +0000
Subject: dm: keep old table until after resume succeeded

When swapping a new table into place, retain the old table until
its replacement is in place.

An old check for an empty table is removed because this is enforced
in populate_table().

__unbind() becomes redundant when followed by __bind().

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c         | 10 ++++++----
 drivers/md/dm.c               | 34 +++++++++++++++++-----------------
 include/linux/device-mapper.h |  4 +++-
 3 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a3d20265ffc..63fd4de25bd 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -855,7 +855,7 @@ static int do_resume(struct dm_ioctl *param)
 	unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
 	struct hash_cell *hc;
 	struct mapped_device *md;
-	struct dm_table *new_map;
+	struct dm_table *new_map, *old_map = NULL;
 
 	down_write(&_hash_lock);
 
@@ -884,11 +884,11 @@ static int do_resume(struct dm_ioctl *param)
 		if (!dm_suspended(md))
 			dm_suspend(md, suspend_flags);
 
-		r = dm_swap_table(md, new_map);
-		if (r) {
+		old_map = dm_swap_table(md, new_map);
+		if (IS_ERR(old_map)) {
 			dm_table_destroy(new_map);
 			dm_put(md);
-			return r;
+			return PTR_ERR(old_map);
 		}
 
 		if (dm_table_get_mode(new_map) & FMODE_WRITE)
@@ -900,6 +900,8 @@ static int do_resume(struct dm_ioctl *param)
 	if (dm_suspended(md))
 		r = dm_resume(md);
 
+	if (old_map)
+		dm_table_destroy(old_map);
 
 	if (!r) {
 		dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 255b75c6154..c254c6cf69a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2033,9 +2033,13 @@ static void __set_size(struct mapped_device *md, sector_t size)
 	mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
 
-static int __bind(struct mapped_device *md, struct dm_table *t,
-		  struct queue_limits *limits)
+/*
+ * Returns old map, which caller must destroy.
+ */
+static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
+			       struct queue_limits *limits)
 {
+	struct dm_table *old_map;
 	struct request_queue *q = md->queue;
 	sector_t size;
 	unsigned long flags;
@@ -2050,11 +2054,6 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
 
 	__set_size(md, size);
 
-	if (!size) {
-		dm_table_destroy(t);
-		return 0;
-	}
-
 	dm_table_event_callback(t, event_callback, md);
 
 	/*
@@ -2070,11 +2069,12 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
 	__bind_mempools(md, t);
 
 	write_lock_irqsave(&md->map_lock, flags);
+	old_map = md->map;
 	md->map = t;
 	dm_table_set_restrictions(t, q, limits);
 	write_unlock_irqrestore(&md->map_lock, flags);
 
-	return 0;
+	return old_map;
 }
 
 /*
@@ -2368,13 +2368,13 @@ static void dm_rq_barrier_work(struct work_struct *work)
 }
 
 /*
- * Swap in a new table (destroying old one).
+ * Swap in a new table, returning the old one for the caller to destroy.
  */
-int dm_swap_table(struct mapped_device *md, struct dm_table *table)
+struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 {
-	struct dm_table *map;
+	struct dm_table *map = ERR_PTR(-EINVAL);
 	struct queue_limits limits;
-	int r = -EINVAL;
+	int r;
 
 	mutex_lock(&md->suspend_lock);
 
@@ -2383,8 +2383,10 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 		goto out;
 
 	r = dm_calculate_queue_limits(table, &limits);
-	if (r)
+	if (r) {
+		map = ERR_PTR(r);
 		goto out;
+	}
 
 	/* cannot change the device type, once a table is bound */
 	if (md->map &&
@@ -2393,13 +2395,11 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 		goto out;
 	}
 
-	map = __unbind(md);
-	r = __bind(md, table, &limits);
-	dm_table_destroy(map);
+	map = __bind(md, table, &limits);
 
 out:
 	mutex_unlock(&md->suspend_lock);
-	return r;
+	return map;
 }
 
 /*
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index fc16a351ef7..b9c6c8ca11b 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -295,8 +295,10 @@ void dm_table_event(struct dm_table *t);
 
 /*
  * The device must be suspended before calling this method.
+ * Returns the previous table, which the caller must destroy.
  */
-int dm_swap_table(struct mapped_device *md, struct dm_table *t);
+struct dm_table *dm_swap_table(struct mapped_device *md,
+			       struct dm_table *t);
 
 /*
  * A wrapper around vmalloc.
-- 
cgit v1.2.3


From c1f0c183f6acc6d32c5a1d0249ec68bf783af7b1 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 10 Dec 2009 23:52:24 +0000
Subject: dm snapshot: allow live exception store handover between tables

Permit in-use snapshot exception data to be 'handed over' from one
snapshot instance to another.  This is a pre-requisite for patches
that allow the changes made in a snapshot device to be merged back into
its origin device and also allows device resizing.

The basic call sequence is:

  dmsetup load new_snapshot (referencing the existing in-use cow device)
     - the ctr code detects that the cow is already in use and allows the
       two snapshot target instances to be linked together
  dmsetup suspend original_snapshot
  dmsetup resume new_snapshot
     - the new_snapshot becomes live, and if anything now tries to access
       the original one it will receive -EIO
  dmsetup remove original_snapshot

(There can only be two snapshot targets referencing the same cow device
simultaneously.)

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 263 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 236 insertions(+), 27 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index fd04caa9034..b5b9118c063 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -302,23 +302,117 @@ static void __insert_origin(struct origin *o)
 	list_add_tail(&o->hash_list, sl);
 }
 
+/*
+ * _origins_lock must be held when calling this function.
+ * Returns number of snapshots registered using the supplied cow device, plus:
+ * snap_src - a snapshot suitable for use as a source of exception handover
+ * snap_dest - a snapshot capable of receiving exception handover.
+ *
+ * Possible return values and states:
+ *   0: NULL, NULL  - first new snapshot
+ *   1: snap_src, NULL - normal snapshot
+ *   2: snap_src, snap_dest  - waiting for handover
+ *   2: snap_src, NULL - handed over, waiting for old to be deleted
+ *   1: NULL, snap_dest - source got destroyed without handover
+ */
+static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
+					struct dm_snapshot **snap_src,
+					struct dm_snapshot **snap_dest)
+{
+	struct dm_snapshot *s;
+	struct origin *o;
+	int count = 0;
+	int active;
+
+	o = __lookup_origin(snap->origin->bdev);
+	if (!o)
+		goto out;
+
+	list_for_each_entry(s, &o->snapshots, list) {
+		if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
+			continue;
+
+		down_read(&s->lock);
+		active = s->active;
+		up_read(&s->lock);
+
+		if (active) {
+			if (snap_src)
+				*snap_src = s;
+		} else if (snap_dest)
+			*snap_dest = s;
+
+		count++;
+	}
+
+out:
+	return count;
+}
+
+/*
+ * On success, returns 1 if this snapshot is a handover destination,
+ * otherwise returns 0.
+ */
+static int __validate_exception_handover(struct dm_snapshot *snap)
+{
+	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+
+	/* Does snapshot need exceptions handed over to it? */
+	if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest) == 2) ||
+	    snap_dest) {
+		snap->ti->error = "Snapshot cow pairing for exception "
+				  "table handover failed";
+		return -EINVAL;
+	}
+
+	/*
+	 * If no snap_src was found, snap cannot become a handover
+	 * destination.
+	 */
+	if (!snap_src)
+		return 0;
+
+	return 1;
+}
+
+static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
+{
+	struct dm_snapshot *l;
+
+	/* Sort the list according to chunk size, largest-first smallest-last */
+	list_for_each_entry(l, &o->snapshots, list)
+		if (l->store->chunk_size < s->store->chunk_size)
+			break;
+	list_add_tail(&s->list, &l->list);
+}
+
 /*
  * Make a note of the snapshot and its origin so we can look it
  * up when the origin has a write on it.
+ *
+ * Also validate snapshot exception store handovers.
+ * On success, returns 1 if this registration is a handover destination,
+ * otherwise returns 0.
  */
 static int register_snapshot(struct dm_snapshot *snap)
 {
-	struct dm_snapshot *l;
-	struct origin *o, *new_o;
+	struct origin *o, *new_o = NULL;
 	struct block_device *bdev = snap->origin->bdev;
+	int r = 0;
 
 	new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
 	if (!new_o)
 		return -ENOMEM;
 
 	down_write(&_origins_lock);
-	o = __lookup_origin(bdev);
 
+	r = __validate_exception_handover(snap);
+	if (r < 0) {
+		kfree(new_o);
+		goto out;
+	}
+
+	o = __lookup_origin(bdev);
 	if (o)
 		kfree(new_o);
 	else {
@@ -332,14 +426,27 @@ static int register_snapshot(struct dm_snapshot *snap)
 		__insert_origin(o);
 	}
 
-	/* Sort the list according to chunk size, largest-first smallest-last */
-	list_for_each_entry(l, &o->snapshots, list)
-		if (l->store->chunk_size < snap->store->chunk_size)
-			break;
-	list_add_tail(&snap->list, &l->list);
+	__insert_snapshot(o, snap);
+
+out:
+	up_write(&_origins_lock);
+
+	return r;
+}
+
+/*
+ * Move snapshot to correct place in list according to chunk size.
+ */
+static void reregister_snapshot(struct dm_snapshot *s)
+{
+	struct block_device *bdev = s->origin->bdev;
+
+	down_write(&_origins_lock);
+
+	list_del(&s->list);
+	__insert_snapshot(__lookup_origin(bdev), s);
 
 	up_write(&_origins_lock);
-	return 0;
 }
 
 static void unregister_snapshot(struct dm_snapshot *s)
@@ -350,7 +457,7 @@ static void unregister_snapshot(struct dm_snapshot *s)
 	o = __lookup_origin(s->origin->bdev);
 
 	list_del(&s->list);
-	if (list_empty(&o->snapshots)) {
+	if (o && list_empty(&o->snapshots)) {
 		list_del(&o->hash_list);
 		kfree(o);
 	}
@@ -662,6 +769,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	s->suspended = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
 	init_rwsem(&s->lock);
+	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
 
 	/* Allocate hash table for COW data */
@@ -696,39 +804,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	spin_lock_init(&s->tracked_chunk_lock);
 
-	/* Metadata must only be loaded into one table at once */
+	bio_list_init(&s->queued_bios);
+	INIT_WORK(&s->queued_bios_work, flush_queued_bios);
+
+	ti->private = s;
+	ti->num_flush_requests = 1;
+
+	/* Add snapshot to the list of snapshots for this origin */
+	/* Exceptions aren't triggered till snapshot_resume() is called */
+	r = register_snapshot(s);
+	if (r == -ENOMEM) {
+		ti->error = "Snapshot origin struct allocation failed";
+		goto bad_load_and_register;
+	} else if (r < 0) {
+		/* invalid handover, register_snapshot has set ti->error */
+		goto bad_load_and_register;
+	}
+
+	/*
+	 * Metadata must only be loaded into one table at once, so skip this
+	 * if metadata will be handed over during resume.
+	 * Chunk size will be set during the handover - set it to zero to
+	 * ensure it's ignored.
+	 */
+	if (r > 0) {
+		s->store->chunk_size = 0;
+		return 0;
+	}
+
 	r = s->store->type->read_metadata(s->store, dm_add_exception,
 					  (void *)s);
 	if (r < 0) {
 		ti->error = "Failed to read snapshot metadata";
-		goto bad_load_and_register;
+		goto bad_read_metadata;
 	} else if (r > 0) {
 		s->valid = 0;
 		DMWARN("Snapshot is marked invalid.");
 	}
 
-	bio_list_init(&s->queued_bios);
-	INIT_WORK(&s->queued_bios_work, flush_queued_bios);
-
 	if (!s->store->chunk_size) {
 		ti->error = "Chunk size not set";
-		goto bad_load_and_register;
-	}
-
-	/* Add snapshot to the list of snapshots for this origin */
-	/* Exceptions aren't triggered till snapshot_resume() is called */
-	if (register_snapshot(s)) {
-		r = -EINVAL;
-		ti->error = "Cannot register snapshot origin";
-		goto bad_load_and_register;
+		goto bad_read_metadata;
 	}
-
-	ti->private = s;
 	ti->split_io = s->store->chunk_size;
-	ti->num_flush_requests = 1;
 
 	return 0;
 
+bad_read_metadata:
+	unregister_snapshot(s);
+
 bad_load_and_register:
 	mempool_destroy(s->tracked_chunk_pool);
 
@@ -767,15 +891,58 @@ static void __free_exceptions(struct dm_snapshot *s)
 	dm_exception_table_exit(&s->complete, exception_cache);
 }
 
+static void __handover_exceptions(struct dm_snapshot *snap_src,
+				  struct dm_snapshot *snap_dest)
+{
+	union {
+		struct dm_exception_table table_swap;
+		struct dm_exception_store *store_swap;
+	} u;
+
+	/*
+	 * Swap all snapshot context information between the two instances.
+	 */
+	u.table_swap = snap_dest->complete;
+	snap_dest->complete = snap_src->complete;
+	snap_src->complete = u.table_swap;
+
+	u.store_swap = snap_dest->store;
+	snap_dest->store = snap_src->store;
+	snap_src->store = u.store_swap;
+
+	snap_dest->store->snap = snap_dest;
+	snap_src->store->snap = snap_src;
+
+	snap_dest->ti->split_io = snap_dest->store->chunk_size;
+	snap_dest->valid = snap_src->valid;
+
+	/*
+	 * Set source invalid to ensure it receives no further I/O.
+	 */
+	snap_src->valid = 0;
+}
+
 static void snapshot_dtr(struct dm_target *ti)
 {
 #ifdef CONFIG_DM_DEBUG
 	int i;
 #endif
 	struct dm_snapshot *s = ti->private;
+	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
 
 	flush_workqueue(ksnapd);
 
+	down_read(&_origins_lock);
+	/* Check whether exception handover must be cancelled */
+	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
+	if (snap_src && snap_dest && (s == snap_src)) {
+		down_write(&snap_dest->lock);
+		snap_dest->valid = 0;
+		up_write(&snap_dest->lock);
+		DMERR("Cancelling snapshot handover.");
+	}
+	up_read(&_origins_lock);
+
 	/* Prevent further origin writes from using this snapshot. */
 	/* After this returns there can be no new kcopyd jobs. */
 	unregister_snapshot(s);
@@ -1188,9 +1355,50 @@ static void snapshot_postsuspend(struct dm_target *ti)
 	up_write(&s->lock);
 }
 
+static int snapshot_preresume(struct dm_target *ti)
+{
+	int r = 0;
+	struct dm_snapshot *s = ti->private;
+	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+
+	down_read(&_origins_lock);
+	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
+	if (snap_src && snap_dest) {
+		down_read(&snap_src->lock);
+		if (s == snap_src) {
+			DMERR("Unable to resume snapshot source until "
+			      "handover completes.");
+			r = -EINVAL;
+		} else if (!snap_src->suspended) {
+			DMERR("Unable to perform snapshot handover until "
+			      "source is suspended.");
+			r = -EINVAL;
+		}
+		up_read(&snap_src->lock);
+	}
+	up_read(&_origins_lock);
+
+	return r;
+}
+
 static void snapshot_resume(struct dm_target *ti)
 {
 	struct dm_snapshot *s = ti->private;
+	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+
+	down_read(&_origins_lock);
+	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
+	if (snap_src && snap_dest) {
+		down_write(&snap_src->lock);
+		down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
+		__handover_exceptions(snap_src, snap_dest);
+		up_write(&snap_dest->lock);
+		up_write(&snap_src->lock);
+	}
+	up_read(&_origins_lock);
+
+	/* Now we have correct chunk size, reregister */
+	reregister_snapshot(s);
 
 	down_write(&s->lock);
 	s->active = 1;
@@ -1510,6 +1718,7 @@ static struct target_type snapshot_target = {
 	.map     = snapshot_map,
 	.end_io  = snapshot_end_io,
 	.postsuspend = snapshot_postsuspend,
+	.preresume  = snapshot_preresume,
 	.resume  = snapshot_resume,
 	.status  = snapshot_status,
 	.iterate_devices = snapshot_iterate_devices,
-- 
cgit v1.2.3


From 6db4ccd6357f28c9ef7058b3bc48904c4b2ac419 Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:25 +0000
Subject: dm: trace request based remapping

This patch adds a remapping trace to request-based dm.
BIO-based dm already has the equivalent tracepoint.

For example, under this dm stack (linear LV on multipath):
  # dmsetup ls --tree -o ascii
  vg-lv0 (253:1)
   `-mpath0 (253:0)
      |- (8:160)
      |- (66:80)
      |- (65:176)
      `- (65:160)

Trace of 'dd of=/dev/vg/lv0 bs=128k count=1 oflag=direct' looks like this:

without the patch:
  dd-6674  [000]   539.727384: block_bio_queue: 253,1 WS 0 + 256 [dd]
  dd-6674  [000]   539.727392: block_remap: 253,0 WS 384 + 256 <- (253,1) 0
  dd-6674  [000]   539.727394: block_bio_queue: 253,0 WS 384 + 256 [dd]
  dd-6674  [000]   539.727405: block_getrq: 253,0 WS 384 + 256 [dd]
  dd-6674  [000]   539.727409: block_plug: [dd]
  dd-6674  [000]   539.727410: block_rq_insert: 253,0 W 0 () 384 + 256 [dd]
  dd-6674  [000]   539.727416: block_rq_issue: 253,0 W 0 () 384 + 256 [dd]
  dd-6674  [000]   539.727426: block_rq_insert: 65,176 W 0 () 384 + 256 [dd]
  dd-6674  [000]   539.727427: block_rq_issue: 65,176 W 0 () 384 + 256 [dd]
  ...

and with the patch: (the line with '**' is the trace added by this patch)
  dd-6617  [002]   162.914301: block_bio_queue: 253,1 WS 0 + 256 [dd]
  dd-6617  [002]   162.914314: block_remap: 253,0 WS 384 + 256 <- (253,1) 0
  dd-6617  [002]   162.914316: block_bio_queue: 253,0 WS 384 + 256 [dd]
  dd-6617  [002]   162.914331: block_getrq: 253,0 WS 384 + 256 [dd]
  dd-6617  [002]   162.914335: block_plug: [dd]
  dd-6617  [002]   162.914337: block_rq_insert: 253,0 W 0 () 384 + 256 [dd]
  dd-6617  [002]   162.914347: block_rq_issue: 253,0 W 0 () 384 + 256 [dd]
**dd-6617  [002]   162.914356: block_rq_remap: 65,176 W 384 + 256 <- (253,0) 384
  dd-6617  [002]   162.914358: block_rq_insert: 65,176 W 0 () 384 + 256 [dd]
  dd-6617  [002]   162.914359: block_rq_issue: 65,176 W 0 () 384 + 256 [dd]
  ...

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c254c6cf69a..40e257fadac 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1618,6 +1618,8 @@ static void map_request(struct dm_target *ti, struct request *clone,
 		break;
 	case DM_MAPIO_REMAPPED:
 		/* The target has remapped the I/O so dispatch it */
+		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
+				     blk_rq_pos(tio->orig));
 		dm_dispatch_request(clone);
 		break;
 	case DM_MAPIO_REQUEUE:
-- 
cgit v1.2.3


From 61afef614b013ee1b767cdd10325acae1db1f4d2 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 10 Dec 2009 23:52:25 +0000
Subject: dm crypt: add plain64 iv

The default plain IV is 32-bit only.

This plain64 IV provides a compatible mode for encrypted devices bigger
than 4TB.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 91e1bf91769..a93637223c8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -158,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
  * plain: the initial vector is the 32-bit little-endian version of the sector
  *        number, padded with zeros if necessary.
  *
+ * plain64: the initial vector is the 64-bit little-endian version of the sector
+ *        number, padded with zeros if necessary.
+ *
  * essiv: "encrypted sector|salt initial vector", the sector number is
  *        encrypted with the bulk cipher using a salt as key. The salt
  *        should be derived from the bulk cipher's key via hashing.
@@ -180,6 +183,15 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 	return 0;
 }
 
+static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
+				sector_t sector)
+{
+	memset(iv, 0, cc->iv_size);
+	*(u64 *)iv = cpu_to_le64(sector);
+
+	return 0;
+}
+
 /* Initialise ESSIV - compute salt but no local memory allocations */
 static int crypt_iv_essiv_init(struct crypt_config *cc)
 {
@@ -342,6 +354,10 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
 	.generator = crypt_iv_plain_gen
 };
 
+static struct crypt_iv_operations crypt_iv_plain64_ops = {
+	.generator = crypt_iv_plain64_gen
+};
+
 static struct crypt_iv_operations crypt_iv_essiv_ops = {
 	.ctr       = crypt_iv_essiv_ctr,
 	.dtr       = crypt_iv_essiv_dtr,
@@ -1063,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		cc->iv_gen_ops = NULL;
 	else if (strcmp(ivmode, "plain") == 0)
 		cc->iv_gen_ops = &crypt_iv_plain_ops;
+	else if (strcmp(ivmode, "plain64") == 0)
+		cc->iv_gen_ops = &crypt_iv_plain64_ops;
 	else if (strcmp(ivmode, "essiv") == 0)
 		cc->iv_gen_ops = &crypt_iv_essiv_ops;
 	else if (strcmp(ivmode, "benbi") == 0)
-- 
cgit v1.2.3


From 4d4471cb5c1ec426c0f24818b270dc7b3ad7e655 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:26 +0000
Subject: dm: swap target postsuspend call and setting suspended flag

This patch moves DMF_SUSPENDED flag set before postsuspend.
No one should care about the ordering, because the flag set and
the postsuspend are protected by a single lock, md->suspend_lock,
and all strict flag-checkers take the lock.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Mike Anderson <andmike@linux.vnet.ibm.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 40e257fadac..f2b993c4335 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2550,10 +2550,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 * requests are being added to md->deferred list.
 	 */
 
-	dm_table_postsuspend_targets(map);
-
 	set_bit(DMF_SUSPENDED, &md->flags);
 
+	dm_table_postsuspend_targets(map);
+
 out:
 	dm_table_put(map);
 
-- 
cgit v1.2.3


From 4f186f8bbfa92bf1a2b39f7a8674348bfdba9437 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:26 +0000
Subject: dm: rename dm_suspended to dm_suspended_md

This patch renames dm_suspended() to dm_suspended_md() and
keeps it internal to dm.
No functional change.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Mike Anderson <andmike@linux.vnet.ibm.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c         |  8 ++++----
 drivers/md/dm-sysfs.c         |  2 +-
 drivers/md/dm.c               | 12 ++++++------
 drivers/md/dm.h               |  5 +++++
 include/linux/device-mapper.h |  1 -
 5 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 63fd4de25bd..1d669322b27 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -579,7 +579,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 	param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
 			  DM_ACTIVE_PRESENT_FLAG);
 
-	if (dm_suspended(md))
+	if (dm_suspended_md(md))
 		param->flags |= DM_SUSPEND_FLAG;
 
 	param->dev = huge_encode_dev(disk_devt(disk));
@@ -839,7 +839,7 @@ static int do_suspend(struct dm_ioctl *param)
 	if (param->flags & DM_NOFLUSH_FLAG)
 		suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
 
-	if (!dm_suspended(md))
+	if (!dm_suspended_md(md))
 		r = dm_suspend(md, suspend_flags);
 
 	if (!r)
@@ -881,7 +881,7 @@ static int do_resume(struct dm_ioctl *param)
 			suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
 		if (param->flags & DM_NOFLUSH_FLAG)
 			suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
-		if (!dm_suspended(md))
+		if (!dm_suspended_md(md))
 			dm_suspend(md, suspend_flags);
 
 		old_map = dm_swap_table(md, new_map);
@@ -897,7 +897,7 @@ static int do_resume(struct dm_ioctl *param)
 			set_disk_ro(dm_disk(md), 1);
 	}
 
-	if (dm_suspended(md))
+	if (dm_suspended_md(md))
 		r = dm_resume(md);
 
 	if (old_map)
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index b000de38c99..f53392df7b9 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
 
 static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
 {
-	sprintf(buf, "%d\n", dm_suspended(md));
+	sprintf(buf, "%d\n", dm_suspended_md(md));
 
 	return strlen(buf);
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f2b993c4335..e0702bf3793 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -415,7 +415,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 
 	tgt = dm_table_get_target(map, 0);
 
-	if (dm_suspended(md)) {
+	if (dm_suspended_md(md)) {
 		r = -EAGAIN;
 		goto out;
 	}
@@ -2182,7 +2182,7 @@ void dm_put(struct mapped_device *md)
 			    MINOR(disk_devt(dm_disk(md))));
 		set_bit(DMF_FREEING, &md->flags);
 		spin_unlock(&_minor_lock);
-		if (!dm_suspended(md)) {
+		if (!dm_suspended_md(md)) {
 			dm_table_presuspend_targets(map);
 			dm_table_postsuspend_targets(map);
 		}
@@ -2381,7 +2381,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	mutex_lock(&md->suspend_lock);
 
 	/* device must be suspended */
-	if (!dm_suspended(md))
+	if (!dm_suspended_md(md))
 		goto out;
 
 	r = dm_calculate_queue_limits(table, &limits);
@@ -2461,7 +2461,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	mutex_lock(&md->suspend_lock);
 
-	if (dm_suspended(md)) {
+	if (dm_suspended_md(md)) {
 		r = -EINVAL;
 		goto out_unlock;
 	}
@@ -2568,7 +2568,7 @@ int dm_resume(struct mapped_device *md)
 	struct dm_table *map = NULL;
 
 	mutex_lock(&md->suspend_lock);
-	if (!dm_suspended(md))
+	if (!dm_suspended_md(md))
 		goto out;
 
 	map = dm_get_live_table(md);
@@ -2679,7 +2679,7 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
 	return md;
 }
 
-int dm_suspended(struct mapped_device *md)
+int dm_suspended_md(struct mapped_device *md)
 {
 	return test_bit(DMF_SUSPENDED, &md->flags);
 }
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 604a5f2a238..8dadaa5bc39 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -93,6 +93,11 @@ int dm_split_args(int *argc, char ***argvp, char *input);
  */
 int dm_deleting_md(struct mapped_device *md);
 
+/*
+ * Is this mapped_device suspended?
+ */
+int dm_suspended_md(struct mapped_device *md);
+
 /*
  * The device-mapper can be driven through one of two interfaces;
  * ioctl or filesystem, depending which patch you have applied.
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index b9c6c8ca11b..fca0d31bbf2 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -235,7 +235,6 @@ void dm_uevent_add(struct mapped_device *md, struct list_head *elist);
 const char *dm_device_name(struct mapped_device *md);
 int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid);
 struct gendisk *dm_disk(struct mapped_device *md);
-int dm_suspended(struct mapped_device *md);
 int dm_noflush_suspending(struct dm_target *ti);
 union map_info *dm_get_mapinfo(struct bio *bio);
 union map_info *dm_get_rq_mapinfo(struct request *rq);
-- 
cgit v1.2.3


From 64dbce580d5a7e89e8de20b91f80c7267cdad91d Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:27 +0000
Subject: dm: export suspended state to targets

This patch adds the exported dm_suspended() function so that targets
can check whether or not they are suspended.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Mike Anderson <andmike@linux.vnet.ibm.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c               | 11 +++++++++++
 include/linux/device-mapper.h |  1 +
 2 files changed, 12 insertions(+)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e0702bf3793..3167480b532 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2684,6 +2684,17 @@ int dm_suspended_md(struct mapped_device *md)
 	return test_bit(DMF_SUSPENDED, &md->flags);
 }
 
+int dm_suspended(struct dm_target *ti)
+{
+	struct mapped_device *md = dm_table_get_md(ti->table);
+	int r = dm_suspended_md(md);
+
+	dm_put(md);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_suspended);
+
 int dm_noflush_suspending(struct dm_target *ti)
 {
 	struct mapped_device *md = dm_table_get_md(ti->table);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index fca0d31bbf2..d4c9c0b88ad 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -235,6 +235,7 @@ void dm_uevent_add(struct mapped_device *md, struct list_head *elist);
 const char *dm_device_name(struct mapped_device *md);
 int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid);
 struct gendisk *dm_disk(struct mapped_device *md);
+int dm_suspended(struct dm_target *ti);
 int dm_noflush_suspending(struct dm_target *ti);
 union map_info *dm_get_mapinfo(struct bio *bio);
 union map_info *dm_get_rq_mapinfo(struct request *rq);
-- 
cgit v1.2.3


From c2f3d24b783fda20618b73d65678eb5dfae31a5d Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 10 Dec 2009 23:52:27 +0000
Subject: dm mpath: reject messages when device is suspended

This patch rejects messages that can generate I/O while the device
itself is suspended.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Cc: Mike Anderson <andmike@linux.vnet.ibm.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 45d9bf14cc4..e81345a1d08 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1433,6 +1433,11 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 		goto out;
 	}
 
+	if (dm_suspended(ti)) {
+		r = -EBUSY;
+		goto out;
+	}
+
 	if (argc == 1) {
 		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
 			r = queue_if_no_path(m, 1, 0);
-- 
cgit v1.2.3


From 9eaae8ffbc340fc034fed1e5d0dc9ca0e943f817 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:28 +0000
Subject: dm snapshot: make bio optional in __origin_write

To support the merging of snapshots back into their origin we need
to trigger exceptions in other snapshots not being merged without
any incoming bio on the origin device.  The bio parameter to
__origin_write() becomes optional and the sector needs supplying
separately.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index b5b9118c063..5e553c50c21 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1467,7 +1467,19 @@ static int snapshot_iterate_devices(struct dm_target *ti,
 /*-----------------------------------------------------------------
  * Origin methods
  *---------------------------------------------------------------*/
-static int __origin_write(struct list_head *snapshots, struct bio *bio)
+
+/*
+ * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
+ * supplied bio was ignored.  The caller may submit it immediately.
+ * (No remapping actually occurs as the origin is always a direct linear
+ * map.)
+ *
+ * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
+ * and any supplied bio is added to a list to be submitted once all
+ * the necessary exceptions exist.
+ */
+static int __origin_write(struct list_head *snapshots, sector_t sector,
+			  struct bio *bio)
 {
 	int r = DM_MAPIO_REMAPPED, first = 0;
 	struct dm_snapshot *snap;
@@ -1486,14 +1498,14 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
 			goto next_snapshot;
 
 		/* Nothing to do if writing beyond end of snapshot */
-		if (bio->bi_sector >= dm_table_get_size(snap->ti->table))
+		if (sector >= dm_table_get_size(snap->ti->table))
 			goto next_snapshot;
 
 		/*
 		 * Remember, different snapshots can have
 		 * different chunk sizes.
 		 */
-		chunk = sector_to_chunk(snap->store, bio->bi_sector);
+		chunk = sector_to_chunk(snap->store, sector);
 
 		/*
 		 * Check exception table to see if block
@@ -1543,7 +1555,8 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
 				first = 1;
 			}
 
-			bio_list_add(&primary_pe->origin_bios, bio);
+			if (bio)
+				bio_list_add(&primary_pe->origin_bios, bio);
 
 			r = DM_MAPIO_SUBMITTED;
 		}
@@ -1599,7 +1612,7 @@ static int do_origin(struct dm_dev *origin, struct bio *bio)
 	down_read(&_origins_lock);
 	o = __lookup_origin(origin->bdev);
 	if (o)
-		r = __origin_write(&o->snapshots, bio);
+		r = __origin_write(&o->snapshots, bio->bi_sector, bio);
 	up_read(&_origins_lock);
 
 	return r;
-- 
cgit v1.2.3


From 615d1eb9cad9b34ed17c18c604254e8db533ac6f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 10 Dec 2009 23:52:29 +0000
Subject: dm snapshot: create function for chunk_is_tracked wait

Move the __chunk_is_tracked() loop into a separate function as we will
also need to call it from the write path in the rare case of conflicting
writes to the same chunk.

Originally introduced in commit a8d41b59f3f5a7ac19452ef442a7fc1b5fa17366
("dm snapshot: fix race during exception creation").

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 5e553c50c21..288994ee714 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -233,6 +233,16 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
 	return found;
 }
 
+/*
+ * This conflicting I/O is extremely improbable in the caller,
+ * so msleep(1) is sufficient and there is no need for a wait queue.
+ */
+static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
+{
+	while (__chunk_is_tracked(s, chunk))
+		msleep(1);
+}
+
 /*
  * One of these per registered origin, held in the snapshot_origins hash
  */
@@ -1102,12 +1112,8 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 		goto out;
 	}
 
-	/*
-	 * Check for conflicting reads. This is extremely improbable,
-	 * so msleep(1) is sufficient and there is no need for a wait queue.
-	 */
-	while (__chunk_is_tracked(s, pe->e.old_chunk))
-		msleep(1);
+	/* Check for conflicting reads */
+	__check_for_conflicting_io(s, pe->e.old_chunk);
 
 	/*
 	 * Add a proper exception, and remove the
-- 
cgit v1.2.3


From 4454a6216f75a9ef8c4bd0a65e34b101f725ef1e Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:29 +0000
Subject: dm exception store: add merge specific methods

Add functions that decide how many consecutive chunks of snapshot to
merge back into the origin next and to update the metadata afterwards.

prepare_merge provides a pointer to the most recent still-to-be-merged
chunk and returns how many previous ones are consecutive and can be
processed together.

commit_merge removes the nr_merged most-recent chunks permanently from
the exception store.  The number must not exceed that returned by
prepare_merge.

Introduce NUM_SNAPSHOT_HDR_CHUNKS to show where the snapshot header
chunk is accounted for.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.h |  17 ++++++
 drivers/md/dm-snap-persistent.c | 114 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index bb8874653de..c53e08935b4 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -74,6 +74,23 @@ struct dm_exception_store_type {
 				  void (*callback) (void *, int success),
 				  void *callback_context);
 
+	/*
+	 * Returns 0 if the exception store is empty.
+	 *
+	 * If there are exceptions still to be merged, sets
+	 * *last_old_chunk and *last_new_chunk to the most recent
+	 * still-to-be-merged chunk and returns the number of
+	 * consecutive previous ones.
+	 */
+	int (*prepare_merge) (struct dm_exception_store *store,
+			      chunk_t *last_old_chunk, chunk_t *last_new_chunk);
+
+	/*
+	 * Clear the last n exceptions.
+	 * nr_merged must be <= the value returned by prepare_merge.
+	 */
+	int (*commit_merge) (struct dm_exception_store *store, int nr_merged);
+
 	/*
 	 * The snapshot is invalid, note this in the metadata.
 	 */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 157999ebd23..7d08879689a 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -55,6 +55,8 @@
  */
 #define SNAPSHOT_DISK_VERSION 1
 
+#define NUM_SNAPSHOT_HDR_CHUNKS 1
+
 struct disk_header {
 	uint32_t magic;
 
@@ -120,7 +122,22 @@ struct pstore {
 
 	/*
 	 * The next free chunk for an exception.
+	 *
+	 * When creating exceptions, all the chunks here and above are
+	 * free.  It holds the next chunk to be allocated.  On rare
+	 * occasions (e.g. after a system crash) holes can be left in
+	 * the exception store because chunks can be committed out of
+	 * order.
+	 *
+	 * When merging exceptions, it does not necessarily mean all the
+	 * chunks here and above are free.  It holds the value it would
+	 * have held if all chunks had been committed in order of
+	 * allocation.  Consequently the value may occasionally be
+	 * slightly too low, but since it's only used for 'status' and
+	 * it can never reach its minimum value too early this doesn't
+	 * matter.
 	 */
+
 	chunk_t next_free;
 
 	/*
@@ -409,6 +426,15 @@ static void write_exception(struct pstore *ps,
 	e->new_chunk = cpu_to_le64(de->new_chunk);
 }
 
+static void clear_exception(struct pstore *ps, uint32_t index)
+{
+	struct disk_exception *e = get_exception(ps, index);
+
+	/* clear it */
+	e->old_chunk = 0;
+	e->new_chunk = 0;
+}
+
 /*
  * Registers the exceptions that are present in the current area.
  * 'full' is filled in to indicate if the area has been
@@ -505,7 +531,8 @@ static void persistent_usage(struct dm_exception_store *store,
 	 * Then there are (ps->current_area + 1) metadata chunks, each one
 	 * separated from the next by ps->exceptions_per_area data chunks.
 	 */
-	*metadata_sectors = (ps->current_area + 2) * store->chunk_size;
+	*metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) *
+			    store->chunk_size;
 }
 
 static void persistent_dtr(struct dm_exception_store *store)
@@ -680,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store,
 	ps->callback_count = 0;
 }
 
+static int persistent_prepare_merge(struct dm_exception_store *store,
+				    chunk_t *last_old_chunk,
+				    chunk_t *last_new_chunk)
+{
+	struct pstore *ps = get_info(store);
+	struct disk_exception de;
+	int nr_consecutive;
+	int r;
+
+	/*
+	 * When current area is empty, move back to preceding area.
+	 */
+	if (!ps->current_committed) {
+		/*
+		 * Have we finished?
+		 */
+		if (!ps->current_area)
+			return 0;
+
+		ps->current_area--;
+		r = area_io(ps, READ);
+		if (r < 0)
+			return r;
+		ps->current_committed = ps->exceptions_per_area;
+	}
+
+	read_exception(ps, ps->current_committed - 1, &de);
+	*last_old_chunk = de.old_chunk;
+	*last_new_chunk = de.new_chunk;
+
+	/*
+	 * Find number of consecutive chunks within the current area,
+	 * working backwards.
+	 */
+	for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
+	     nr_consecutive++) {
+		read_exception(ps, ps->current_committed - 1 - nr_consecutive,
+			       &de);
+		if (de.old_chunk != *last_old_chunk - nr_consecutive ||
+		    de.new_chunk != *last_new_chunk - nr_consecutive)
+			break;
+	}
+
+	return nr_consecutive;
+}
+
+static int persistent_commit_merge(struct dm_exception_store *store,
+				   int nr_merged)
+{
+	int r, i;
+	struct pstore *ps = get_info(store);
+
+	BUG_ON(nr_merged > ps->current_committed);
+
+	for (i = 0; i < nr_merged; i++)
+		clear_exception(ps, ps->current_committed - 1 - i);
+
+	r = area_io(ps, WRITE);
+	if (r < 0)
+		return r;
+
+	ps->current_committed -= nr_merged;
+
+	/*
+	 * At this stage, only persistent_usage() uses ps->next_free, so
+	 * we make no attempt to keep ps->next_free strictly accurate
+	 * as exceptions may have been committed out-of-order originally.
+	 * Once a snapshot has become merging, we set it to the value it
+	 * would have held had all the exceptions been committed in order.
+	 *
+	 * ps->current_area does not get reduced by prepare_merge() until
+	 * after commit_merge() has removed the nr_merged previous exceptions.
+	 */
+	ps->next_free = (area_location(ps, ps->current_area) - 1) +
+			(ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS;
+
+	return 0;
+}
+
 static void persistent_drop_snapshot(struct dm_exception_store *store)
 {
 	struct pstore *ps = get_info(store);
@@ -705,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store,
 	ps->area = NULL;
 	ps->zero_area = NULL;
 	ps->header_area = NULL;
-	ps->next_free = 2;	/* skipping the header and first area */
+	ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */
 	ps->current_committed = 0;
 
 	ps->callback_count = 0;
@@ -748,6 +854,8 @@ static struct dm_exception_store_type _persistent_type = {
 	.read_metadata = persistent_read_metadata,
 	.prepare_exception = persistent_prepare_exception,
 	.commit_exception = persistent_commit_exception,
+	.prepare_merge = persistent_prepare_merge,
+	.commit_merge = persistent_commit_merge,
 	.drop_snapshot = persistent_drop_snapshot,
 	.usage = persistent_usage,
 	.status = persistent_status,
@@ -761,6 +869,8 @@ static struct dm_exception_store_type _persistent_compat_type = {
 	.read_metadata = persistent_read_metadata,
 	.prepare_exception = persistent_prepare_exception,
 	.commit_exception = persistent_commit_exception,
+	.prepare_merge = persistent_prepare_merge,
+	.commit_merge = persistent_commit_merge,
 	.drop_snapshot = persistent_drop_snapshot,
 	.usage = persistent_usage,
 	.status = persistent_status,
-- 
cgit v1.2.3


From d698aa4500aa3ca9559142060caf0f79da998744 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:30 +0000
Subject: dm snapshot: add merge target

The snapshot-merge target allows a snapshot to be merged back into the
snapshot's origin device.

One anticipated use of snapshot merging is the rollback of filesystems
to back out problematic system upgrades.

This patch adds snapshot-merge target management to both
dm_snapshot_init() and dm_snapshot_exit().  As an initial place-holder,
snapshot-merge is identical to the snapshot target.  Documentation is
provided.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/snapshot.txt | 60 +++++++++++++++++++++++++++++---
 drivers/md/dm-snap.c                     | 53 +++++++++++++++++++++-------
 2 files changed, 96 insertions(+), 17 deletions(-)

diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt
index a5009c8300f..e3a77b21513 100644
--- a/Documentation/device-mapper/snapshot.txt
+++ b/Documentation/device-mapper/snapshot.txt
@@ -8,13 +8,19 @@ the block device which are also writable without interfering with the
 original content;
 *) To create device "forks", i.e. multiple different versions of the
 same data stream.
+*) To merge a snapshot of a block device back into the snapshot's origin
+device.
 
+In the first two cases, dm copies only the chunks of data that get
+changed and uses a separate copy-on-write (COW) block device for
+storage.
 
-In both cases, dm copies only the chunks of data that get changed and
-uses a separate copy-on-write (COW) block device for storage.
+For snapshot merge the contents of the COW storage are merged back into
+the origin device.
 
 
-There are two dm targets available: snapshot and snapshot-origin.
+There are three dm targets available:
+snapshot, snapshot-origin, and snapshot-merge.
 
 *) snapshot-origin <origin>
 
@@ -40,8 +46,25 @@ The difference is that for transient snapshots less metadata must be
 saved on disk - they can be kept in memory by the kernel.
 
 
-How this is used by LVM2
-========================
+* snapshot-merge <origin> <COW device> <persistent> <chunksize>
+
+takes the same table arguments as the snapshot target except it only
+works with persistent snapshots.  This target assumes the role of the
+"snapshot-origin" target and must not be loaded if the "snapshot-origin"
+is still present for <origin>.
+
+Creates a merging snapshot that takes control of the changed chunks
+stored in the <COW device> of an existing snapshot, through a handover
+procedure, and merges these chunks back into the <origin>.  Once merging
+has started (in the background) the <origin> may be opened and the merge
+will continue while I/O is flowing to it.  Changes to the <origin> are
+deferred until the merging snapshot's corresponding chunk(s) have been
+merged.  Once merging has started the snapshot device, associated with
+the "snapshot" target, will return -EIO when accessed.
+
+
+How snapshot is used by LVM2
+============================
 When you create the first LVM2 snapshot of a volume, four dm devices are used:
 
 1) a device containing the original mapping table of the source volume;
@@ -72,3 +95,30 @@ brw-------  1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
 brw-------  1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
 brw-------  1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
 
+
+How snapshot-merge is used by LVM2
+==================================
+A merging snapshot assumes the role of the "snapshot-origin" while
+merging.  As such the "snapshot-origin" is replaced with
+"snapshot-merge".  The "-real" device is not changed and the "-cow"
+device is renamed to <origin name>-cow to aid LVM2's cleanup of the
+merging snapshot after it completes.  The "snapshot" that hands over its
+COW device to the "snapshot-merge" is deactivated (unless using lvchange
+--refresh); but if it is left active it will simply return I/O errors.
+
+A snapshot will merge into its origin with the following command:
+
+lvconvert --merge volumeGroup/snap
+
+we'll now have this situation:
+
+# dmsetup table|grep volumeGroup
+
+volumeGroup-base-real: 0 2097152 linear 8:19 384
+volumeGroup-base-cow: 0 204800 linear 8:19 2097536
+volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
+
+# ls -lL /dev/mapper/volumeGroup-*
+brw-------  1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
+brw-------  1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
+brw-------  1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 288994ee714..446827f9823 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -25,6 +25,11 @@
 
 #define DM_MSG_PREFIX "snapshots"
 
+static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
+
+#define dm_target_is_snapshot_merge(ti) \
+	((ti)->type->name == dm_snapshot_merge_target_name)
+
 /*
  * The percentage increment we will wake up users at
  */
@@ -1743,6 +1748,21 @@ static struct target_type snapshot_target = {
 	.iterate_devices = snapshot_iterate_devices,
 };
 
+static struct target_type merge_target = {
+	.name    = dm_snapshot_merge_target_name,
+	.version = {1, 0, 0},
+	.module  = THIS_MODULE,
+	.ctr     = snapshot_ctr,
+	.dtr     = snapshot_dtr,
+	.map     = snapshot_map,
+	.end_io  = snapshot_end_io,
+	.postsuspend = snapshot_postsuspend,
+	.preresume  = snapshot_preresume,
+	.resume  = snapshot_resume,
+	.status  = snapshot_status,
+	.iterate_devices = snapshot_iterate_devices,
+};
+
 static int __init dm_snapshot_init(void)
 {
 	int r;
@@ -1754,7 +1774,7 @@ static int __init dm_snapshot_init(void)
 	}
 
 	r = dm_register_target(&snapshot_target);
-	if (r) {
+	if (r < 0) {
 		DMERR("snapshot target register failed %d", r);
 		goto bad_register_snapshot_target;
 	}
@@ -1762,34 +1782,40 @@ static int __init dm_snapshot_init(void)
 	r = dm_register_target(&origin_target);
 	if (r < 0) {
 		DMERR("Origin target register failed %d", r);
-		goto bad1;
+		goto bad_register_origin_target;
+	}
+
+	r = dm_register_target(&merge_target);
+	if (r < 0) {
+		DMERR("Merge target register failed %d", r);
+		goto bad_register_merge_target;
 	}
 
 	r = init_origin_hash();
 	if (r) {
 		DMERR("init_origin_hash failed.");
-		goto bad2;
+		goto bad_origin_hash;
 	}
 
 	exception_cache = KMEM_CACHE(dm_exception, 0);
 	if (!exception_cache) {
 		DMERR("Couldn't create exception cache.");
 		r = -ENOMEM;
-		goto bad3;
+		goto bad_exception_cache;
 	}
 
 	pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
 	if (!pending_cache) {
 		DMERR("Couldn't create pending cache.");
 		r = -ENOMEM;
-		goto bad4;
+		goto bad_pending_cache;
 	}
 
 	tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
 	if (!tracked_chunk_cache) {
 		DMERR("Couldn't create cache to track chunks in use.");
 		r = -ENOMEM;
-		goto bad5;
+		goto bad_tracked_chunk_cache;
 	}
 
 	ksnapd = create_singlethread_workqueue("ksnapd");
@@ -1803,19 +1829,21 @@ static int __init dm_snapshot_init(void)
 
 bad_pending_pool:
 	kmem_cache_destroy(tracked_chunk_cache);
-bad5:
+bad_tracked_chunk_cache:
 	kmem_cache_destroy(pending_cache);
-bad4:
+bad_pending_cache:
 	kmem_cache_destroy(exception_cache);
-bad3:
+bad_exception_cache:
 	exit_origin_hash();
-bad2:
+bad_origin_hash:
+	dm_unregister_target(&merge_target);
+bad_register_merge_target:
 	dm_unregister_target(&origin_target);
-bad1:
+bad_register_origin_target:
 	dm_unregister_target(&snapshot_target);
-
 bad_register_snapshot_target:
 	dm_exception_store_exit();
+
 	return r;
 }
 
@@ -1825,6 +1853,7 @@ static void __exit dm_snapshot_exit(void)
 
 	dm_unregister_target(&snapshot_target);
 	dm_unregister_target(&origin_target);
+	dm_unregister_target(&merge_target);
 
 	exit_origin_hash();
 	kmem_cache_destroy(pending_cache);
-- 
cgit v1.2.3


From 515ad66cc4c82f210d726340349c8f7c1ec6b125 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:30 +0000
Subject: dm snapshot: rework writing to origin

To track the completion of exceptions relating to the same location on
the device, the current code selects one exception as primary_pe, links
the other exceptions to it and uses reference counting to wait until all
the reallocations are complete.

It is considered too complicated to extend this code to handle the new
snapshot-merge target, where sets of non-overlapping chunks would also
need to become linked.

Instead, a simpler (but less efficient) approach is taken.  Bios are
linked to one exception.  When it completes, bios are simply retried,
and if other related exceptions are still outstanding, they'll get
queued again to wait for another one.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 155 ++++++++++++++++-----------------------------------
 1 file changed, 49 insertions(+), 106 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 446827f9823..c01e0dafec3 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -142,28 +142,6 @@ struct dm_snap_pending_exception {
 	struct bio_list origin_bios;
 	struct bio_list snapshot_bios;
 
-	/*
-	 * Short-term queue of pending exceptions prior to submission.
-	 */
-	struct list_head list;
-
-	/*
-	 * The primary pending_exception is the one that holds
-	 * the ref_count and the list of origin_bios for a
-	 * group of pending_exceptions.  It is always last to get freed.
-	 * These fields get set up when writing to the origin.
-	 */
-	struct dm_snap_pending_exception *primary_pe;
-
-	/*
-	 * Number of pending_exceptions processing this chunk.
-	 * When this drops to zero we must complete the origin bios.
-	 * If incrementing or decrementing this, hold pe->snap->lock for
-	 * the sibling concerned and not pe->primary_pe->snap->lock unless
-	 * they are the same.
-	 */
-	atomic_t ref_count;
-
 	/* Pointer back to snapshot context */
 	struct dm_snapshot *snap;
 
@@ -1019,6 +997,26 @@ static void flush_queued_bios(struct work_struct *work)
 	flush_bios(queued_bios);
 }
 
+static int do_origin(struct dm_dev *origin, struct bio *bio);
+
+/*
+ * Flush a list of buffers.
+ */
+static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
+{
+	struct bio *n;
+	int r;
+
+	while (bio) {
+		n = bio->bi_next;
+		bio->bi_next = NULL;
+		r = do_origin(s->origin, bio);
+		if (r == DM_MAPIO_REMAPPED)
+			generic_make_request(bio);
+		bio = n;
+	}
+}
+
 /*
  * Error a list of buffers.
  */
@@ -1052,39 +1050,6 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
 	dm_table_event(s->ti->table);
 }
 
-static void get_pending_exception(struct dm_snap_pending_exception *pe)
-{
-	atomic_inc(&pe->ref_count);
-}
-
-static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
-{
-	struct dm_snap_pending_exception *primary_pe;
-	struct bio *origin_bios = NULL;
-
-	primary_pe = pe->primary_pe;
-
-	/*
-	 * If this pe is involved in a write to the origin and
-	 * it is the last sibling to complete then release
-	 * the bios for the original write to the origin.
-	 */
-	if (primary_pe &&
-	    atomic_dec_and_test(&primary_pe->ref_count)) {
-		origin_bios = bio_list_get(&primary_pe->origin_bios);
-		free_pending_exception(primary_pe);
-	}
-
-	/*
-	 * Free the pe if it's not linked to an origin write or if
-	 * it's not itself a primary pe.
-	 */
-	if (!primary_pe || primary_pe != pe)
-		free_pending_exception(pe);
-
-	return origin_bios;
-}
-
 static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 {
 	struct dm_exception *e;
@@ -1129,7 +1094,8 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
  out:
 	dm_remove_exception(&pe->e);
 	snapshot_bios = bio_list_get(&pe->snapshot_bios);
-	origin_bios = put_pending_exception(pe);
+	origin_bios = bio_list_get(&pe->origin_bios);
+	free_pending_exception(pe);
 
 	up_write(&s->lock);
 
@@ -1139,7 +1105,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 	else
 		flush_bios(snapshot_bios);
 
-	flush_bios(origin_bios);
+	retry_origin_bios(s, origin_bios);
 }
 
 static void commit_callback(void *context, int success)
@@ -1226,8 +1192,6 @@ __find_pending_exception(struct dm_snapshot *s,
 	pe->e.old_chunk = chunk;
 	bio_list_init(&pe->origin_bios);
 	bio_list_init(&pe->snapshot_bios);
-	pe->primary_pe = NULL;
-	atomic_set(&pe->ref_count, 0);
 	pe->started = 0;
 
 	if (s->store->type->prepare_exception(s->store, &pe->e)) {
@@ -1235,7 +1199,6 @@ __find_pending_exception(struct dm_snapshot *s,
 		return NULL;
 	}
 
-	get_pending_exception(pe);
 	dm_insert_exception(&s->pending, &pe->e);
 
 	return pe;
@@ -1492,16 +1455,16 @@ static int snapshot_iterate_devices(struct dm_target *ti,
 static int __origin_write(struct list_head *snapshots, sector_t sector,
 			  struct bio *bio)
 {
-	int r = DM_MAPIO_REMAPPED, first = 0;
+	int r = DM_MAPIO_REMAPPED;
 	struct dm_snapshot *snap;
 	struct dm_exception *e;
-	struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL;
+	struct dm_snap_pending_exception *pe;
+	struct dm_snap_pending_exception *pe_to_start_now = NULL;
+	struct dm_snap_pending_exception *pe_to_start_last = NULL;
 	chunk_t chunk;
-	LIST_HEAD(pe_queue);
 
 	/* Do all the snapshots on this origin */
 	list_for_each_entry (snap, snapshots, list) {
-
 		down_write(&snap->lock);
 
 		/* Only deal with valid and active snapshots */
@@ -1522,9 +1485,6 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
 		 * Check exception table to see if block
 		 * is already remapped in this snapshot
 		 * and trigger an exception if not.
-		 *
-		 * ref_count is initialised to 1 so pending_complete()
-		 * won't destroy the primary_pe while we're inside this loop.
 		 */
 		e = dm_lookup_exception(&snap->complete, chunk);
 		if (e)
@@ -1554,60 +1514,43 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
 			}
 		}
 
-		if (!primary_pe) {
-			/*
-			 * Either every pe here has same
-			 * primary_pe or none has one yet.
-			 */
-			if (pe->primary_pe)
-				primary_pe = pe->primary_pe;
-			else {
-				primary_pe = pe;
-				first = 1;
-			}
-
-			if (bio)
-				bio_list_add(&primary_pe->origin_bios, bio);
+		r = DM_MAPIO_SUBMITTED;
 
-			r = DM_MAPIO_SUBMITTED;
-		}
+		/*
+		 * If an origin bio was supplied, queue it to wait for the
+		 * completion of this exception, and start this one last,
+		 * at the end of the function.
+		 */
+		if (bio) {
+			bio_list_add(&pe->origin_bios, bio);
+			bio = NULL;
 
-		if (!pe->primary_pe) {
-			pe->primary_pe = primary_pe;
-			get_pending_exception(primary_pe);
+			if (!pe->started) {
+				pe->started = 1;
+				pe_to_start_last = pe;
+			}
 		}
 
 		if (!pe->started) {
 			pe->started = 1;
-			list_add_tail(&pe->list, &pe_queue);
+			pe_to_start_now = pe;
 		}
 
  next_snapshot:
 		up_write(&snap->lock);
-	}
 
-	if (!primary_pe)
-		return r;
-
-	/*
-	 * If this is the first time we're processing this chunk and
-	 * ref_count is now 1 it means all the pending exceptions
-	 * got completed while we were in the loop above, so it falls to
-	 * us here to remove the primary_pe and submit any origin_bios.
-	 */
-
-	if (first && atomic_dec_and_test(&primary_pe->ref_count)) {
-		flush_bios(bio_list_get(&primary_pe->origin_bios));
-		free_pending_exception(primary_pe);
-		/* If we got here, pe_queue is necessarily empty. */
-		return r;
+		if (pe_to_start_now) {
+			start_copy(pe_to_start_now);
+			pe_to_start_now = NULL;
+		}
 	}
 
 	/*
-	 * Now that we have a complete pe list we can start the copying.
+	 * Submit the exception against which the bio is queued last,
+	 * to give the other exceptions a head start.
 	 */
-	list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
-		start_copy(pe);
+	if (pe_to_start_last)
+		start_copy(pe_to_start_last);
 
 	return r;
 }
-- 
cgit v1.2.3


From 3452c2a1eb5b93c1b9fb0d22bd5b07c0cee4dc29 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:31 +0000
Subject: dm snapshot: avoid allocating exceptions in merge

The snapshot-merge target should not allocate new exceptions because the
intent is to merge all of its exceptions as quickly and safely as
possible.

This patch introduces the snapshot-merge mapping function and updates
__origin_write() so that it doesn't allocate exceptions on any snapshots
that are being merged.

If a write request to a merging snapshot device is to be dispatched
directly to the origin (because the chunk is not remapped or was already
merged), snapshot_merge_map() must make exceptions in other snapshots so
calls do_origin().

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c01e0dafec3..59d9ef6f505 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1308,6 +1308,54 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	return r;
 }
 
+/*
+ * A snapshot-merge target behaves like a combination of a snapshot
+ * target and a snapshot-origin target.  It only generates new
+ * exceptions in other snapshots and not in the one that is being
+ * merged.
+ *
+ * For each chunk, if there is an existing exception, it is used to
+ * redirect I/O to the cow device.  Otherwise I/O is sent to the origin,
+ * which in turn might generate exceptions in other snapshots.
+ */
+static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
+			      union map_info *map_context)
+{
+	struct dm_exception *e;
+	struct dm_snapshot *s = ti->private;
+	int r = DM_MAPIO_REMAPPED;
+	chunk_t chunk;
+
+	chunk = sector_to_chunk(s->store, bio->bi_sector);
+
+	down_read(&s->lock);
+
+	/* Full snapshots are not usable */
+	if (!s->valid) {
+		r = -EIO;
+		goto out_unlock;
+	}
+
+	/* If the block is already remapped - use that */
+	e = dm_lookup_exception(&s->complete, chunk);
+	if (e) {
+		remap_exception(s, e, bio, chunk);
+		goto out_unlock;
+	}
+
+	bio->bi_bdev = s->origin->bdev;
+
+	if (bio_rw(bio) == WRITE) {
+		up_read(&s->lock);
+		return do_origin(s->origin, bio);
+	}
+
+out_unlock:
+	up_read(&s->lock);
+
+	return r;
+}
+
 static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
 			   int error, union map_info *map_context)
 {
@@ -1465,6 +1513,13 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
 
 	/* Do all the snapshots on this origin */
 	list_for_each_entry (snap, snapshots, list) {
+		/*
+		 * Don't make new exceptions in a merging snapshot
+		 * because it has effectively been deleted
+		 */
+		if (dm_target_is_snapshot_merge(snap->ti))
+			continue;
+
 		down_write(&snap->lock);
 
 		/* Only deal with valid and active snapshots */
@@ -1697,7 +1752,7 @@ static struct target_type merge_target = {
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
-	.map     = snapshot_map,
+	.map     = snapshot_merge_map,
 	.end_io  = snapshot_end_io,
 	.postsuspend = snapshot_postsuspend,
 	.preresume  = snapshot_preresume,
-- 
cgit v1.2.3


From 10b8106a70433e14153469ebbdd189776500e238 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 10 Dec 2009 23:52:31 +0000
Subject: dm snapshot: support barriers in snapshot merge target

Sets num_flush_requests=2 to support flushing both the origin and cow
devices used by the snapshot-merge target.

Also, snapshot_ctr() now gets the origin device using FMODE_WRITE if the
target is snapshot-merge (which writes to the origin device).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 59d9ef6f505..23e3396e43b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -709,7 +709,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	int i;
 	int r = -EINVAL;
 	char *origin_path, *cow_path;
-	unsigned args_used;
+	unsigned args_used, num_flush_requests = 1;
+	fmode_t origin_mode = FMODE_READ;
 
 	if (argc != 4) {
 		ti->error = "requires exactly 4 arguments";
@@ -717,6 +718,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+	if (dm_target_is_snapshot_merge(ti)) {
+		num_flush_requests = 2;
+		origin_mode = FMODE_WRITE;
+	}
+
 	origin_path = argv[0];
 	argv++;
 	argc--;
@@ -750,7 +756,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	argv += args_used;
 	argc -= args_used;
 
-	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
+	r = dm_get_device(ti, origin_path, 0, ti->len, origin_mode, &s->origin);
 	if (r) {
 		ti->error = "Cannot get origin device";
 		goto bad_origin;
@@ -801,7 +807,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	INIT_WORK(&s->queued_bios_work, flush_queued_bios);
 
 	ti->private = s;
-	ti->num_flush_requests = 1;
+	ti->num_flush_requests = num_flush_requests;
 
 	/* Add snapshot to the list of snapshots for this origin */
 	/* Exceptions aren't triggered till snapshot_resume() is called */
@@ -1326,6 +1332,15 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 	int r = DM_MAPIO_REMAPPED;
 	chunk_t chunk;
 
+	if (unlikely(bio_empty_barrier(bio))) {
+		if (!map_context->flush_request)
+			bio->bi_bdev = s->origin->bdev;
+		else
+			bio->bi_bdev = s->cow->bdev;
+		map_context->ptr = NULL;
+		return DM_MAPIO_REMAPPED;
+	}
+
 	chunk = sector_to_chunk(s->store, bio->bi_sector);
 
 	down_read(&s->lock);
-- 
cgit v1.2.3


From 9d3b15c4c776b041f9ee81810cbd375275411829 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:32 +0000
Subject: dm snapshot: permit only one merge at once

Merging more than one snapshot is not supported, so prevent
this happening.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 23e3396e43b..7ddee7c0c51 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -300,8 +300,10 @@ static void __insert_origin(struct origin *o)
  * Returns number of snapshots registered using the supplied cow device, plus:
  * snap_src - a snapshot suitable for use as a source of exception handover
  * snap_dest - a snapshot capable of receiving exception handover.
+ * snap_merge - an existing snapshot-merge target linked to the same origin.
+ *   There can be at most one snapshot-merge target. The parameter is optional.
  *
- * Possible return values and states:
+ * Possible return values and states of snap_src and snap_dest.
  *   0: NULL, NULL  - first new snapshot
  *   1: snap_src, NULL - normal snapshot
  *   2: snap_src, snap_dest  - waiting for handover
@@ -310,7 +312,8 @@ static void __insert_origin(struct origin *o)
  */
 static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
 					struct dm_snapshot **snap_src,
-					struct dm_snapshot **snap_dest)
+					struct dm_snapshot **snap_dest,
+					struct dm_snapshot **snap_merge)
 {
 	struct dm_snapshot *s;
 	struct origin *o;
@@ -322,6 +325,8 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
 		goto out;
 
 	list_for_each_entry(s, &o->snapshots, list) {
+		if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
+			*snap_merge = s;
 		if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
 			continue;
 
@@ -349,9 +354,11 @@ out:
 static int __validate_exception_handover(struct dm_snapshot *snap)
 {
 	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+	struct dm_snapshot *snap_merge = NULL;
 
 	/* Does snapshot need exceptions handed over to it? */
-	if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest) == 2) ||
+	if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
+					  &snap_merge) == 2) ||
 	    snap_dest) {
 		snap->ti->error = "Snapshot cow pairing for exception "
 				  "table handover failed";
@@ -365,6 +372,20 @@ static int __validate_exception_handover(struct dm_snapshot *snap)
 	if (!snap_src)
 		return 0;
 
+	/*
+	 * Non-snapshot-merge handover?
+	 */
+	if (!dm_target_is_snapshot_merge(snap->ti))
+		return 1;
+
+	/*
+	 * Do not allow more than one merging snapshot.
+	 */
+	if (snap_merge) {
+		snap->ti->error = "A snapshot is already merging.";
+		return -EINVAL;
+	}
+
 	return 1;
 }
 
@@ -933,7 +954,7 @@ static void snapshot_dtr(struct dm_target *ti)
 
 	down_read(&_origins_lock);
 	/* Check whether exception handover must be cancelled */
-	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
+	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
 	if (snap_src && snap_dest && (s == snap_src)) {
 		down_write(&snap_dest->lock);
 		snap_dest->valid = 0;
@@ -1399,7 +1420,7 @@ static int snapshot_preresume(struct dm_target *ti)
 	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
 
 	down_read(&_origins_lock);
-	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
+	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
 	if (snap_src && snap_dest) {
 		down_read(&snap_src->lock);
 		if (s == snap_src) {
@@ -1424,7 +1445,7 @@ static void snapshot_resume(struct dm_target *ti)
 	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
 
 	down_read(&_origins_lock);
-	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
+	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
 	if (snap_src && snap_dest) {
 		down_write(&snap_src->lock);
 		down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
-- 
cgit v1.2.3


From 1e03f97e4301f75a2f3b649787f7876516764929 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:32 +0000
Subject: dm snapshot: add merging

Merging is started when origin is resumed and it is stopped when
origin is suspended or when the merging snapshot is destroyed or
errors are detected.

Merging is not yet interlocked with writes: this will be handled in
subsequent patches.

The code relies on callbacks from a private kcopyd thread.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.h |  11 ++
 drivers/md/dm-snap.c            | 239 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 244 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index c53e08935b4..e8dfa06af3b 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -154,6 +154,13 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
 	BUG_ON(!dm_consecutive_chunk_count(e));
 }
 
+static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
+{
+	BUG_ON(!dm_consecutive_chunk_count(e));
+
+	e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
+}
+
 #  else
 #    define DM_CHUNK_CONSECUTIVE_BITS 0
 
@@ -171,6 +178,10 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
 {
 }
 
+static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
+{
+}
+
 #  endif
 
 /*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 7ddee7c0c51..dc2412e6c5c 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -106,8 +106,20 @@ struct dm_snapshot {
 	mempool_t *tracked_chunk_pool;
 	spinlock_t tracked_chunk_lock;
 	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
+
+	/* Wait for events based on state_bits */
+	unsigned long state_bits;
 };
 
+/*
+ * state_bits:
+ *   RUNNING_MERGE  - Merge operation is in progress.
+ *   SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
+ *                    cleared afterwards.
+ */
+#define RUNNING_MERGE          0
+#define SHUTDOWN_MERGE         1
+
 struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
 {
 	return s->cow;
@@ -386,6 +398,13 @@ static int __validate_exception_handover(struct dm_snapshot *snap)
 		return -EINVAL;
 	}
 
+	if (!snap_src->store->type->prepare_merge ||
+	    !snap_src->store->type->commit_merge) {
+		snap->ti->error = "Snapshot exception store does not "
+				  "support snapshot-merge.";
+		return -EINVAL;
+	}
+
 	return 1;
 }
 
@@ -721,6 +740,178 @@ static int init_hash_tables(struct dm_snapshot *s)
 	return 0;
 }
 
+static void merge_shutdown(struct dm_snapshot *s)
+{
+	clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&s->state_bits, RUNNING_MERGE);
+}
+
+/*
+ * Remove one chunk from the index of completed exceptions.
+ */
+static int __remove_single_exception_chunk(struct dm_snapshot *s,
+					   chunk_t old_chunk)
+{
+	struct dm_exception *e;
+
+	/* FIXME: interlock writes to this chunk */
+
+	e = dm_lookup_exception(&s->complete, old_chunk);
+	if (!e) {
+		DMERR("Corruption detected: exception for block %llu is "
+		      "on disk but not in memory",
+		      (unsigned long long)old_chunk);
+		return -EINVAL;
+	}
+
+	/*
+	 * If this is the only chunk using this exception, remove exception.
+	 */
+	if (!dm_consecutive_chunk_count(e)) {
+		dm_remove_exception(e);
+		free_completed_exception(e);
+		return 0;
+	}
+
+	/*
+	 * The chunk may be either at the beginning or the end of a
+	 * group of consecutive chunks - never in the middle.  We are
+	 * removing chunks in the opposite order to that in which they
+	 * were added, so this should always be true.
+	 * Decrement the consecutive chunk counter and adjust the
+	 * starting point if necessary.
+	 */
+	if (old_chunk == e->old_chunk) {
+		e->old_chunk++;
+		e->new_chunk++;
+	} else if (old_chunk != e->old_chunk +
+		   dm_consecutive_chunk_count(e)) {
+		DMERR("Attempt to merge block %llu from the "
+		      "middle of a chunk range [%llu - %llu]",
+		      (unsigned long long)old_chunk,
+		      (unsigned long long)e->old_chunk,
+		      (unsigned long long)
+		      e->old_chunk + dm_consecutive_chunk_count(e));
+		return -EINVAL;
+	}
+
+	dm_consecutive_chunk_count_dec(e);
+
+	return 0;
+}
+
+static int remove_single_exception_chunk(struct dm_snapshot *s,
+					 chunk_t old_chunk)
+{
+	int r = 0;
+
+	down_write(&s->lock);
+	r = __remove_single_exception_chunk(s, old_chunk);
+	up_write(&s->lock);
+
+	return r;
+}
+
+static void merge_callback(int read_err, unsigned long write_err,
+			   void *context);
+
+static void snapshot_merge_next_chunks(struct dm_snapshot *s)
+{
+	int r;
+	chunk_t old_chunk, new_chunk;
+	struct dm_io_region src, dest;
+
+	BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
+	if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
+		goto shut;
+
+	/*
+	 * valid flag never changes during merge, so no lock required.
+	 */
+	if (!s->valid) {
+		DMERR("Snapshot is invalid: can't merge");
+		goto shut;
+	}
+
+	r = s->store->type->prepare_merge(s->store, &old_chunk, &new_chunk);
+	if (r <= 0) {
+		if (r < 0)
+			DMERR("Read error in exception store: "
+			      "shutting down merge");
+		goto shut;
+	}
+
+	/* TODO: use larger I/O size once we verify that kcopyd handles it */
+
+	if (remove_single_exception_chunk(s, old_chunk) < 0)
+		goto shut;
+
+	dest.bdev = s->origin->bdev;
+	dest.sector = chunk_to_sector(s->store, old_chunk);
+	dest.count = min((sector_t)s->store->chunk_size,
+			 get_dev_size(dest.bdev) - dest.sector);
+
+	src.bdev = s->cow->bdev;
+	src.sector = chunk_to_sector(s->store, new_chunk);
+	src.count = dest.count;
+
+	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
+	return;
+
+shut:
+	merge_shutdown(s);
+}
+
+static void merge_callback(int read_err, unsigned long write_err, void *context)
+{
+	struct dm_snapshot *s = context;
+
+	if (read_err || write_err) {
+		if (read_err)
+			DMERR("Read error: shutting down merge.");
+		else
+			DMERR("Write error: shutting down merge.");
+		goto shut;
+	}
+
+	if (s->store->type->commit_merge(s->store, 1) < 0) {
+		DMERR("Write error in exception store: shutting down merge");
+		goto shut;
+	}
+
+	snapshot_merge_next_chunks(s);
+
+	return;
+
+shut:
+	merge_shutdown(s);
+}
+
+static void start_merge(struct dm_snapshot *s)
+{
+	if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
+		snapshot_merge_next_chunks(s);
+}
+
+static int wait_schedule(void *ptr)
+{
+	schedule();
+
+	return 0;
+}
+
+/*
+ * Stop the merging process and wait until it finishes.
+ */
+static void stop_merge(struct dm_snapshot *s)
+{
+	set_bit(SHUTDOWN_MERGE, &s->state_bits);
+	wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule,
+		    TASK_UNINTERRUPTIBLE);
+	clear_bit(SHUTDOWN_MERGE, &s->state_bits);
+}
+
 /*
  * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
  */
@@ -791,6 +982,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	init_rwsem(&s->lock);
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
+	s->state_bits = 0;
 
 	/* Allocate hash table for COW data */
 	if (init_hash_tables(s)) {
@@ -963,6 +1155,9 @@ static void snapshot_dtr(struct dm_target *ti)
 	}
 	up_read(&_origins_lock);
 
+	if (dm_target_is_snapshot_merge(ti))
+		stop_merge(s);
+
 	/* Prevent further origin writes from using this snapshot. */
 	/* After this returns there can be no new kcopyd jobs. */
 	unregister_snapshot(s);
@@ -1404,6 +1599,13 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
 	return 0;
 }
 
+static void snapshot_merge_presuspend(struct dm_target *ti)
+{
+	struct dm_snapshot *s = ti->private;
+
+	stop_merge(s);
+}
+
 static void snapshot_postsuspend(struct dm_target *ti)
 {
 	struct dm_snapshot *s = ti->private;
@@ -1464,6 +1666,34 @@ static void snapshot_resume(struct dm_target *ti)
 	up_write(&s->lock);
 }
 
+static sector_t get_origin_minimum_chunksize(struct block_device *bdev)
+{
+	sector_t min_chunksize;
+
+	down_read(&_origins_lock);
+	min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
+	up_read(&_origins_lock);
+
+	return min_chunksize;
+}
+
+static void snapshot_merge_resume(struct dm_target *ti)
+{
+	struct dm_snapshot *s = ti->private;
+
+	/*
+	 * Handover exceptions from existing snapshot.
+	 */
+	snapshot_resume(ti);
+
+	/*
+	 * snapshot-merge acts as an origin, so set ti->split_io
+	 */
+	ti->split_io = get_origin_minimum_chunksize(s->origin->bdev);
+
+	start_merge(s);
+}
+
 static int snapshot_status(struct dm_target *ti, status_type_t type,
 			   char *result, unsigned int maxlen)
 {
@@ -1722,11 +1952,7 @@ static void origin_resume(struct dm_target *ti)
 {
 	struct dm_dev *dev = ti->private;
 
-	down_read(&_origins_lock);
-
-	ti->split_io = __minimum_chunk_size(__lookup_origin(dev->bdev));
-
-	up_read(&_origins_lock);
+	ti->split_io = get_origin_minimum_chunksize(dev->bdev);
 }
 
 static int origin_status(struct dm_target *ti, status_type_t type, char *result,
@@ -1790,9 +2016,10 @@ static struct target_type merge_target = {
 	.dtr     = snapshot_dtr,
 	.map     = snapshot_merge_map,
 	.end_io  = snapshot_end_io,
+	.presuspend = snapshot_merge_presuspend,
 	.postsuspend = snapshot_postsuspend,
 	.preresume  = snapshot_preresume,
-	.resume  = snapshot_resume,
+	.resume  = snapshot_merge_resume,
 	.status  = snapshot_status,
 	.iterate_devices = snapshot_iterate_devices,
 };
-- 
cgit v1.2.3


From 9fe862548821b0c206c58e8057b782530a173703 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:33 +0000
Subject: dm snapshot: queue writes to chunks being merged

While a set of chunks is being merged, any overlapping writes need to be
queued.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 91 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 78 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index dc2412e6c5c..91a47c522b0 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -109,6 +109,16 @@ struct dm_snapshot {
 
 	/* Wait for events based on state_bits */
 	unsigned long state_bits;
+
+	/* Range of chunks currently being merged. */
+	chunk_t first_merging_chunk;
+	int num_merging_chunks;
+
+	/*
+	 * Incoming bios that overlap with chunks being merged must wait
+	 * for them to be committed.
+	 */
+	struct bio_list bios_queued_during_merge;
 };
 
 /*
@@ -747,6 +757,14 @@ static void merge_shutdown(struct dm_snapshot *s)
 	wake_up_bit(&s->state_bits, RUNNING_MERGE);
 }
 
+static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
+{
+	s->first_merging_chunk = 0;
+	s->num_merging_chunks = 0;
+
+	return bio_list_get(&s->bios_queued_during_merge);
+}
+
 /*
  * Remove one chunk from the index of completed exceptions.
  */
@@ -755,8 +773,6 @@ static int __remove_single_exception_chunk(struct dm_snapshot *s,
 {
 	struct dm_exception *e;
 
-	/* FIXME: interlock writes to this chunk */
-
 	e = dm_lookup_exception(&s->complete, old_chunk);
 	if (!e) {
 		DMERR("Corruption detected: exception for block %llu is "
@@ -801,14 +817,32 @@ static int __remove_single_exception_chunk(struct dm_snapshot *s,
 	return 0;
 }
 
-static int remove_single_exception_chunk(struct dm_snapshot *s,
-					 chunk_t old_chunk)
+static void flush_bios(struct bio *bio);
+
+static int remove_single_exception_chunk(struct dm_snapshot *s)
 {
-	int r = 0;
+	struct bio *b = NULL;
+	int r;
+	chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
 
 	down_write(&s->lock);
-	r = __remove_single_exception_chunk(s, old_chunk);
+
+	/*
+	 * Process chunks (and associated exceptions) in reverse order
+	 * so that dm_consecutive_chunk_count_dec() accounting works.
+	 */
+	do {
+		r = __remove_single_exception_chunk(s, old_chunk);
+		if (r)
+			goto out;
+	} while (old_chunk-- > s->first_merging_chunk);
+
+	b = __release_queued_bios_after_merge(s);
+
+out:
 	up_write(&s->lock);
+	if (b)
+		flush_bios(b);
 
 	return r;
 }
@@ -844,9 +878,6 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 
 	/* TODO: use larger I/O size once we verify that kcopyd handles it */
 
-	if (remove_single_exception_chunk(s, old_chunk) < 0)
-		goto shut;
-
 	dest.bdev = s->origin->bdev;
 	dest.sector = chunk_to_sector(s->store, old_chunk);
 	dest.count = min((sector_t)s->store->chunk_size,
@@ -856,6 +887,13 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 	src.sector = chunk_to_sector(s->store, new_chunk);
 	src.count = dest.count;
 
+	down_write(&s->lock);
+	s->first_merging_chunk = old_chunk;
+	s->num_merging_chunks = 1;
+	up_write(&s->lock);
+
+	/* !!! FIXME: wait until writes to this chunk drain */
+
 	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
 	return;
 
@@ -863,9 +901,12 @@ shut:
 	merge_shutdown(s);
 }
 
+static void error_bios(struct bio *bio);
+
 static void merge_callback(int read_err, unsigned long write_err, void *context)
 {
 	struct dm_snapshot *s = context;
+	struct bio *b = NULL;
 
 	if (read_err || write_err) {
 		if (read_err)
@@ -875,16 +916,25 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
 		goto shut;
 	}
 
-	if (s->store->type->commit_merge(s->store, 1) < 0) {
+	if (s->store->type->commit_merge(s->store,
+					 s->num_merging_chunks) < 0) {
 		DMERR("Write error in exception store: shutting down merge");
 		goto shut;
 	}
 
+	if (remove_single_exception_chunk(s) < 0)
+		goto shut;
+
 	snapshot_merge_next_chunks(s);
 
 	return;
 
 shut:
+	down_write(&s->lock);
+	b = __release_queued_bios_after_merge(s);
+	up_write(&s->lock);
+	error_bios(b);
+
 	merge_shutdown(s);
 }
 
@@ -983,6 +1033,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
 	s->state_bits = 0;
+	s->first_merging_chunk = 0;
+	s->num_merging_chunks = 0;
+	bio_list_init(&s->bios_queued_during_merge);
 
 	/* Allocate hash table for COW data */
 	if (init_hash_tables(s)) {
@@ -1539,6 +1592,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
  * For each chunk, if there is an existing exception, it is used to
  * redirect I/O to the cow device.  Otherwise I/O is sent to the origin,
  * which in turn might generate exceptions in other snapshots.
+ * If merging is currently taking place on the chunk in question, the
+ * I/O is deferred by adding it to s->bios_queued_during_merge.
  */
 static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 			      union map_info *map_context)
@@ -1559,7 +1614,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 
 	chunk = sector_to_chunk(s->store, bio->bi_sector);
 
-	down_read(&s->lock);
+	down_write(&s->lock);
 
 	/* Full snapshots are not usable */
 	if (!s->valid) {
@@ -1570,6 +1625,16 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 	/* If the block is already remapped - use that */
 	e = dm_lookup_exception(&s->complete, chunk);
 	if (e) {
+		/* Queue writes overlapping with chunks being merged */
+		if (bio_rw(bio) == WRITE &&
+		    chunk >= s->first_merging_chunk &&
+		    chunk < (s->first_merging_chunk +
+			     s->num_merging_chunks)) {
+			bio->bi_bdev = s->origin->bdev;
+			bio_list_add(&s->bios_queued_during_merge, bio);
+			r = DM_MAPIO_SUBMITTED;
+			goto out_unlock;
+		}
 		remap_exception(s, e, bio, chunk);
 		goto out_unlock;
 	}
@@ -1577,12 +1642,12 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 	bio->bi_bdev = s->origin->bdev;
 
 	if (bio_rw(bio) == WRITE) {
-		up_read(&s->lock);
+		up_write(&s->lock);
 		return do_origin(s->origin, bio);
 	}
 
 out_unlock:
-	up_read(&s->lock);
+	up_write(&s->lock);
 
 	return r;
 }
-- 
cgit v1.2.3


From 17aa03326d40614db94bc51fbbc92df628a5c97c Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:33 +0000
Subject: dm snapshot: delay merging a chunk until writes to it complete

Track writes to chunks that are currently being merged and delay merging
a chunk until all writes to that chunk finish.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 91a47c522b0..bc52776c69c 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -892,7 +892,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 	s->num_merging_chunks = 1;
 	up_write(&s->lock);
 
-	/* !!! FIXME: wait until writes to this chunk drain */
+	__check_for_conflicting_io(s, old_chunk);
 
 	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
 	return;
@@ -1635,7 +1635,11 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 			r = DM_MAPIO_SUBMITTED;
 			goto out_unlock;
 		}
+
 		remap_exception(s, e, bio, chunk);
+
+		if (bio_rw(bio) == WRITE)
+			map_context->ptr = track_chunk(s, chunk);
 		goto out_unlock;
 	}
 
-- 
cgit v1.2.3


From 73dfd078cf8bfee4018fb22f1e2a24f2e05b69dc Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:34 +0000
Subject: dm snapshot: trigger exceptions in remaining snapshots during merge

When there is one merging snapshot and other non-merging snapshots,
snapshot_merge_process() must make exceptions in the non-merging
snapshots.

Use a sequence count to resolve the race between I/O to chunks that are
about to be merged.  The count increases each time an exception
reallocation finishes.  Use wait_event() to wait until the count
changes.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index bc52776c69c..1498704467a 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -270,6 +270,10 @@ struct origin {
 static struct list_head *_origins;
 static struct rw_semaphore _origins_lock;
 
+static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
+static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
+static uint64_t _pending_exceptions_done_count;
+
 static int init_origin_hash(void)
 {
 	int i;
@@ -847,14 +851,38 @@ out:
 	return r;
 }
 
+static int origin_write_extent(struct dm_snapshot *merging_snap,
+			       sector_t sector, unsigned chunk_size);
+
 static void merge_callback(int read_err, unsigned long write_err,
 			   void *context);
 
+static uint64_t read_pending_exceptions_done_count(void)
+{
+	uint64_t pending_exceptions_done;
+
+	spin_lock(&_pending_exceptions_done_spinlock);
+	pending_exceptions_done = _pending_exceptions_done_count;
+	spin_unlock(&_pending_exceptions_done_spinlock);
+
+	return pending_exceptions_done;
+}
+
+static void increment_pending_exceptions_done_count(void)
+{
+	spin_lock(&_pending_exceptions_done_spinlock);
+	_pending_exceptions_done_count++;
+	spin_unlock(&_pending_exceptions_done_spinlock);
+
+	wake_up_all(&_pending_exceptions_done);
+}
+
 static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 {
 	int r;
 	chunk_t old_chunk, new_chunk;
 	struct dm_io_region src, dest;
+	uint64_t previous_count;
 
 	BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
 	if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
@@ -887,6 +915,24 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 	src.sector = chunk_to_sector(s->store, new_chunk);
 	src.count = dest.count;
 
+	/*
+	 * Reallocate any exceptions needed in other snapshots then
+	 * wait for the pending exceptions to complete.
+	 * Each time any pending exception (globally on the system)
+	 * completes we are woken and repeat the process to find out
+	 * if we can proceed.  While this may not seem a particularly
+	 * efficient algorithm, it is not expected to have any
+	 * significant impact on performance.
+	 */
+	previous_count = read_pending_exceptions_done_count();
+	while (origin_write_extent(s, dest.sector, s->store->chunk_size)) {
+		wait_event(_pending_exceptions_done,
+			   (read_pending_exceptions_done_count() !=
+			    previous_count));
+		/* Retry after the wait, until all exceptions are done. */
+		previous_count = read_pending_exceptions_done_count();
+	}
+
 	down_write(&s->lock);
 	s->first_merging_chunk = old_chunk;
 	s->num_merging_chunks = 1;
@@ -1372,6 +1418,8 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 	origin_bios = bio_list_get(&pe->origin_bios);
 	free_pending_exception(pe);
 
+	increment_pending_exceptions_done_count();
+
 	up_write(&s->lock);
 
 	/* Submit any pending write bios */
@@ -1962,6 +2010,41 @@ static int do_origin(struct dm_dev *origin, struct bio *bio)
 	return r;
 }
 
+/*
+ * Trigger exceptions in all non-merging snapshots.
+ *
+ * The chunk size of the merging snapshot may be larger than the chunk
+ * size of some other snapshot so we may need to reallocate multiple
+ * chunks in other snapshots.
+ *
+ * We scan all the overlapping exceptions in the other snapshots.
+ * Returns 1 if anything was reallocated and must be waited for,
+ * otherwise returns 0.
+ *
+ * size must be a multiple of merging_snap's chunk_size.
+ */
+static int origin_write_extent(struct dm_snapshot *merging_snap,
+			       sector_t sector, unsigned size)
+{
+	int must_wait = 0;
+	sector_t n;
+	struct origin *o;
+
+	/*
+	 * The origin's __minimum_chunk_size() got stored in split_io
+	 * by snapshot_merge_resume().
+	 */
+	down_read(&_origins_lock);
+	o = __lookup_origin(merging_snap->origin->bdev);
+	for (n = 0; n < size; n += merging_snap->ti->split_io)
+		if (__origin_write(&o->snapshots, sector + n, NULL) ==
+		    DM_MAPIO_SUBMITTED)
+			must_wait = 1;
+	up_read(&_origins_lock);
+
+	return must_wait;
+}
+
 /*
  * Origin: maps a linear range of a device, with hooks for snapshotting.
  */
-- 
cgit v1.2.3


From 8a2d528620e228ddfd0df9cec0a16e034ff8db1d Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 10 Dec 2009 23:52:34 +0000
Subject: dm snapshot: merge consecutive chunks together

s->store->type->prepare_merge returns the number of chunks that can be
copied linearly working backwards from the returned chunk number.

For example, if it returns 3 chunks with old_chunk == 10 and new_chunk
== 20, then chunk 20 can be copied to 10, chunk 19 to 9 and 18 to 8.

Until now kcopyd only copied one chunk at a time.  This patch now copies
the full set at once.

Consequently, snapshot_merge_process() needs to delay the merging of all
chunks if any have writes in progress, not just the first chunk in the
region that is to be merged.

snapshot-merge's performance is now comparable to the original
snapshot-origin target.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 1498704467a..bb4b733697b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -879,9 +879,10 @@ static void increment_pending_exceptions_done_count(void)
 
 static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 {
-	int r;
+	int i, linear_chunks;
 	chunk_t old_chunk, new_chunk;
 	struct dm_io_region src, dest;
+	sector_t io_size;
 	uint64_t previous_count;
 
 	BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
@@ -896,20 +897,28 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 		goto shut;
 	}
 
-	r = s->store->type->prepare_merge(s->store, &old_chunk, &new_chunk);
-	if (r <= 0) {
-		if (r < 0)
+	linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
+						      &new_chunk);
+	if (linear_chunks <= 0) {
+		if (linear_chunks < 0)
 			DMERR("Read error in exception store: "
 			      "shutting down merge");
 		goto shut;
 	}
 
-	/* TODO: use larger I/O size once we verify that kcopyd handles it */
+	/* Adjust old_chunk and new_chunk to reflect start of linear region */
+	old_chunk = old_chunk + 1 - linear_chunks;
+	new_chunk = new_chunk + 1 - linear_chunks;
+
+	/*
+	 * Use one (potentially large) I/O to copy all 'linear_chunks'
+	 * from the exception store to the origin
+	 */
+	io_size = linear_chunks * s->store->chunk_size;
 
 	dest.bdev = s->origin->bdev;
 	dest.sector = chunk_to_sector(s->store, old_chunk);
-	dest.count = min((sector_t)s->store->chunk_size,
-			 get_dev_size(dest.bdev) - dest.sector);
+	dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
 
 	src.bdev = s->cow->bdev;
 	src.sector = chunk_to_sector(s->store, new_chunk);
@@ -925,7 +934,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 	 * significant impact on performance.
 	 */
 	previous_count = read_pending_exceptions_done_count();
-	while (origin_write_extent(s, dest.sector, s->store->chunk_size)) {
+	while (origin_write_extent(s, dest.sector, io_size)) {
 		wait_event(_pending_exceptions_done,
 			   (read_pending_exceptions_done_count() !=
 			    previous_count));
@@ -935,10 +944,12 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 
 	down_write(&s->lock);
 	s->first_merging_chunk = old_chunk;
-	s->num_merging_chunks = 1;
+	s->num_merging_chunks = linear_chunks;
 	up_write(&s->lock);
 
-	__check_for_conflicting_io(s, old_chunk);
+	/* Wait until writes to all 'linear_chunks' drain */
+	for (i = 0; i < linear_chunks; i++)
+		__check_for_conflicting_io(s, old_chunk + i);
 
 	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
 	return;
-- 
cgit v1.2.3


From d8ddb1cfff0070479c1f4a07c1d4a48ef8cb188e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 10 Dec 2009 23:52:35 +0000
Subject: dm snapshot: report merge failure in status

Set 'merge_failed' flag if a snapshot fails to merge.  Update
snapshot_status() to report "Merge failed" if 'merge_failed' is set.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index bb4b733697b..4c80e82f941 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -71,7 +71,10 @@ struct dm_snapshot {
 	/* List of snapshots per Origin */
 	struct list_head list;
 
-	/* You can't use a snapshot if this is 0 (e.g. if full) */
+	/*
+	 * You can't use a snapshot if this is 0 (e.g. if full).
+	 * A snapshot-merge target never clears this.
+	 */
 	int valid;
 
 	/* Origin writes don't trigger exceptions until this is set */
@@ -107,6 +110,21 @@ struct dm_snapshot {
 	spinlock_t tracked_chunk_lock;
 	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
 
+	/*
+	 * The merge operation failed if this flag is set.
+	 * Failure modes are handled as follows:
+	 * - I/O error reading the header
+	 *   	=> don't load the target; abort.
+	 * - Header does not have "valid" flag set
+	 *   	=> use the origin; forget about the snapshot.
+	 * - I/O error when reading exceptions
+	 *   	=> don't load the target; abort.
+	 *         (We can't use the intermediate origin state.)
+	 * - I/O error while merging
+	 *	=> stop merging; set merge_failed; process I/O normally.
+	 */
+	int merge_failed;
+
 	/* Wait for events based on state_bits */
 	unsigned long state_bits;
 
@@ -900,9 +918,13 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 	linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
 						      &new_chunk);
 	if (linear_chunks <= 0) {
-		if (linear_chunks < 0)
+		if (linear_chunks < 0) {
 			DMERR("Read error in exception store: "
 			      "shutting down merge");
+			down_write(&s->lock);
+			s->merge_failed = 1;
+			up_write(&s->lock);
+		}
 		goto shut;
 	}
 
@@ -988,6 +1010,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
 
 shut:
 	down_write(&s->lock);
+	s->merge_failed = 1;
 	b = __release_queued_bios_after_merge(s);
 	up_write(&s->lock);
 	error_bios(b);
@@ -1090,6 +1113,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
 	s->state_bits = 0;
+	s->merge_failed = 0;
 	s->first_merging_chunk = 0;
 	s->num_merging_chunks = 0;
 	bio_list_init(&s->bios_queued_during_merge);
@@ -1835,6 +1859,8 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 
 		if (!snap->valid)
 			DMEMIT("Invalid");
+		else if (snap->merge_failed)
+			DMEMIT("Merge failed");
 		else {
 			if (snap->store->type->usage) {
 				sector_t total_sectors, sectors_allocated,
-- 
cgit v1.2.3


From d2fdb776e08d4231d7e86a879cc663a93913c202 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 10 Dec 2009 23:52:36 +0000
Subject: dm snapshot: use merge origin if snapshot invalid

If the snapshot we are merging became invalid (e.g. it ran out of
space) redirect all I/O directly to the origin device.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 4c80e82f941..ee8eb283650 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1699,11 +1699,9 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 
 	down_write(&s->lock);
 
-	/* Full snapshots are not usable */
-	if (!s->valid) {
-		r = -EIO;
-		goto out_unlock;
-	}
+	/* Full merging snapshots are redirected to the origin */
+	if (!s->valid)
+		goto redirect_to_origin;
 
 	/* If the block is already remapped - use that */
 	e = dm_lookup_exception(&s->complete, chunk);
@@ -1726,6 +1724,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 		goto out_unlock;
 	}
 
+redirect_to_origin:
 	bio->bi_bdev = s->origin->bdev;
 
 	if (bio_rw(bio) == WRITE) {
-- 
cgit v1.2.3