Diffstat (limited to 'drivers')
-rw-r--r--   drivers/md/Kconfig               |  24
-rw-r--r--   drivers/md/dm-crypt.c            | 486
-rw-r--r--   drivers/md/dm-exception-store.c  |   2
-rw-r--r--   drivers/md/dm-ioctl.c            |  32
-rw-r--r--   drivers/md/dm-log.c              |  51
-rw-r--r--   drivers/md/dm-mpath.c            |   2
-rw-r--r--   drivers/md/dm-raid1.c            | 664
-rw-r--r--   drivers/md/dm-snap.c             |  95
-rw-r--r--   drivers/md/dm-snap.h             |  50
-rw-r--r--   drivers/md/dm-stripe.c           | 105
-rw-r--r--   drivers/md/dm-table.c            |  20
-rw-r--r--   drivers/md/dm.c                  | 238
12 files changed, 1361 insertions, 408 deletions
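
Editorial note before the diff itself: the largest change below is dm-crypt's move from the synchronous blkcipher API to the asynchronous ablkcipher API. Each crypto request, its struct dm_crypt_request and the IV now come from a single mempool element, using the layout described in the new crypt_config comment ("ablkcipher_request | context | padding | dm_crypt_request | padding | IV"). The following userspace-only sketch reproduces that offset arithmetic; it is not part of the patch, and the *_stub structures and the reqsize/alignment numbers are illustrative stand-ins for what the kernel obtains from crypto_ablkcipher_reqsize(), crypto_tfm_ctx_alignment(), crypto_ablkcipher_alignmask() and crypto_ablkcipher_ivsize().

/*
 * Illustrative, userspace-only sketch (not part of the patch) of the
 * per-request layout set up by the new crypt_ctr()/crypt_convert_block()
 * code in the diff below.
 */
#include <stdio.h>
#include <stddef.h>

/* Kernel-style ALIGN() for power-of-two alignments. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

/* Stand-ins for the real kernel structures; sizes are not meaningful. */
struct ablkcipher_request_stub { void *base[6]; };
struct dm_crypt_request_stub { void *sg_in, *sg_out; };

int main(void)
{
	/* Assumed values; the kernel reads these from the crypto API
	 * (reqsize, ctx alignment, alignmask, IV size of the cipher). */
	size_t reqsize   = 96;
	size_t ctx_align = 8;
	size_t alignmask = 7;
	size_t iv_size   = 16;

	/* Same arithmetic as crypt_ctr(): offset of dm_crypt_request. */
	size_t dmreq_start = sizeof(struct ablkcipher_request_stub) + reqsize;
	dmreq_start = ALIGN(dmreq_start, ctx_align);
	dmreq_start += alignmask & ~(ctx_align - 1);

	/* Size of one pool element, as passed to
	 * mempool_create_kmalloc_pool() in the patch. */
	size_t elem_size = dmreq_start + sizeof(struct dm_crypt_request_stub)
			   + iv_size;

	/* Where crypt_convert_block() places the IV, relative to the start
	 * of the element: aligned just past dm_crypt_request. */
	size_t iv_off = ALIGN(dmreq_start + sizeof(struct dm_crypt_request_stub),
			      alignmask + 1);

	printf("dm_crypt_request at offset %zu\n", dmreq_start);
	printf("IV at offset %zu, element size %zu bytes\n", iv_off, elem_size);
	return 0;
}

As the patch's own comment states, the padding exists so that dm_crypt_request and the IV are correctly aligned while the whole request still comes from one mempool allocation, so the per-sector path needs only a single mempool_alloc().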
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3fa7c77d9bd..610af916891 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -204,7 +204,7 @@ config BLK_DEV_DM config DM_DEBUG boolean "Device mapper debugging support" - depends on BLK_DEV_DM && EXPERIMENTAL + depends on BLK_DEV_DM ---help--- Enable this for messages that may help debug device-mapper problems. @@ -212,7 +212,7 @@ config DM_DEBUG config DM_CRYPT tristate "Crypt target support" - depends on BLK_DEV_DM && EXPERIMENTAL + depends on BLK_DEV_DM select CRYPTO select CRYPTO_CBC ---help--- @@ -230,34 +230,34 @@ config DM_CRYPT If unsure, say N. config DM_SNAPSHOT - tristate "Snapshot target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Snapshot target" + depends on BLK_DEV_DM ---help--- Allow volume managers to take writable snapshots of a device. config DM_MIRROR - tristate "Mirror target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Mirror target" + depends on BLK_DEV_DM ---help--- Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. config DM_ZERO - tristate "Zero target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Zero target" + depends on BLK_DEV_DM ---help--- A target that discards writes, and returns all zeroes for reads. Useful in some recovery situations. config DM_MULTIPATH - tristate "Multipath target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Multipath target" + depends on BLK_DEV_DM ---help--- Allow volume managers to support multipath hardware. config DM_MULTIPATH_EMC - tristate "EMC CX/AX multipath support (EXPERIMENTAL)" - depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL + tristate "EMC CX/AX multipath support" + depends on DM_MULTIPATH && BLK_DEV_DM ---help--- Multipath support for EMC CX/AX series hardware. diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6b66ee46b87..b04f98df94e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,11 +1,12 @@ /* * Copyright (C) 2003 Christophe Saout <christophe@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> - * Copyright (C) 2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. 
*/ +#include <linux/completion.h> #include <linux/err.h> #include <linux/module.h> #include <linux/init.h> @@ -28,20 +29,10 @@ #define MESG_STR(x) x, sizeof(x) /* - * per bio private data - */ -struct dm_crypt_io { - struct dm_target *target; - struct bio *base_bio; - struct work_struct work; - atomic_t pending; - int error; -}; - -/* * context holding the current state of a multi-part conversion */ struct convert_context { + struct completion restart; struct bio *bio_in; struct bio *bio_out; unsigned int offset_in; @@ -49,7 +40,27 @@ struct convert_context { unsigned int idx_in; unsigned int idx_out; sector_t sector; - int write; + atomic_t pending; +}; + +/* + * per bio private data + */ +struct dm_crypt_io { + struct dm_target *target; + struct bio *base_bio; + struct work_struct work; + + struct convert_context ctx; + + atomic_t pending; + int error; + sector_t sector; +}; + +struct dm_crypt_request { + struct scatterlist sg_in; + struct scatterlist sg_out; }; struct crypt_config; @@ -72,10 +83,11 @@ struct crypt_config { sector_t start; /* - * pool for per bio private data and - * for encryption buffer pages + * pool for per bio private data, crypto requests and + * encryption requeusts/buffer pages */ mempool_t *io_pool; + mempool_t *req_pool; mempool_t *page_pool; struct bio_set *bs; @@ -93,9 +105,25 @@ struct crypt_config { sector_t iv_offset; unsigned int iv_size; + /* + * Layout of each crypto request: + * + * struct ablkcipher_request + * context + * padding + * struct dm_crypt_request + * padding + * IV + * + * The padding is added so that dm_crypt_request and the IV are + * correctly aligned. + */ + unsigned int dmreq_start; + struct ablkcipher_request *req; + char cipher[CRYPTO_MAX_ALG_NAME]; char chainmode[CRYPTO_MAX_ALG_NAME]; - struct crypto_blkcipher *tfm; + struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; u8 key[0]; @@ -108,6 +136,7 @@ struct crypt_config { static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); +static void kcryptd_queue_crypt(struct dm_crypt_io *io); /* * Different IV generation algorithms: @@ -188,7 +217,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, return PTR_ERR(essiv_tfm); } if (crypto_cipher_blocksize(essiv_tfm) != - crypto_blkcipher_ivsize(cc->tfm)) { + crypto_ablkcipher_ivsize(cc->tfm)) { ti->error = "Block size of ESSIV cipher does " "not match IV size of block cipher"; crypto_free_cipher(essiv_tfm); @@ -225,7 +254,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned int bs = crypto_blkcipher_blocksize(cc->tfm); + unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); int log = ilog2(bs); /* we need to calculate how far we must shift the sector count @@ -289,42 +318,10 @@ static struct crypt_iv_operations crypt_iv_null_ops = { .generator = crypt_iv_null_gen }; -static int -crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out, - struct scatterlist *in, unsigned int length, - int write, sector_t sector) -{ - u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64)))); - struct blkcipher_desc desc = { - .tfm = cc->tfm, - .info = iv, - .flags = CRYPTO_TFM_REQ_MAY_SLEEP, - }; - int r; - - if (cc->iv_gen_ops) { - r = cc->iv_gen_ops->generator(cc, iv, sector); - if (r < 0) - return r; - - if (write) - r = crypto_blkcipher_encrypt_iv(&desc, out, in, length); - else - r = 
crypto_blkcipher_decrypt_iv(&desc, out, in, length); - } else { - if (write) - r = crypto_blkcipher_encrypt(&desc, out, in, length); - else - r = crypto_blkcipher_decrypt(&desc, out, in, length); - } - - return r; -} - static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, - sector_t sector, int write) + sector_t sector) { ctx->bio_in = bio_in; ctx->bio_out = bio_out; @@ -333,7 +330,79 @@ static void crypt_convert_init(struct crypt_config *cc, ctx->idx_in = bio_in ? bio_in->bi_idx : 0; ctx->idx_out = bio_out ? bio_out->bi_idx : 0; ctx->sector = sector + cc->iv_offset; - ctx->write = write; + init_completion(&ctx->restart); + /* + * Crypto operation can be asynchronous, + * ctx->pending is increased after request submission. + * We need to ensure that we don't call the crypt finish + * operation before pending got incremented + * (dependent on crypt submission return code). + */ + atomic_set(&ctx->pending, 2); +} + +static int crypt_convert_block(struct crypt_config *cc, + struct convert_context *ctx, + struct ablkcipher_request *req) +{ + struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); + struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); + struct dm_crypt_request *dmreq; + u8 *iv; + int r = 0; + + dmreq = (struct dm_crypt_request *)((char *)req + cc->dmreq_start); + iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_ablkcipher_alignmask(cc->tfm) + 1); + + sg_init_table(&dmreq->sg_in, 1); + sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, + bv_in->bv_offset + ctx->offset_in); + + sg_init_table(&dmreq->sg_out, 1); + sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, + bv_out->bv_offset + ctx->offset_out); + + ctx->offset_in += 1 << SECTOR_SHIFT; + if (ctx->offset_in >= bv_in->bv_len) { + ctx->offset_in = 0; + ctx->idx_in++; + } + + ctx->offset_out += 1 << SECTOR_SHIFT; + if (ctx->offset_out >= bv_out->bv_len) { + ctx->offset_out = 0; + ctx->idx_out++; + } + + if (cc->iv_gen_ops) { + r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); + if (r < 0) + return r; + } + + ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, + 1 << SECTOR_SHIFT, iv); + + if (bio_data_dir(ctx->bio_in) == WRITE) + r = crypto_ablkcipher_encrypt(req); + else + r = crypto_ablkcipher_decrypt(req); + + return r; +} + +static void kcryptd_async_done(struct crypto_async_request *async_req, + int error); +static void crypt_alloc_req(struct crypt_config *cc, + struct convert_context *ctx) +{ + if (!cc->req) + cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + ablkcipher_request_set_tfm(cc->req, cc->tfm); + ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | + CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, ctx); } /* @@ -346,36 +415,38 @@ static int crypt_convert(struct crypt_config *cc, while(ctx->idx_in < ctx->bio_in->bi_vcnt && ctx->idx_out < ctx->bio_out->bi_vcnt) { - struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); - struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); - struct scatterlist sg_in, sg_out; - - sg_init_table(&sg_in, 1); - sg_set_page(&sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, bv_in->bv_offset + ctx->offset_in); - - sg_init_table(&sg_out, 1); - sg_set_page(&sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, bv_out->bv_offset + ctx->offset_out); - ctx->offset_in += sg_in.length; - if (ctx->offset_in >= bv_in->bv_len) { - ctx->offset_in = 0; - ctx->idx_in++; + crypt_alloc_req(cc, ctx); + + r = crypt_convert_block(cc, ctx, 
cc->req); + + switch (r) { + case -EBUSY: + wait_for_completion(&ctx->restart); + INIT_COMPLETION(ctx->restart); + /* fall through*/ + case -EINPROGRESS: + atomic_inc(&ctx->pending); + cc->req = NULL; + r = 0; + /* fall through*/ + case 0: + ctx->sector++; + continue; } - ctx->offset_out += sg_out.length; - if (ctx->offset_out >= bv_out->bv_len) { - ctx->offset_out = 0; - ctx->idx_out++; - } - - r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length, - ctx->write, ctx->sector); - if (r < 0) - break; - - ctx->sector++; + break; } + /* + * If there are pending crypto operation run async + * code. Otherwise process return code synchronously. + * The step of 2 ensures that async finish doesn't + * call crypto finish too early. + */ + if (atomic_sub_return(2, &ctx->pending)) + return -EINPROGRESS; + return r; } @@ -455,18 +526,14 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) * One of the bios was finished. Check for completion of * the whole request and correctly clean up the buffer. */ -static void crypt_dec_pending(struct dm_crypt_io *io, int error) +static void crypt_dec_pending(struct dm_crypt_io *io) { - struct crypt_config *cc = (struct crypt_config *) io->target->private; - - if (error < 0) - io->error = error; + struct crypt_config *cc = io->target->private; if (!atomic_dec_and_test(&io->pending)) return; bio_endio(io->base_bio, io->error); - mempool_free(io, cc->io_pool); } @@ -484,30 +551,11 @@ static void crypt_dec_pending(struct dm_crypt_io *io, int error) * starved by new requests which can block in the first stages due * to memory allocation. */ -static void kcryptd_do_work(struct work_struct *work); -static void kcryptd_do_crypt(struct work_struct *work); - -static void kcryptd_queue_io(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; - - INIT_WORK(&io->work, kcryptd_do_work); - queue_work(cc->io_queue, &io->work); -} - -static void kcryptd_queue_crypt(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; - - INIT_WORK(&io->work, kcryptd_do_crypt); - queue_work(cc->crypt_queue, &io->work); -} - static void crypt_endio(struct bio *clone, int error) { struct dm_crypt_io *io = clone->bi_private; struct crypt_config *cc = io->target->private; - unsigned read_io = bio_data_dir(clone) == READ; + unsigned rw = bio_data_dir(clone); if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) error = -EIO; @@ -515,21 +563,20 @@ static void crypt_endio(struct bio *clone, int error) /* * free the processed pages */ - if (!read_io) { + if (rw == WRITE) crypt_free_buffer_pages(cc, clone); - goto out; + + bio_put(clone); + + if (rw == READ && !error) { + kcryptd_queue_crypt(io); + return; } if (unlikely(error)) - goto out; - - bio_put(clone); - kcryptd_queue_crypt(io); - return; + io->error = error; -out: - bio_put(clone); - crypt_dec_pending(io, error); + crypt_dec_pending(io); } static void clone_init(struct dm_crypt_io *io, struct bio *clone) @@ -543,12 +590,11 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_destructor = dm_crypt_bio_destructor; } -static void process_read(struct dm_crypt_io *io) +static void kcryptd_io_read(struct dm_crypt_io *io) { struct crypt_config *cc = io->target->private; struct bio *base_bio = io->base_bio; struct bio *clone; - sector_t sector = base_bio->bi_sector - io->target->begin; atomic_inc(&io->pending); @@ -559,7 +605,8 @@ static void process_read(struct dm_crypt_io *io) */ clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), 
cc->bs); if (unlikely(!clone)) { - crypt_dec_pending(io, -ENOMEM); + io->error = -ENOMEM; + crypt_dec_pending(io); return; } @@ -567,25 +614,71 @@ static void process_read(struct dm_crypt_io *io) clone->bi_idx = 0; clone->bi_vcnt = bio_segments(base_bio); clone->bi_size = base_bio->bi_size; - clone->bi_sector = cc->start + sector; + clone->bi_sector = cc->start + io->sector; memcpy(clone->bi_io_vec, bio_iovec(base_bio), sizeof(struct bio_vec) * clone->bi_vcnt); generic_make_request(clone); } -static void process_write(struct dm_crypt_io *io) +static void kcryptd_io_write(struct dm_crypt_io *io) +{ + struct bio *clone = io->ctx.bio_out; + + generic_make_request(clone); +} + +static void kcryptd_io(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + + if (bio_data_dir(io->base_bio) == READ) + kcryptd_io_read(io); + else + kcryptd_io_write(io); +} + +static void kcryptd_queue_io(struct dm_crypt_io *io) { struct crypt_config *cc = io->target->private; - struct bio *base_bio = io->base_bio; - struct bio *clone; - struct convert_context ctx; - unsigned remaining = base_bio->bi_size; - sector_t sector = base_bio->bi_sector - io->target->begin; - atomic_inc(&io->pending); + INIT_WORK(&io->work, kcryptd_io); + queue_work(cc->io_queue, &io->work); +} - crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1); +static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, + int error, int async) +{ + struct bio *clone = io->ctx.bio_out; + struct crypt_config *cc = io->target->private; + + if (unlikely(error < 0)) { + crypt_free_buffer_pages(cc, clone); + bio_put(clone); + io->error = -EIO; + return; + } + + /* crypt_convert should have filled the clone bio */ + BUG_ON(io->ctx.idx_out < clone->bi_vcnt); + + clone->bi_sector = cc->start + io->sector; + io->sector += bio_sectors(clone); + + if (async) + kcryptd_queue_io(io); + else { + atomic_inc(&io->pending); + generic_make_request(clone); + } +} + +static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + struct bio *clone; + unsigned remaining = io->base_bio->bi_size; + int r; /* * The allocated buffers can be smaller than the whole bio, @@ -594,70 +687,110 @@ static void process_write(struct dm_crypt_io *io) while (remaining) { clone = crypt_alloc_buffer(io, remaining); if (unlikely(!clone)) { - crypt_dec_pending(io, -ENOMEM); + io->error = -ENOMEM; return; } - ctx.bio_out = clone; - ctx.idx_out = 0; + io->ctx.bio_out = clone; + io->ctx.idx_out = 0; - if (unlikely(crypt_convert(cc, &ctx) < 0)) { - crypt_free_buffer_pages(cc, clone); - bio_put(clone); - crypt_dec_pending(io, -EIO); - return; - } - - /* crypt_convert should have filled the clone bio */ - BUG_ON(ctx.idx_out < clone->bi_vcnt); - - clone->bi_sector = cc->start + sector; remaining -= clone->bi_size; - sector += bio_sectors(clone); - /* Grab another reference to the io struct - * before we kick off the request */ - if (remaining) - atomic_inc(&io->pending); + r = crypt_convert(cc, &io->ctx); - generic_make_request(clone); - - /* Do not reference clone after this - it - * may be gone already. 
*/ + if (r != -EINPROGRESS) { + kcryptd_crypt_write_io_submit(io, r, 0); + if (unlikely(r < 0)) + return; + } else + atomic_inc(&io->pending); /* out of memory -> run queues */ - if (remaining) + if (unlikely(remaining)) congestion_wait(WRITE, HZ/100); } } -static void process_read_endio(struct dm_crypt_io *io) +static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->target->private; - struct convert_context ctx; - crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio, - io->base_bio->bi_sector - io->target->begin, 0); + /* + * Prevent io from disappearing until this function completes. + */ + atomic_inc(&io->pending); + + crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector); + kcryptd_crypt_write_convert_loop(io); - crypt_dec_pending(io, crypt_convert(cc, &ctx)); + crypt_dec_pending(io); } -static void kcryptd_do_work(struct work_struct *work) +static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error) { - struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + if (unlikely(error < 0)) + io->error = -EIO; + + crypt_dec_pending(io); +} + +static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + int r = 0; + + atomic_inc(&io->pending); + + crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, + io->sector); + + r = crypt_convert(cc, &io->ctx); + + if (r != -EINPROGRESS) + kcryptd_crypt_read_done(io, r); + + crypt_dec_pending(io); +} + +static void kcryptd_async_done(struct crypto_async_request *async_req, + int error) +{ + struct convert_context *ctx = async_req->data; + struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); + struct crypt_config *cc = io->target->private; + + if (error == -EINPROGRESS) { + complete(&ctx->restart); + return; + } + + mempool_free(ablkcipher_request_cast(async_req), cc->req_pool); + + if (!atomic_dec_and_test(&ctx->pending)) + return; if (bio_data_dir(io->base_bio) == READ) - process_read(io); + kcryptd_crypt_read_done(io, error); + else + kcryptd_crypt_write_io_submit(io, error, 1); } -static void kcryptd_do_crypt(struct work_struct *work) +static void kcryptd_crypt(struct work_struct *work) { struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); if (bio_data_dir(io->base_bio) == READ) - process_read_endio(io); + kcryptd_crypt_read_convert(io); else - process_write(io); + kcryptd_crypt_write_convert(io); +} + +static void kcryptd_queue_crypt(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + + INIT_WORK(&io->work, kcryptd_crypt); + queue_work(cc->crypt_queue, &io->work); } /* @@ -733,7 +866,7 @@ static int crypt_wipe_key(struct crypt_config *cc) static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; - struct crypto_blkcipher *tfm; + struct crypto_ablkcipher *tfm; char *tmp; char *cipher; char *chainmode; @@ -787,7 +920,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_cipher; } - tfm = crypto_alloc_blkcipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0); if (IS_ERR(tfm)) { ti->error = "Error allocating crypto tfm"; goto bad_cipher; @@ -821,7 +954,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) goto bad_ivmode; - cc->iv_size = crypto_blkcipher_ivsize(tfm); + cc->iv_size = crypto_ablkcipher_ivsize(tfm); if (cc->iv_size) /* at least a 64 bit sector number should 
fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -841,6 +974,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_slab_pool; } + cc->dmreq_start = sizeof(struct ablkcipher_request); + cc->dmreq_start += crypto_ablkcipher_reqsize(tfm); + cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); + cc->dmreq_start += crypto_ablkcipher_alignmask(tfm) & + ~(crypto_tfm_ctx_alignment() - 1); + + cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + + sizeof(struct dm_crypt_request) + cc->iv_size); + if (!cc->req_pool) { + ti->error = "Cannot allocate crypt request mempool"; + goto bad_req_pool; + } + cc->req = NULL; + cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; @@ -853,7 +1000,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_bs; } - if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) { + if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) { ti->error = "Error setting key"; goto bad_device; } @@ -914,12 +1061,14 @@ bad_device: bad_bs: mempool_destroy(cc->page_pool); bad_page_pool: + mempool_destroy(cc->req_pool); +bad_req_pool: mempool_destroy(cc->io_pool); bad_slab_pool: if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); bad_ivmode: - crypto_free_blkcipher(tfm); + crypto_free_ablkcipher(tfm); bad_cipher: /* Must zero key material before freeing */ memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); @@ -934,14 +1083,18 @@ static void crypt_dtr(struct dm_target *ti) destroy_workqueue(cc->io_queue); destroy_workqueue(cc->crypt_queue); + if (cc->req) + mempool_free(cc->req, cc->req_pool); + bioset_free(cc->bs); mempool_destroy(cc->page_pool); + mempool_destroy(cc->req_pool); mempool_destroy(cc->io_pool); kfree(cc->iv_mode); if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); - crypto_free_blkcipher(cc->tfm); + crypto_free_ablkcipher(cc->tfm); dm_put_device(ti, cc->dev); /* Must zero key material before freeing */ @@ -958,6 +1111,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, io = mempool_alloc(cc->io_pool, GFP_NOIO); io->target = ti; io->base_bio = bio; + io->sector = bio->bi_sector - ti->begin; io->error = 0; atomic_set(&io->pending, 0); diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 8fe81e1807e..5bbce29f143 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -449,7 +449,7 @@ static void persistent_destroy(struct exception_store *store) static int persistent_read_metadata(struct exception_store *store) { - int r, new_snapshot; + int r, uninitialized_var(new_snapshot); struct pstore *ps = get_info(store); /* diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 9627fa0f947..b262c0042de 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/dm-ioctl.h> #include <linux/hdreg.h> +#include <linux/compat.h> #include <asm/uaccess.h> @@ -702,7 +703,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) int r; char *new_name = (char *) param + param->data_start; - if (new_name < (char *) param->data || + if (new_name < param->data || invalid_str(new_name, (void *) param + param_size)) { DMWARN("Invalid new logical volume name supplied."); return -EINVAL; @@ -728,7 +729,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - if (geostr < (char *) 
param->data || + if (geostr < param->data || invalid_str(geostr, (void *) param + param_size)) { DMWARN("Invalid geometry supplied."); goto out; @@ -1350,10 +1351,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) { struct dm_ioctl tmp, *dmi; - if (copy_from_user(&tmp, user, sizeof(tmp))) + if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) return -EFAULT; - if (tmp.data_size < sizeof(tmp)) + if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) return -EINVAL; dmi = vmalloc(tmp.data_size); @@ -1397,13 +1398,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param) return 0; } -static int ctl_ioctl(struct inode *inode, struct file *file, - uint command, ulong u) +static int ctl_ioctl(uint command, struct dm_ioctl __user *user) { int r = 0; unsigned int cmd; - struct dm_ioctl *param; - struct dm_ioctl __user *user = (struct dm_ioctl __user *) u; + struct dm_ioctl *uninitialized_var(param); ioctl_fn fn = NULL; size_t param_size; @@ -1471,8 +1470,23 @@ static int ctl_ioctl(struct inode *inode, struct file *file, return r; } +static long dm_ctl_ioctl(struct file *file, uint command, ulong u) +{ + return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u); +} + +#ifdef CONFIG_COMPAT +static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u) +{ + return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u)); +} +#else +#define dm_compat_ctl_ioctl NULL +#endif + static const struct file_operations _ctl_fops = { - .ioctl = ctl_ioctl, + .unlocked_ioctl = dm_ctl_ioctl, + .compat_ioctl = dm_compat_ctl_ioctl, .owner = THIS_MODULE, }; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 072ee4353ea..2a74b2142f5 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -41,7 +41,7 @@ int dm_unregister_dirty_log_type(struct dirty_log_type *type) return 0; } -static struct dirty_log_type *get_type(const char *type_name) +static struct dirty_log_type *_get_type(const char *type_name) { struct dirty_log_type *type; @@ -61,6 +61,55 @@ static struct dirty_log_type *get_type(const char *type_name) return NULL; } +/* + * get_type + * @type_name + * + * Attempt to retrieve the dirty_log_type by name. If not already + * available, attempt to load the appropriate module. + * + * Log modules are named "dm-log-" followed by the 'type_name'. + * Modules may contain multiple types. + * This function will first try the module "dm-log-<type_name>", + * then truncate 'type_name' on the last '-' and try again. + * + * For example, if type_name was "clustered-disk", it would search + * 'dm-log-clustered-disk' then 'dm-log-clustered'. 
+ * + * Returns: dirty_log_type* on success, NULL on failure + */ +static struct dirty_log_type *get_type(const char *type_name) +{ + char *p, *type_name_dup; + struct dirty_log_type *type; + + type = _get_type(type_name); + if (type) + return type; + + type_name_dup = kstrdup(type_name, GFP_KERNEL); + if (!type_name_dup) { + DMWARN("No memory left to attempt log module load for \"%s\"", + type_name); + return NULL; + } + + while (request_module("dm-log-%s", type_name_dup) || + !(type = _get_type(type_name))) { + p = strrchr(type_name_dup, '-'); + if (!p) + break; + p[0] = '\0'; + } + + if (!type) + DMWARN("Module for logging type \"%s\" not found.", type_name); + + kfree(type_name_dup); + + return type; +} + static void put_type(struct dirty_log_type *type) { spin_lock(&_lock); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 24b2b1e32fa..e7ee59e655d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -106,7 +106,7 @@ typedef int (*action_fn) (struct pgpath *pgpath); static struct kmem_cache *_mpio_cache; -struct workqueue_struct *kmultipathd; +static struct workqueue_struct *kmultipathd; static void process_queued_ios(struct work_struct *work); static void trigger_event(struct work_struct *work); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 31123d4a6b9..edc057f5cdc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -6,6 +6,7 @@ #include "dm.h" #include "dm-bio-list.h" +#include "dm-bio-record.h" #include "dm-io.h" #include "dm-log.h" #include "kcopyd.h" @@ -20,6 +21,7 @@ #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/log2.h> +#include <linux/hardirq.h> #define DM_MSG_PREFIX "raid1" #define DM_IO_PAGES 64 @@ -113,9 +115,16 @@ struct region { /*----------------------------------------------------------------- * Mirror set structures. *---------------------------------------------------------------*/ +enum dm_raid1_error { + DM_RAID1_WRITE_ERROR, + DM_RAID1_SYNC_ERROR, + DM_RAID1_READ_ERROR +}; + struct mirror { struct mirror_set *ms; atomic_t error_count; + uint32_t error_type; struct dm_dev *dev; sector_t offset; }; @@ -127,21 +136,25 @@ struct mirror_set { struct kcopyd_client *kcopyd_client; uint64_t features; - spinlock_t lock; /* protects the next two lists */ + spinlock_t lock; /* protects the lists */ struct bio_list reads; struct bio_list writes; + struct bio_list failures; struct dm_io_client *io_client; + mempool_t *read_record_pool; /* recovery */ region_t nr_regions; int in_sync; int log_failure; + atomic_t suspend; - struct mirror *default_mirror; /* Default mirror */ + atomic_t default_mirror; /* Default mirror */ struct workqueue_struct *kmirrord_wq; struct work_struct kmirrord_work; + struct work_struct trigger_event; unsigned int nr_mirrors; struct mirror mirror[0]; @@ -362,6 +375,16 @@ static void complete_resync_work(struct region *reg, int success) struct region_hash *rh = reg->rh; rh->log->type->set_region_sync(rh->log, reg->key, success); + + /* + * Dispatch the bios before we call 'wake_up_all'. + * This is important because if we are suspending, + * we want to know that recovery is complete and + * the work queue is flushed. If we wake_up_all + * before we dispatch_bios (queue bios and call wake()), + * then we risk suspending before the work queue + * has been properly flushed. 
+ */ dispatch_bios(rh->ms, ®->delayed_bios); if (atomic_dec_and_test(&rh->recovery_in_flight)) wake_up_all(&_kmirrord_recovery_stopped); @@ -626,24 +649,101 @@ static void rh_start_recovery(struct region_hash *rh) wake(rh->ms); } +#define MIN_READ_RECORDS 20 +struct dm_raid1_read_record { + struct mirror *m; + struct dm_bio_details details; +}; + /* * Every mirror should look like this one. */ #define DEFAULT_MIRROR 0 /* - * This is yucky. We squirrel the mirror_set struct away inside - * bi_next for write buffers. This is safe since the bh + * This is yucky. We squirrel the mirror struct away inside + * bi_next for read/write buffers. This is safe since the bh * doesn't get submitted to the lower levels of block layer. */ -static struct mirror_set *bio_get_ms(struct bio *bio) +static struct mirror *bio_get_m(struct bio *bio) +{ + return (struct mirror *) bio->bi_next; +} + +static void bio_set_m(struct bio *bio, struct mirror *m) +{ + bio->bi_next = (struct bio *) m; +} + +static struct mirror *get_default_mirror(struct mirror_set *ms) { - return (struct mirror_set *) bio->bi_next; + return &ms->mirror[atomic_read(&ms->default_mirror)]; } -static void bio_set_ms(struct bio *bio, struct mirror_set *ms) +static void set_default_mirror(struct mirror *m) { - bio->bi_next = (struct bio *) ms; + struct mirror_set *ms = m->ms; + struct mirror *m0 = &(ms->mirror[0]); + + atomic_set(&ms->default_mirror, m - m0); +} + +/* fail_mirror + * @m: mirror device to fail + * @error_type: one of the enum's, DM_RAID1_*_ERROR + * + * If errors are being handled, record the type of + * error encountered for this device. If this type + * of error has already been recorded, we can return; + * otherwise, we must signal userspace by triggering + * an event. Additionally, if the device is the + * primary device, we must choose a new primary, but + * only if the mirror is in-sync. + * + * This function must not block. + */ +static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) +{ + struct mirror_set *ms = m->ms; + struct mirror *new; + + if (!errors_handled(ms)) + return; + + /* + * error_count is used for nothing more than a + * simple way to tell if a device has encountered + * errors. + */ + atomic_inc(&m->error_count); + + if (test_and_set_bit(error_type, &m->error_type)) + return; + + if (m != get_default_mirror(ms)) + goto out; + + if (!ms->in_sync) { + /* + * Better to issue requests to same failing device + * than to risk returning corrupt data. + */ + DMERR("Primary mirror (%s) failed while out-of-sync: " + "Reads may fail.", m->dev->name); + goto out; + } + + for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) + if (!atomic_read(&new->error_count)) { + set_default_mirror(new); + break; + } + + if (unlikely(new == ms->mirror + ms->nr_mirrors)) + DMWARN("All sides of mirror have failed."); + +out: + schedule_work(&ms->trigger_event); } /*----------------------------------------------------------------- @@ -656,15 +756,32 @@ static void bio_set_ms(struct bio *bio, struct mirror_set *ms) static void recovery_complete(int read_err, unsigned int write_err, void *context) { - struct region *reg = (struct region *) context; + struct region *reg = (struct region *)context; + struct mirror_set *ms = reg->rh->ms; + int m, bit = 0; - if (read_err) + if (read_err) { /* Read error means the failure of default mirror. 
*/ DMERR_LIMIT("Unable to read primary mirror during recovery"); + fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR); + } - if (write_err) + if (write_err) { DMERR_LIMIT("Write error during recovery (error = 0x%x)", write_err); + /* + * Bits correspond to devices (excluding default mirror). + * The default mirror cannot change during recovery. + */ + for (m = 0; m < ms->nr_mirrors; m++) { + if (&ms->mirror[m] == get_default_mirror(ms)) + continue; + if (test_bit(bit, &write_err)) + fail_mirror(ms->mirror + m, + DM_RAID1_SYNC_ERROR); + bit++; + } + } rh_recovery_end(reg, !(read_err || write_err)); } @@ -678,7 +795,7 @@ static int recover(struct mirror_set *ms, struct region *reg) unsigned long flags = 0; /* fill in the source */ - m = ms->default_mirror; + m = get_default_mirror(ms); from.bdev = m->dev->bdev; from.sector = m->offset + region_to_sector(reg->rh, reg->key); if (reg->key == (ms->nr_regions - 1)) { @@ -694,7 +811,7 @@ static int recover(struct mirror_set *ms, struct region *reg) /* fill in the destinations */ for (i = 0, dest = to; i < ms->nr_mirrors; i++) { - if (&ms->mirror[i] == ms->default_mirror) + if (&ms->mirror[i] == get_default_mirror(ms)) continue; m = ms->mirror + i; @@ -748,17 +865,105 @@ static void do_recovery(struct mirror_set *ms) *---------------------------------------------------------------*/ static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) { - /* FIXME: add read balancing */ - return ms->default_mirror; + struct mirror *m = get_default_mirror(ms); + + do { + if (likely(!atomic_read(&m->error_count))) + return m; + + if (m-- == ms->mirror) + m += ms->nr_mirrors; + } while (m != get_default_mirror(ms)); + + return NULL; +} + +static int default_ok(struct mirror *m) +{ + struct mirror *default_mirror = get_default_mirror(m->ms); + + return !atomic_read(&default_mirror->error_count); +} + +static int mirror_available(struct mirror_set *ms, struct bio *bio) +{ + region_t region = bio_to_region(&ms->rh, bio); + + if (ms->rh.log->type->in_sync(ms->rh.log, region, 0)) + return choose_mirror(ms, bio->bi_sector) ? 1 : 0; + + return 0; } /* * remap a buffer to a particular mirror. */ -static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) +static sector_t map_sector(struct mirror *m, struct bio *bio) +{ + return m->offset + (bio->bi_sector - m->ms->ti->begin); +} + +static void map_bio(struct mirror *m, struct bio *bio) { bio->bi_bdev = m->dev->bdev; - bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); + bio->bi_sector = map_sector(m, bio); +} + +static void map_region(struct io_region *io, struct mirror *m, + struct bio *bio) +{ + io->bdev = m->dev->bdev; + io->sector = map_sector(m, bio); + io->count = bio->bi_size >> 9; +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static void read_callback(unsigned long error, void *context) +{ + struct bio *bio = context; + struct mirror *m; + + m = bio_get_m(bio); + bio_set_m(bio, NULL); + + if (likely(!error)) { + bio_endio(bio, 0); + return; + } + + fail_mirror(m, DM_RAID1_READ_ERROR); + + if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { + DMWARN_LIMIT("Read failure on mirror device %s. " + "Trying alternative device.", + m->dev->name); + queue_bio(m->ms, bio, bio_rw(bio)); + return; + } + + DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", + m->dev->name); + bio_endio(bio, -EIO); +} + +/* Asynchronous read. 
*/ +static void read_async_bio(struct mirror *m, struct bio *bio) +{ + struct io_region io; + struct dm_io_request io_req = { + .bi_rw = READ, + .mem.type = DM_IO_BVEC, + .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, + .notify.fn = read_callback, + .notify.context = bio, + .client = m->ms->io_client, + }; + + map_region(&io, m, bio); + bio_set_m(bio, m); + (void) dm_io(&io_req, 1, &io, NULL); } static void do_reads(struct mirror_set *ms, struct bio_list *reads) @@ -769,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) while ((bio = bio_list_pop(reads))) { region = bio_to_region(&ms->rh, bio); + m = get_default_mirror(ms); /* * We can only read balance if the region is in sync. */ - if (rh_in_sync(&ms->rh, region, 1)) + if (likely(rh_in_sync(&ms->rh, region, 1))) m = choose_mirror(ms, bio->bi_sector); - else - m = ms->default_mirror; + else if (m && atomic_read(&m->error_count)) + m = NULL; - map_bio(ms, m, bio); - generic_make_request(bio); + if (likely(m)) + read_async_bio(m, bio); + else + bio_endio(bio, -EIO); } } @@ -793,15 +1001,70 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) * RECOVERING: delay the io until recovery completes * NOSYNC: increment pending, just write to the default mirror *---------------------------------------------------------------*/ + +/* __bio_mark_nosync + * @ms + * @bio + * @done + * @error + * + * The bio was written on some mirror(s) but failed on other mirror(s). + * We can successfully endio the bio but should avoid the region being + * marked clean by setting the state RH_NOSYNC. + * + * This function is _not_ safe in interrupt context! + */ +static void __bio_mark_nosync(struct mirror_set *ms, + struct bio *bio, unsigned done, int error) +{ + unsigned long flags; + struct region_hash *rh = &ms->rh; + struct dirty_log *log = ms->rh.log; + struct region *reg; + region_t region = bio_to_region(rh, bio); + int recovering = 0; + + /* We must inform the log that the sync count has changed. */ + log->type->set_region_sync(log, region, 0); + ms->in_sync = 0; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + /* region hash entry should exist because write was in-flight */ + BUG_ON(!reg); + BUG_ON(!list_empty(®->list)); + + spin_lock_irqsave(&rh->region_lock, flags); + /* + * Possible cases: + * 1) RH_DIRTY + * 2) RH_NOSYNC: was dirty, other preceeding writes failed + * 3) RH_RECOVERING: flushing pending writes + * Either case, the region should have not been connected to list. + */ + recovering = (reg->state == RH_RECOVERING); + reg->state = RH_NOSYNC; + BUG_ON(!list_empty(®->list)); + spin_unlock_irqrestore(&rh->region_lock, flags); + + bio_endio(bio, error); + if (recovering) + complete_resync_work(reg, 0); +} + static void write_callback(unsigned long error, void *context) { - unsigned int i; - int uptodate = 1; + unsigned i, ret = 0; struct bio *bio = (struct bio *) context; struct mirror_set *ms; + int uptodate = 0; + int should_wake = 0; + unsigned long flags; - ms = bio_get_ms(bio); - bio_set_ms(bio, NULL); + ms = bio_get_m(bio)->ms; + bio_set_m(bio, NULL); /* * NOTE: We don't decrement the pending count here, @@ -809,26 +1072,42 @@ static void write_callback(unsigned long error, void *context) * This way we handle both writes to SYNC and NOSYNC * regions with the same code. 
*/ + if (likely(!error)) + goto out; + + for (i = 0; i < ms->nr_mirrors; i++) + if (test_bit(i, &error)) + fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); + else + uptodate = 1; - if (error) { + if (unlikely(!uptodate)) { + DMERR("All replicated volumes dead, failing I/O"); + /* None of the writes succeeded, fail the I/O. */ + ret = -EIO; + } else if (errors_handled(ms)) { /* - * only error the io if all mirrors failed. - * FIXME: bogus + * Need to raise event. Since raising + * events can block, we need to do it in + * the main thread. */ - uptodate = 0; - for (i = 0; i < ms->nr_mirrors; i++) - if (!test_bit(i, &error)) { - uptodate = 1; - break; - } + spin_lock_irqsave(&ms->lock, flags); + if (!ms->failures.head) + should_wake = 1; + bio_list_add(&ms->failures, bio); + spin_unlock_irqrestore(&ms->lock, flags); + if (should_wake) + wake(ms); + return; } - bio_endio(bio, 0); +out: + bio_endio(bio, ret); } static void do_write(struct mirror_set *ms, struct bio *bio) { unsigned int i; - struct io_region io[KCOPYD_MAX_REGIONS+1]; + struct io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; struct dm_io_request io_req = { .bi_rw = WRITE, @@ -839,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio) .client = ms->io_client, }; - for (i = 0; i < ms->nr_mirrors; i++) { - m = ms->mirror + i; - - io[i].bdev = m->dev->bdev; - io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); - io[i].count = bio->bi_size >> 9; - } + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) + map_region(dest++, m, bio); - bio_set_ms(bio, ms); + /* + * Use default mirror because we only need it to retrieve the reference + * to the mirror set in write_callback(). + */ + bio_set_m(bio, get_default_mirror(ms)); (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); } @@ -900,43 +1178,125 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) /* * Dispatch io. */ - if (unlikely(ms->log_failure)) + if (unlikely(ms->log_failure)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->failures, &sync); + spin_unlock_irq(&ms->lock); + } else while ((bio = bio_list_pop(&sync))) - bio_endio(bio, -EIO); - else while ((bio = bio_list_pop(&sync))) - do_write(ms, bio); + do_write(ms, bio); while ((bio = bio_list_pop(&recover))) rh_delay(&ms->rh, bio); while ((bio = bio_list_pop(&nosync))) { - map_bio(ms, ms->default_mirror, bio); + map_bio(get_default_mirror(ms), bio); generic_make_request(bio); } } +static void do_failures(struct mirror_set *ms, struct bio_list *failures) +{ + struct bio *bio; + + if (!failures->head) + return; + + if (!ms->log_failure) { + while ((bio = bio_list_pop(failures))) + __bio_mark_nosync(ms, bio, bio->bi_size, 0); + return; + } + + /* + * If the log has failed, unattempted writes are being + * put on the failures list. We can't issue those writes + * until a log has been marked, so we must store them. + * + * If a 'noflush' suspend is in progress, we can requeue + * the I/O's to the core. This give userspace a chance + * to reconfigure the mirror, at which point the core + * will reissue the writes. If the 'noflush' flag is + * not set, we have no choice but to return errors. + * + * Some writes on the failures list may have been + * submitted before the log failure and represent a + * failure to write to one of the devices. It is ok + * for us to treat them the same and requeue them + * as well. 
+ */ + if (dm_noflush_suspending(ms->ti)) { + while ((bio = bio_list_pop(failures))) + bio_endio(bio, DM_ENDIO_REQUEUE); + return; + } + + if (atomic_read(&ms->suspend)) { + while ((bio = bio_list_pop(failures))) + bio_endio(bio, -EIO); + return; + } + + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->failures, failures); + spin_unlock_irq(&ms->lock); + + wake(ms); +} + +static void trigger_event(struct work_struct *work) +{ + struct mirror_set *ms = + container_of(work, struct mirror_set, trigger_event); + + dm_table_event(ms->ti->table); +} + /*----------------------------------------------------------------- * kmirrord *---------------------------------------------------------------*/ -static void do_mirror(struct work_struct *work) +static int _do_mirror(struct work_struct *work) { struct mirror_set *ms =container_of(work, struct mirror_set, kmirrord_work); - struct bio_list reads, writes; + struct bio_list reads, writes, failures; + unsigned long flags; - spin_lock(&ms->lock); + spin_lock_irqsave(&ms->lock, flags); reads = ms->reads; writes = ms->writes; + failures = ms->failures; bio_list_init(&ms->reads); bio_list_init(&ms->writes); - spin_unlock(&ms->lock); + bio_list_init(&ms->failures); + spin_unlock_irqrestore(&ms->lock, flags); rh_update_states(&ms->rh); do_recovery(ms); do_reads(ms, &reads); do_writes(ms, &writes); + do_failures(ms, &failures); + + return (ms->failures.head) ? 1 : 0; } +static void do_mirror(struct work_struct *work) +{ + /* + * If _do_mirror returns 1, we give it + * another shot. This helps for cases like + * 'suspend' where we call flush_workqueue + * and expect all work to be finished. If + * a failure happens during a suspend, we + * couldn't issue a 'wake' because it would + * not be honored. Therefore, we return '1' + * from _do_mirror, and retry here. 
+ */ + while (_do_mirror(work)) + schedule(); +} + + /*----------------------------------------------------------------- * Target functions *---------------------------------------------------------------*/ @@ -965,11 +1325,23 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, ms->nr_mirrors = nr_mirrors; ms->nr_regions = dm_sector_div_up(ti->len, region_size); ms->in_sync = 0; - ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; + ms->log_failure = 0; + atomic_set(&ms->suspend, 0); + atomic_set(&ms->default_mirror, DEFAULT_MIRROR); + + len = sizeof(struct dm_raid1_read_record); + ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS, + len); + if (!ms->read_record_pool) { + ti->error = "Error creating mirror read_record_pool"; + kfree(ms); + return NULL; + } ms->io_client = dm_io_client_create(DM_IO_PAGES); if (IS_ERR(ms->io_client)) { ti->error = "Error creating dm_io client"; + mempool_destroy(ms->read_record_pool); kfree(ms); return NULL; } @@ -977,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { ti->error = "Error creating dirty region hash"; dm_io_client_destroy(ms->io_client); + mempool_destroy(ms->read_record_pool); kfree(ms); return NULL; } @@ -992,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti, dm_io_client_destroy(ms->io_client); rh_exit(&ms->rh); + mempool_destroy(ms->read_record_pool); kfree(ms); } @@ -1019,6 +1393,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, } ms->mirror[mirror].ms = ms; + atomic_set(&(ms->mirror[mirror].error_count), 0); + ms->mirror[mirror].error_type = 0; ms->mirror[mirror].offset = offset; return 0; @@ -1171,6 +1547,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err_free_context; } INIT_WORK(&ms->kmirrord_work, do_mirror); + INIT_WORK(&ms->trigger_event, trigger_event); r = parse_features(ms, argc, argv, &args_used); if (r) @@ -1220,14 +1597,15 @@ static void mirror_dtr(struct dm_target *ti) static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) { + unsigned long flags; int should_wake = 0; struct bio_list *bl; bl = (rw == WRITE) ? &ms->writes : &ms->reads; - spin_lock(&ms->lock); + spin_lock_irqsave(&ms->lock, flags); should_wake = !(bl->head); bio_list_add(bl, bio); - spin_unlock(&ms->lock); + spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) wake(ms); @@ -1242,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, int r, rw = bio_rw(bio); struct mirror *m; struct mirror_set *ms = ti->private; - - map_context->ll = bio_to_region(&ms->rh, bio); + struct dm_raid1_read_record *read_record = NULL; if (rw == WRITE) { + /* Save region for mirror_end_io() handler */ + map_context->ll = bio_to_region(&ms->rh, bio); queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; } @@ -1255,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, if (r < 0 && r != -EWOULDBLOCK) return r; - if (r == -EWOULDBLOCK) /* FIXME: ugly */ - r = DM_MAPIO_SUBMITTED; - /* - * We don't want to fast track a recovery just for a read - * ahead. So we just let it silently fail. - * FIXME: get rid of this. + * If region is not in-sync queue the bio. 
*/ - if (!r && rw == READA) - return -EIO; + if (!r || (r == -EWOULDBLOCK)) { + if (rw == READA) + return -EWOULDBLOCK; - if (!r) { - /* Pass this io over to the daemon */ queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; } + /* + * The region is in-sync and we can perform reads directly. + * Store enough information so we can retry if it fails. + */ m = choose_mirror(ms, bio->bi_sector); - if (!m) + if (unlikely(!m)) return -EIO; - map_bio(ms, m, bio); + read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); + if (likely(read_record)) { + dm_bio_record(&read_record->details, bio); + map_context->ptr = read_record; + read_record->m = m; + } + + map_bio(m, bio); + return DM_MAPIO_REMAPPED; } @@ -1285,71 +1670,173 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, { int rw = bio_rw(bio); struct mirror_set *ms = (struct mirror_set *) ti->private; - region_t region = map_context->ll; + struct mirror *m = NULL; + struct dm_bio_details *bd = NULL; + struct dm_raid1_read_record *read_record = map_context->ptr; /* * We need to dec pending if this was a write. */ - if (rw == WRITE) - rh_dec(&ms->rh, region); + if (rw == WRITE) { + rh_dec(&ms->rh, map_context->ll); + return error; + } - return 0; + if (error == -EOPNOTSUPP) + goto out; + + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + goto out; + + if (unlikely(error)) { + if (!read_record) { + /* + * There wasn't enough memory to record necessary + * information for a retry or there was no other + * mirror in-sync. + */ + DMERR_LIMIT("Mirror read failed from %s.", + m->dev->name); + return -EIO; + } + DMERR("Mirror read failed from %s. Trying alternative device.", + m->dev->name); + + m = read_record->m; + fail_mirror(m, DM_RAID1_READ_ERROR); + + /* + * A failed read is requeued for another attempt using an intact + * mirror. + */ + if (default_ok(m) || mirror_available(ms, bio)) { + bd = &read_record->details; + + dm_bio_restore(bd, bio); + mempool_free(read_record, ms->read_record_pool); + map_context->ptr = NULL; + queue_bio(ms, bio, rw); + return 1; + } + DMERR("All replicated volumes dead, failing I/O"); + } + +out: + if (read_record) { + mempool_free(read_record, ms->read_record_pool); + map_context->ptr = NULL; + } + + return error; } -static void mirror_postsuspend(struct dm_target *ti) +static void mirror_presuspend(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; struct dirty_log *log = ms->rh.log; + atomic_set(&ms->suspend, 1); + + /* + * We must finish up all the work that we've + * generated (i.e. recovery work). + */ rh_stop_recovery(&ms->rh); - /* Wait for all I/O we generated to complete */ wait_event(_kmirrord_recovery_stopped, !atomic_read(&ms->rh.recovery_in_flight)); + if (log->type->presuspend && log->type->presuspend(log)) + /* FIXME: need better error handling */ + DMWARN("log presuspend failed"); + + /* + * Now that recovery is complete/stopped and the + * delayed bios are queued, we need to wait for + * the worker thread to complete. This way, + * we know that all of our I/O has been pushed. 
+ */ + flush_workqueue(ms->kmirrord_wq); +} + +static void mirror_postsuspend(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + struct dirty_log *log = ms->rh.log; + if (log->type->postsuspend && log->type->postsuspend(log)) /* FIXME: need better error handling */ - DMWARN("log suspend failed"); + DMWARN("log postsuspend failed"); } static void mirror_resume(struct dm_target *ti) { - struct mirror_set *ms = (struct mirror_set *) ti->private; + struct mirror_set *ms = ti->private; struct dirty_log *log = ms->rh.log; + + atomic_set(&ms->suspend, 0); if (log->type->resume && log->type->resume(log)) /* FIXME: need better error handling */ DMWARN("log resume failed"); rh_start_recovery(&ms->rh); } +/* + * device_status_char + * @m: mirror device/leg we want the status of + * + * We return one character representing the most severe error + * we have encountered. + * A => Alive - No failures + * D => Dead - A write failure occurred leaving mirror out-of-sync + * S => Sync - A sychronization failure occurred, mirror out-of-sync + * R => Read - A read failure occurred, mirror data unaffected + * + * Returns: <char> + */ +static char device_status_char(struct mirror *m) +{ + if (!atomic_read(&(m->error_count))) + return 'A'; + + return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : + (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : + (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; +} + + static int mirror_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { unsigned int m, sz = 0; struct mirror_set *ms = (struct mirror_set *) ti->private; + struct dirty_log *log = ms->rh.log; + char buffer[ms->nr_mirrors + 1]; switch (type) { case STATUSTYPE_INFO: DMEMIT("%d ", ms->nr_mirrors); - for (m = 0; m < ms->nr_mirrors; m++) + for (m = 0; m < ms->nr_mirrors; m++) { DMEMIT("%s ", ms->mirror[m].dev->name); + buffer[m] = device_status_char(&(ms->mirror[m])); + } + buffer[m] = '\0'; - DMEMIT("%llu/%llu 0 ", - (unsigned long long)ms->rh.log->type-> - get_sync_count(ms->rh.log), - (unsigned long long)ms->nr_regions); + DMEMIT("%llu/%llu 1 %s ", + (unsigned long long)log->type->get_sync_count(ms->rh.log), + (unsigned long long)ms->nr_regions, buffer); - sz += ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz); + sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz); break; case STATUSTYPE_TABLE: - sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); + sz = log->type->status(ms->rh.log, type, result, maxlen); DMEMIT("%d", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) DMEMIT(" %s %llu", ms->mirror[m].dev->name, - (unsigned long long)ms->mirror[m].offset); + (unsigned long long)ms->mirror[m].offset); if (ms->features & DM_RAID1_HANDLE_ERRORS) DMEMIT(" 1 handle_errors"); @@ -1360,12 +1847,13 @@ static int mirror_status(struct dm_target *ti, status_type_t type, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 0, 3}, + .version = {1, 0, 20}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, .map = mirror_map, .end_io = mirror_end_io, + .presuspend = mirror_presuspend, .postsuspend = mirror_postsuspend, .resume = mirror_resume, .status = mirror_status, diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index cee16fadd9e..ae24eab8cd8 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -213,11 +213,15 @@ static void unregister_snapshot(struct dm_snapshot *s) /* * Implementation of the exception hash tables. 
+ * The lowest hash_shift bits of the chunk number are ignored, allowing + * some consecutive chunks to be grouped together. */ -static int init_exception_table(struct exception_table *et, uint32_t size) +static int init_exception_table(struct exception_table *et, uint32_t size, + unsigned hash_shift) { unsigned int i; + et->hash_shift = hash_shift; et->hash_mask = size - 1; et->table = dm_vcalloc(size, sizeof(struct list_head)); if (!et->table) @@ -248,7 +252,7 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache * static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) { - return chunk & et->hash_mask; + return (chunk >> et->hash_shift) & et->hash_mask; } static void insert_exception(struct exception_table *eh, @@ -275,7 +279,8 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et, slot = &et->table[exception_hash(et, chunk)]; list_for_each_entry (e, slot, hash_list) - if (e->old_chunk == chunk) + if (chunk >= e->old_chunk && + chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) return e; return NULL; @@ -307,6 +312,49 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) mempool_free(pe, pending_pool); } +static void insert_completed_exception(struct dm_snapshot *s, + struct dm_snap_exception *new_e) +{ + struct exception_table *eh = &s->complete; + struct list_head *l; + struct dm_snap_exception *e = NULL; + + l = &eh->table[exception_hash(eh, new_e->old_chunk)]; + + /* Add immediately if this table doesn't support consecutive chunks */ + if (!eh->hash_shift) + goto out; + + /* List is ordered by old_chunk */ + list_for_each_entry_reverse(e, l, hash_list) { + /* Insert after an existing chunk? */ + if (new_e->old_chunk == (e->old_chunk + + dm_consecutive_chunk_count(e) + 1) && + new_e->new_chunk == (dm_chunk_number(e->new_chunk) + + dm_consecutive_chunk_count(e) + 1)) { + dm_consecutive_chunk_count_inc(e); + free_exception(new_e); + return; + } + + /* Insert before an existing chunk? */ + if (new_e->old_chunk == (e->old_chunk - 1) && + new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) { + dm_consecutive_chunk_count_inc(e); + e->old_chunk--; + e->new_chunk--; + free_exception(new_e); + return; + } + + if (new_e->old_chunk > e->old_chunk) + break; + } + +out: + list_add(&new_e->hash_list, e ? &e->hash_list : l); +} + int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) { struct dm_snap_exception *e; @@ -316,8 +364,12 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) return -ENOMEM; e->old_chunk = old; + + /* Consecutive_count is implicitly initialised to zero */ e->new_chunk = new; - insert_exception(&s->complete, e); + + insert_completed_exception(s, e); + return 0; } @@ -334,16 +386,6 @@ static int calc_max_buckets(void) } /* - * Rounds a number down to a power of 2. - */ -static uint32_t round_down(uint32_t n) -{ - while (n & (n - 1)) - n &= (n - 1); - return n; -} - -/* * Allocate room for a suitable hash table. 
*/ static int init_hash_tables(struct dm_snapshot *s) @@ -361,9 +403,9 @@ static int init_hash_tables(struct dm_snapshot *s) hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; hash_size = min(hash_size, max_buckets); - /* Round it down to a power of 2 */ - hash_size = round_down(hash_size); - if (init_exception_table(&s->complete, hash_size)) + hash_size = rounddown_pow_of_two(hash_size); + if (init_exception_table(&s->complete, hash_size, + DM_CHUNK_CONSECUTIVE_BITS)) return -ENOMEM; /* @@ -374,7 +416,7 @@ static int init_hash_tables(struct dm_snapshot *s) if (hash_size < 64) hash_size = 64; - if (init_exception_table(&s->pending, hash_size)) { + if (init_exception_table(&s->pending, hash_size, 0)) { exit_exception_table(&s->complete, exception_cache); return -ENOMEM; } @@ -733,7 +775,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) * Add a proper exception, and remove the * in-flight exception from the list. */ - insert_exception(&s->complete, e); + insert_completed_exception(s, e); out: remove_exception(&pe->e); @@ -867,11 +909,12 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) } static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, - struct bio *bio) + struct bio *bio, chunk_t chunk) { bio->bi_bdev = s->cow->bdev; - bio->bi_sector = chunk_to_sector(s, e->new_chunk) + - (bio->bi_sector & s->chunk_mask); + bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) + + (chunk - e->old_chunk)) + + (bio->bi_sector & s->chunk_mask); } static int snapshot_map(struct dm_target *ti, struct bio *bio, @@ -902,7 +945,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, /* If the block is already remapped - use that, else remap it */ e = lookup_exception(&s->complete, chunk); if (e) { - remap_exception(s, e, bio); + remap_exception(s, e, bio, chunk); goto out_unlock; } @@ -919,7 +962,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, goto out_unlock; } - remap_exception(s, &pe->e, bio); + remap_exception(s, &pe->e, bio, chunk); bio_list_add(&pe->snapshot_bios, bio); r = DM_MAPIO_SUBMITTED; @@ -1207,7 +1250,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -1218,7 +1261,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h index 650e0f1f51d..93bce5d4974 100644 --- a/drivers/md/dm-snap.h +++ b/drivers/md/dm-snap.h @@ -16,19 +16,22 @@ struct exception_table { uint32_t hash_mask; + unsigned hash_shift; struct list_head *table; }; /* * The snapshot code deals with largish chunks of the disk at a - * time. Typically 64k - 256k. + * time. Typically 32k - 512k. */ -/* FIXME: can we get away with limiting these to a uint32_t ? */ typedef sector_t chunk_t; /* * An exception is used where an old chunk of data has been * replaced by a new one. + * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number + * of chunks that follow contiguously. Remaining bits hold the number of the + * chunk within the device. 
*/ struct dm_snap_exception { struct list_head hash_list; @@ -38,6 +41,49 @@ struct dm_snap_exception { }; /* + * Funtions to manipulate consecutive chunks + */ +# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) +# define DM_CHUNK_CONSECUTIVE_BITS 8 +# define DM_CHUNK_NUMBER_BITS 56 + +static inline chunk_t dm_chunk_number(chunk_t chunk) +{ + return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); +} + +static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) +{ + return e->new_chunk >> DM_CHUNK_NUMBER_BITS; +} + +static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) +{ + e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); + + BUG_ON(!dm_consecutive_chunk_count(e)); +} + +# else +# define DM_CHUNK_CONSECUTIVE_BITS 0 + +static inline chunk_t dm_chunk_number(chunk_t chunk) +{ + return chunk; +} + +static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) +{ + return 0; +} + +static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) +{ +} + +# endif + +/* * Abstraction to handle the meta/layout of exception stores (the * COW device). */ diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 969944a8aba..4de90ab3968 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -14,10 +14,13 @@ #include <linux/log2.h> #define DM_MSG_PREFIX "striped" +#define DM_IO_ERROR_THRESHOLD 15 struct stripe { struct dm_dev *dev; sector_t physical_start; + + atomic_t error_count; }; struct stripe_c { @@ -30,9 +33,29 @@ struct stripe_c { uint32_t chunk_shift; sector_t chunk_mask; + /* Needed for handling events */ + struct dm_target *ti; + + /* Work struct used for triggering events*/ + struct work_struct kstriped_ws; + struct stripe stripe[0]; }; +static struct workqueue_struct *kstriped; + +/* + * An event is triggered whenever a drive + * drops out of a stripe volume. 
+ */ +static void trigger_event(struct work_struct *work) +{ + struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); + + dm_table_event(sc->ti->table); + +} + static inline struct stripe_c *alloc_context(unsigned int stripes) { size_t len; @@ -63,6 +86,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, return -ENXIO; sc->stripe[stripe].physical_start = start; + return 0; } @@ -135,6 +159,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } + INIT_WORK(&sc->kstriped_ws, trigger_event); + + /* Set pointer to dm target; used in trigger_event */ + sc->ti = ti; + sc->stripes = stripes; sc->stripe_width = width; ti->split_io = chunk_size; @@ -158,9 +187,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) kfree(sc); return r; } + atomic_set(&(sc->stripe[i].error_count), 0); } ti->private = sc; + return 0; } @@ -172,6 +203,7 @@ static void stripe_dtr(struct dm_target *ti) for (i = 0; i < sc->stripes; i++) dm_put_device(ti, sc->stripe[i].dev); + flush_workqueue(kstriped); kfree(sc); } @@ -190,16 +222,37 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, return DM_MAPIO_REMAPPED; } +/* + * Stripe status: + * + * INFO + * #stripes [stripe_name <stripe_name>] [group word count] + * [error count 'A|D' <error count 'A|D'>] + * + * TABLE + * #stripes [stripe chunk size] + * [stripe_name physical_start <stripe_name physical_start>] + * + */ + static int stripe_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { struct stripe_c *sc = (struct stripe_c *) ti->private; + char buffer[sc->stripes + 1]; unsigned int sz = 0; unsigned int i; switch (type) { case STATUSTYPE_INFO: - result[0] = '\0'; + DMEMIT("%d ", sc->stripes); + for (i = 0; i < sc->stripes; i++) { + DMEMIT("%s ", sc->stripe[i].dev->name); + buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ? + 'D' : 'A'; + } + buffer[i] = '\0'; + DMEMIT("1 %s", buffer); break; case STATUSTYPE_TABLE: @@ -213,13 +266,52 @@ static int stripe_status(struct dm_target *ti, return 0; } +static int stripe_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + unsigned i; + char major_minor[16]; + struct stripe_c *sc = ti->private; + + if (!error) + return 0; /* I/O complete */ + + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + return error; + + if (error == -EOPNOTSUPP) + return error; + + memset(major_minor, 0, sizeof(major_minor)); + sprintf(major_minor, "%d:%d", + bio->bi_bdev->bd_disk->major, + bio->bi_bdev->bd_disk->first_minor); + + /* + * Test to see which stripe drive triggered the event + * and increment error count for all stripes on that device. + * If the error count for a given device exceeds the threshold + * value we will no longer trigger any further events. 
+ */ + for (i = 0; i < sc->stripes; i++) + if (!strcmp(sc->stripe[i].dev->name, major_minor)) { + atomic_inc(&(sc->stripe[i].error_count)); + if (atomic_read(&(sc->stripe[i].error_count)) < + DM_IO_ERROR_THRESHOLD) + queue_work(kstriped, &sc->kstriped_ws); + } + + return error; +} + static struct target_type stripe_target = { .name = "striped", - .version= {1, 0, 2}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, .map = stripe_map, + .end_io = stripe_end_io, .status = stripe_status, }; @@ -231,6 +323,13 @@ int __init dm_stripe_init(void) if (r < 0) DMWARN("target registration failed"); + kstriped = create_singlethread_workqueue("kstriped"); + if (!kstriped) { + DMERR("failed to create workqueue kstriped"); + dm_unregister_target(&stripe_target); + return -ENOMEM; + } + return r; } @@ -239,5 +338,7 @@ void dm_stripe_exit(void) if (dm_unregister_target(&stripe_target)) DMWARN("target unregistration failed"); + destroy_workqueue(kstriped); + return; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 47818d8249c..f1606298238 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -287,9 +287,8 @@ static void free_devices(struct list_head *devices) { struct list_head *tmp, *next; - for (tmp = devices->next; tmp != devices; tmp = next) { + list_for_each_safe(tmp, next, devices) { struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); - next = tmp->next; kfree(dd); } } @@ -476,7 +475,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, int mode, struct dm_dev **result) { int r; - dev_t dev; + dev_t uninitialized_var(dev); struct dm_dev *dd; unsigned int major, minor; @@ -805,7 +804,7 @@ static int setup_indexes(struct dm_table *t) return -ENOMEM; /* set up internal nodes, bottom-up */ - for (i = t->depth - 2, total = 0; i >= 0; i--) { + for (i = t->depth - 2; i >= 0; i--) { t->index[i] = indexes; indexes += (KEYS_PER_NODE * t->counts[i]); setup_btree_index(i, t); @@ -993,12 +992,11 @@ int dm_table_resume_targets(struct dm_table *t) int dm_table_any_congested(struct dm_table *t, int bdi_bits) { - struct list_head *d, *devices; + struct dm_dev *dd; + struct list_head *devices = dm_table_get_devices(t); int r = 0; - devices = dm_table_get_devices(t); - for (d = devices->next; d != devices; d = d->next) { - struct dm_dev *dd = list_entry(d, struct dm_dev, list); + list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->bdev); r |= bdi_congested(&q->backing_dev_info, bdi_bits); } @@ -1008,10 +1006,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) void dm_table_unplug_all(struct dm_table *t) { - struct list_head *d, *devices = dm_table_get_devices(t); + struct dm_dev *dd; + struct list_head *devices = dm_table_get_devices(t); - for (d = devices->next; d != devices; d = d->next) { - struct dm_dev *dd = list_entry(d, struct dm_dev, list); + list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->bdev); blk_unplug(q); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f2d24eb3208..6617ce4af09 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -71,9 +71,22 @@ union map_info *dm_get_mapinfo(struct bio *bio) #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 +/* + * Work processed by per-device workqueue. 
+ */ +struct dm_wq_req { + enum { + DM_WQ_FLUSH_ALL, + DM_WQ_FLUSH_DEFERRED, + } type; + struct work_struct work; + struct mapped_device *md; + void *context; +}; + struct mapped_device { struct rw_semaphore io_lock; - struct semaphore suspend_lock; + struct mutex suspend_lock; spinlock_t pushback_lock; rwlock_t map_lock; atomic_t holders; @@ -96,6 +109,11 @@ struct mapped_device { struct bio_list pushback; /* + * Processing queue (flush/barriers) + */ + struct workqueue_struct *wq; + + /* * The current mapping. */ struct dm_table *map; @@ -181,7 +199,7 @@ static void local_exit(void) DMINFO("cleaned up"); } -int (*_inits[])(void) __initdata = { +static int (*_inits[])(void) __initdata = { local_init, dm_target_init, dm_linear_init, @@ -189,7 +207,7 @@ int (*_inits[])(void) __initdata = { dm_interface_init, }; -void (*_exits[])(void) = { +static void (*_exits[])(void) = { local_exit, dm_target_exit, dm_linear_exit, @@ -982,7 +1000,7 @@ static struct mapped_device *alloc_dev(int minor) } if (!try_module_get(THIS_MODULE)) - goto bad0; + goto bad_module_get; /* get a minor number for the dev */ if (minor == DM_ANY_MINOR) @@ -990,11 +1008,11 @@ static struct mapped_device *alloc_dev(int minor) else r = specific_minor(md, minor); if (r < 0) - goto bad1; + goto bad_minor; memset(md, 0, sizeof(*md)); init_rwsem(&md->io_lock); - init_MUTEX(&md->suspend_lock); + mutex_init(&md->suspend_lock); spin_lock_init(&md->pushback_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); @@ -1006,7 +1024,7 @@ static struct mapped_device *alloc_dev(int minor) md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) - goto bad1_free_minor; + goto bad_queue; md->queue->queuedata = md; md->queue->backing_dev_info.congested_fn = dm_any_congested; @@ -1017,11 +1035,11 @@ static struct mapped_device *alloc_dev(int minor) md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); if (!md->io_pool) - goto bad2; + goto bad_io_pool; md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); if (!md->tio_pool) - goto bad3; + goto bad_tio_pool; md->bs = bioset_create(16, 16); if (!md->bs) @@ -1029,7 +1047,7 @@ static struct mapped_device *alloc_dev(int minor) md->disk = alloc_disk(1); if (!md->disk) - goto bad4; + goto bad_disk; atomic_set(&md->pending, 0); init_waitqueue_head(&md->wait); @@ -1044,6 +1062,10 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); + md->wq = create_singlethread_workqueue("kdmflush"); + if (!md->wq) + goto bad_thread; + /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); @@ -1053,19 +1075,21 @@ static struct mapped_device *alloc_dev(int minor) return md; - bad4: +bad_thread: + put_disk(md->disk); +bad_disk: bioset_free(md->bs); - bad_no_bioset: +bad_no_bioset: mempool_destroy(md->tio_pool); - bad3: +bad_tio_pool: mempool_destroy(md->io_pool); - bad2: +bad_io_pool: blk_cleanup_queue(md->queue); - bad1_free_minor: +bad_queue: free_minor(minor); - bad1: +bad_minor: module_put(THIS_MODULE); - bad0: +bad_module_get: kfree(md); return NULL; } @@ -1080,6 +1104,7 @@ static void free_dev(struct mapped_device *md) unlock_fs(md); bdput(md->suspended_bdev); } + destroy_workqueue(md->wq); mempool_destroy(md->tio_pool); mempool_destroy(md->io_pool); bioset_free(md->bs); @@ -1259,20 +1284,91 @@ void dm_put(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_put); +static int dm_wait_for_completion(struct mapped_device *md) +{ + int r = 0; + + while (1) { + 
set_current_state(TASK_INTERRUPTIBLE); + + smp_mb(); + if (!atomic_read(&md->pending)) + break; + + if (signal_pending(current)) { + r = -EINTR; + break; + } + + io_schedule(); + } + set_current_state(TASK_RUNNING); + + return r; +} + /* * Process the deferred bios */ -static void __flush_deferred_io(struct mapped_device *md, struct bio *c) +static void __flush_deferred_io(struct mapped_device *md) { - struct bio *n; + struct bio *c; - while (c) { - n = c->bi_next; - c->bi_next = NULL; + while ((c = bio_list_pop(&md->deferred))) { if (__split_bio(md, c)) bio_io_error(c); - c = n; } + + clear_bit(DMF_BLOCK_IO, &md->flags); +} + +static void __merge_pushback_list(struct mapped_device *md) +{ + unsigned long flags; + + spin_lock_irqsave(&md->pushback_lock, flags); + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + bio_list_merge_head(&md->deferred, &md->pushback); + bio_list_init(&md->pushback); + spin_unlock_irqrestore(&md->pushback_lock, flags); +} + +static void dm_wq_work(struct work_struct *work) +{ + struct dm_wq_req *req = container_of(work, struct dm_wq_req, work); + struct mapped_device *md = req->md; + + down_write(&md->io_lock); + switch (req->type) { + case DM_WQ_FLUSH_ALL: + __merge_pushback_list(md); + /* pass through */ + case DM_WQ_FLUSH_DEFERRED: + __flush_deferred_io(md); + break; + default: + DMERR("dm_wq_work: unrecognised work type %d", req->type); + BUG(); + } + up_write(&md->io_lock); +} + +static void dm_wq_queue(struct mapped_device *md, int type, void *context, + struct dm_wq_req *req) +{ + req->type = type; + req->md = md; + req->context = context; + INIT_WORK(&req->work, dm_wq_work); + queue_work(md->wq, &req->work); +} + +static void dm_queue_flush(struct mapped_device *md, int type, void *context) +{ + struct dm_wq_req req; + + dm_wq_queue(md, type, context, &req); + flush_workqueue(md->wq); } /* @@ -1282,7 +1378,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) { int r = -EINVAL; - down(&md->suspend_lock); + mutex_lock(&md->suspend_lock); /* device must be suspended */ if (!dm_suspended(md)) @@ -1297,7 +1393,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) r = __bind(md, table); out: - up(&md->suspend_lock); + mutex_unlock(&md->suspend_lock); return r; } @@ -1346,17 +1442,17 @@ static void unlock_fs(struct mapped_device *md) int dm_suspend(struct mapped_device *md, unsigned suspend_flags) { struct dm_table *map = NULL; - unsigned long flags; DECLARE_WAITQUEUE(wait, current); - struct bio *def; - int r = -EINVAL; + int r = 0; int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; - down(&md->suspend_lock); + mutex_lock(&md->suspend_lock); - if (dm_suspended(md)) + if (dm_suspended(md)) { + r = -EINVAL; goto out_unlock; + } map = dm_get_table(md); @@ -1378,16 +1474,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) r = -ENOMEM; goto flush_and_out; } - } - /* - * Flush I/O to the device. - * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. - */ - if (do_lockfs && !noflush) { - r = lock_fs(md); - if (r) - goto out; + /* + * Flush I/O to the device. noflush supersedes do_lockfs, + * because lock_fs() needs to flush I/Os. + */ + if (do_lockfs) { + r = lock_fs(md); + if (r) + goto out; + } } /* @@ -1404,66 +1500,36 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) dm_table_unplug_all(map); /* - * Then we wait for the already mapped ios to - * complete. 
+ * Wait for the already-mapped ios to complete. */ - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - if (!atomic_read(&md->pending) || signal_pending(current)) - break; - - io_schedule(); - } - set_current_state(TASK_RUNNING); + r = dm_wait_for_completion(md); down_write(&md->io_lock); remove_wait_queue(&md->wait, &wait); - if (noflush) { - spin_lock_irqsave(&md->pushback_lock, flags); - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - bio_list_merge_head(&md->deferred, &md->pushback); - bio_list_init(&md->pushback); - spin_unlock_irqrestore(&md->pushback_lock, flags); - } + if (noflush) + __merge_pushback_list(md); + up_write(&md->io_lock); /* were we interrupted ? */ - r = -EINTR; - if (atomic_read(&md->pending)) { - clear_bit(DMF_BLOCK_IO, &md->flags); - def = bio_list_get(&md->deferred); - __flush_deferred_io(md, def); - up_write(&md->io_lock); + if (r < 0) { + dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); + unlock_fs(md); goto out; /* pushback list is already flushed, so skip flush */ } - up_write(&md->io_lock); dm_table_postsuspend_targets(map); set_bit(DMF_SUSPENDED, &md->flags); - r = 0; - flush_and_out: - if (r && noflush) { + if (r && noflush) /* * Because there may be already I/Os in the pushback list, * flush them before return. */ - down_write(&md->io_lock); - - spin_lock_irqsave(&md->pushback_lock, flags); - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - bio_list_merge_head(&md->deferred, &md->pushback); - bio_list_init(&md->pushback); - spin_unlock_irqrestore(&md->pushback_lock, flags); - - def = bio_list_get(&md->deferred); - __flush_deferred_io(md, def); - up_write(&md->io_lock); - } + dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL); out: if (r && md->suspended_bdev) { @@ -1474,17 +1540,16 @@ out: dm_table_put(map); out_unlock: - up(&md->suspend_lock); + mutex_unlock(&md->suspend_lock); return r; } int dm_resume(struct mapped_device *md) { int r = -EINVAL; - struct bio *def; struct dm_table *map = NULL; - down(&md->suspend_lock); + mutex_lock(&md->suspend_lock); if (!dm_suspended(md)) goto out; @@ -1496,12 +1561,7 @@ int dm_resume(struct mapped_device *md) if (r) goto out; - down_write(&md->io_lock); - clear_bit(DMF_BLOCK_IO, &md->flags); - - def = bio_list_get(&md->deferred); - __flush_deferred_io(md, def); - up_write(&md->io_lock); + dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); unlock_fs(md); @@ -1520,7 +1580,7 @@ int dm_resume(struct mapped_device *md) out: dm_table_put(map); - up(&md->suspend_lock); + mutex_unlock(&md->suspend_lock); return r; } |
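The dm-snap changes above pack a run length into the top DM_CHUNK_CONSECUTIVE_BITS (8) bits of new_chunk, leaving the low 56 bits for the chunk number, so one exception can describe a whole run of contiguous remapped chunks and lookup_exception() matches any chunk in [old_chunk, old_chunk + consecutive count]. The following is a minimal user-space sketch of that encoding, not the kernel code itself: the helper names and the range test mirror the patch, but the test harness (main(), the sample chunk numbers 100 and 500) is invented purely for illustration.

/*
 * Sketch of the consecutive-chunk encoding added to dm-snap.h above.
 * Helper names mirror the patch; main() and the sample values do not.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t chunk_t;

#define DM_CHUNK_CONSECUTIVE_BITS 8
#define DM_CHUNK_NUMBER_BITS      56

struct dm_snap_exception {
	chunk_t old_chunk;
	chunk_t new_chunk;	/* top 8 bits: run length, low 56 bits: chunk */
};

static chunk_t dm_chunk_number(chunk_t chunk)
{
	return chunk & ((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
}

static unsigned dm_consecutive_chunk_count(const struct dm_snap_exception *e)
{
	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
}

static void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
{
	e->new_chunk += 1ULL << DM_CHUNK_NUMBER_BITS;
}

/* The same range test lookup_exception() performs in the patch. */
static int exception_covers(const struct dm_snap_exception *e, chunk_t chunk)
{
	return chunk >= e->old_chunk &&
	       chunk <= e->old_chunk + dm_consecutive_chunk_count(e);
}

int main(void)
{
	/* One exception remapping old chunk 100 to new chunk 500. */
	struct dm_snap_exception e = { .old_chunk = 100, .new_chunk = 500 };

	/* Two further contiguous chunks merged into the same exception,
	 * as insert_completed_exception() does when neighbours line up. */
	dm_consecutive_chunk_count_inc(&e);	/* covers 100..101 -> 500..501 */
	dm_consecutive_chunk_count_inc(&e);	/* covers 100..102 -> 500..502 */

	assert(dm_consecutive_chunk_count(&e) == 2);
	assert(dm_chunk_number(e.new_chunk) == 500);
	assert(exception_covers(&e, 102) && !exception_covers(&e, 103));

	/* remap_exception()-style arithmetic: where does old chunk 102 map? */
	chunk_t mapped = dm_chunk_number(e.new_chunk) + (102 - e.old_chunk);
	printf("old chunk 102 -> new chunk %llu\n",
	       (unsigned long long)mapped);	/* prints 502 */
	return 0;
}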
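Related to the same optimisation, exception_hash() now discards the lowest hash_shift bits of the chunk number, so runs of consecutive chunks land in one bucket and insert_completed_exception() only has to walk a single ordered list to find a mergeable neighbour; the pending table keeps hash_shift 0 and behaves as before. Below is an illustrative user-space sketch of that hashing change only: the expression mirrors the patch, while the bucket count (16) and sample chunk numbers are made up.

/*
 * Sketch of the hash_shift change to exception_hash() above.
 * Only the hashing expression mirrors the patch.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t chunk_t;

struct exception_table {
	uint32_t hash_mask;
	unsigned hash_shift;
};

static uint32_t exception_hash(const struct exception_table *et, chunk_t chunk)
{
	/* Ignoring the low hash_shift bits groups 2^hash_shift consecutive
	 * chunks into the same bucket, where they can later be merged. */
	return (chunk >> et->hash_shift) & et->hash_mask;
}

int main(void)
{
	struct exception_table complete = { .hash_mask = 16 - 1, .hash_shift = 8 };
	struct exception_table pending  = { .hash_mask = 16 - 1, .hash_shift = 0 };

	/* Consecutive chunks 256..259 share a bucket in the completed table... */
	for (chunk_t c = 256; c < 260; c++)
		printf("complete: chunk %llu -> bucket %u\n",
		       (unsigned long long)c,
		       (unsigned)exception_hash(&complete, c));

	/* ...but spread across buckets in the pending table (shift 0). */
	for (chunk_t c = 256; c < 260; c++)
		printf("pending:  chunk %llu -> bucket %u\n",
		       (unsigned long long)c,
		       (unsigned)exception_hash(&pending, c));
	return 0;
}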