From 7a67f63b3233ff28e753854fe27891c44f8588ae Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 8 Aug 2008 11:17:12 +0200 Subject: block: add bio_has_data() to detect whether a bio carries data or not Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 0933a14e641..9e93c929947 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -445,6 +445,14 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) +/* + * Check whether this bio carries any data or not. A NULL bio is allowed. + */ +static inline int bio_has_data(struct bio *bio) +{ + return bio && bio->bi_io_vec != NULL; +} + #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) -- cgit v1.2.3 From a9c701e594669dd49fed448c27c64f20cfacc8a7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 8 Aug 2008 11:04:44 +0200 Subject: block: use bio_has_data() to check for data carrying bio Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9e93c929947..dbeb66f813a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -185,7 +185,7 @@ struct bio { #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) -#define bio_empty_barrier(bio) (bio_barrier(bio) && !(bio)->bi_size) +#define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio)) static inline unsigned int bio_cur_sectors(struct bio *bio) { -- cgit v1.2.3 From d628eaef310533767ce68664873869c2d7f78f09 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 9 Aug 2008 16:22:17 +0100 Subject: Fix up comments about matching flags between bio and rq Signed-off-by: David Woodhouse Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 ++-- include/linux/blkdev.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index dbeb66f813a..17f1fbdb31b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -150,8 +150,8 @@ struct bio { * bit 3 -- fail fast, don't want low level driver retries * bit 4 -- synchronous I/O hint: the block layer will unplug immediately */ -#define BIO_RW 0 -#define BIO_RW_AHEAD 1 +#define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ +#define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ #define BIO_RW_BARRIER 2 #define BIO_RW_FAILFAST 3 #define BIO_RW_SYNC 4 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 53ea933cf60..e0ba018f5e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -84,7 +84,7 @@ enum { }; /* - * request type modified bits. first three bits match BIO_RW* bits, important + * request type modified bits. first two bits match BIO_RW* bits, important */ enum rq_flag_bits { __REQ_RW, /* not set, read. set, write */ -- cgit v1.2.3 From fb2dce862d9f9a68e6b9374579056ec9eca02a63 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 5 Aug 2008 18:01:53 +0100 Subject: Add 'discard' request handling Some block devices benefit from a hint that they can forget the contents of certain sectors. Add basic support for this to the block core, along with a 'blkdev_issue_discard()' helper function which issues such requests. The caller doesn't get to provide an end_io functio, since blkdev_issue_discard() will automatically split the request up into multiple bios if appropriate. Neither does the function wait for completion -- it's expected that callers won't care about when, or even _if_, the request completes. It's only a hint to the device anyway. By definition, the file system doesn't _care_ about these sectors any more. [With feedback from OGAWA Hirofumi and Jens Axboe Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 ++++++-- include/linux/blkdev.h | 16 ++++++++++++++++ include/linux/fs.h | 3 ++- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 17f1fbdb31b..1fdfc5621c8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -149,6 +149,8 @@ struct bio { * bit 2 -- barrier * bit 3 -- fail fast, don't want low level driver retries * bit 4 -- synchronous I/O hint: the block layer will unplug immediately + * bit 5 -- metadata request + * bit 6 -- discard sectors */ #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ @@ -156,6 +158,7 @@ struct bio { #define BIO_RW_FAILFAST 3 #define BIO_RW_SYNC 4 #define BIO_RW_META 5 +#define BIO_RW_DISCARD 6 /* * upper 16 bits of bi_rw define the io priority of this bio @@ -186,13 +189,14 @@ struct bio { #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio)) +#define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) static inline unsigned int bio_cur_sectors(struct bio *bio) { if (bio->bi_vcnt) return bio_iovec(bio)->bv_len >> 9; - - return 0; + else /* dataless requests such as discard */ + return bio->bi_size >> 9; } static inline void *bio_data(struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e0ba018f5e8..26ececbbebe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -89,6 +89,7 @@ enum { enum rq_flag_bits { __REQ_RW, /* not set, read. set, write */ __REQ_FAILFAST, /* no low level driver retries */ + __REQ_DISCARD, /* request to discard sectors */ __REQ_SORTED, /* elevator knows about this request */ __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ __REQ_HARDBARRIER, /* may not be passed by drive either */ @@ -111,6 +112,7 @@ enum rq_flag_bits { }; #define REQ_RW (1 << __REQ_RW) +#define REQ_DISCARD (1 << __REQ_DISCARD) #define REQ_FAILFAST (1 << __REQ_FAILFAST) #define REQ_SORTED (1 << __REQ_SORTED) #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) @@ -252,6 +254,7 @@ typedef void (request_fn_proc) (struct request_queue *q); typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); +typedef int (prepare_discard_fn) (struct request_queue *, struct request *); struct bio_vec; struct bvec_merge_data { @@ -307,6 +310,7 @@ struct request_queue make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; unplug_fn *unplug_fn; + prepare_discard_fn *prepare_discard_fn; merge_bvec_fn *merge_bvec_fn; prepare_flush_fn *prepare_flush_fn; softirq_done_fn *softirq_done_fn; @@ -546,6 +550,7 @@ enum { #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED) #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER) #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) +#define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) #define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors) /* rq->queuelist of dequeued request must be list_empty() */ @@ -796,6 +801,7 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); +extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); extern int blk_do_ordered(struct request_queue *, struct request **); @@ -837,6 +843,16 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, } extern int blkdev_issue_flush(struct block_device *, sector_t *); +extern int blkdev_issue_discard(struct block_device *, sector_t sector, + unsigned nr_sects); + +static inline int sb_issue_discard(struct super_block *sb, + sector_t block, unsigned nr_blocks) +{ + block <<= (sb->s_blocksize_bits - 9); + nr_blocks <<= (sb->s_blocksize_bits - 9); + return blkdev_issue_discard(sb->s_bdev, block, nr_blocks); +} /* * command filter functions diff --git a/include/linux/fs.h b/include/linux/fs.h index 580b513668f..eb013131913 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -86,7 +86,8 @@ extern int dir_notify_enable; #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) #define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC)) -#define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER)) +#define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) +#define WRITE_DISCARD (WRITE | (1 << BIO_RW_DISCARD)) #define SEL_IN 1 #define SEL_OUT 2 -- cgit v1.2.3 From eae9acd13a8d14b50c00a961fa959606f34bbd92 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 5 Aug 2008 18:08:25 +0100 Subject: Support 'discard sectors' operation in translation layer support core Signed-off-by: David Woodhouse Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + include/linux/mtd/blktrans.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 26ececbbebe..727886d25c4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -81,6 +81,7 @@ enum { */ REQ_LB_OP_EJECT = 0x40, /* eject request */ REQ_LB_OP_FLUSH = 0x41, /* flush device */ + REQ_LB_OP_DISCARD = 0x42, /* discard sectors */ }; /* diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h index 310e6160641..8b4aa0523db 100644 --- a/include/linux/mtd/blktrans.h +++ b/include/linux/mtd/blktrans.h @@ -41,6 +41,8 @@ struct mtd_blktrans_ops { unsigned long block, char *buffer); int (*writesect)(struct mtd_blktrans_dev *dev, unsigned long block, char *buffer); + int (*discard)(struct mtd_blktrans_dev *dev, + unsigned long block, unsigned nr_blocks); /* Block layer ioctls */ int (*getgeo)(struct mtd_blktrans_dev *dev, struct hd_geometry *geo); -- cgit v1.2.3 From 27b29e86bf3d4b3cf6641a0efd78ed11a9b633b2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 10 Aug 2008 11:21:57 +0100 Subject: blktrace: support discard requests Signed-off-by: David Woodhouse Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index d084b8d227a..27da2cc682e 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -21,6 +21,7 @@ enum blktrace_cat { BLK_TC_NOTIFY = 1 << 10, /* special message */ BLK_TC_AHEAD = 1 << 11, /* readahead */ BLK_TC_META = 1 << 12, /* metadata */ + BLK_TC_DISCARD = 1 << 13, /* discard requests */ BLK_TC_END = 1 << 15, /* only 16-bits, reminder */ }; @@ -195,6 +196,9 @@ static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (likely(!bt)) return; + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + if (blk_pc_request(rq)) { what |= BLK_TC_ACT(BLK_TC_PC); __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd); -- cgit v1.2.3 From d30a2605be9d5132d95944916e8f578fcfe4f976 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 11 Aug 2008 15:58:42 +0100 Subject: Add BLKDISCARD ioctl to allow userspace to discard sectors We may well want mkfs tools to use this to mark the whole device as unwanted before they format it, for example. The ioctl takes a pair of uint64_ts, which are start offset and length in _bytes_. Although at the moment it might make sense for them both to be in 512-byte sectors, I don't want to limit the ABI to that. Signed-off-by: David Woodhouse Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index eb013131913..88358ca6af2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -223,6 +223,7 @@ extern int dir_notify_enable; #define BLKTRACESTART _IO(0x12,116) #define BLKTRACESTOP _IO(0x12,117) #define BLKTRACETEARDOWN _IO(0x12,118) +#define BLKDISCARD _IO(0x12,119) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From e17fc0a1ccf88f6d4dcb363729f3141b0958c325 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 9 Aug 2008 16:42:20 +0100 Subject: Allow elevators to sort/merge discard requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit But blkdev_issue_discard() still emits requests which are interpreted as soft barriers, because naïve callers might otherwise issue subsequent writes to those same sectors, which might cross on the queue (if they're reallocated quickly enough). Callers still _can_ issue non-barrier discard requests, but they have to take care of queue ordering for themselves. Signed-off-by: David Woodhouse Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- include/linux/blkdev.h | 5 +++-- include/linux/fs.h | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 1fdfc5621c8..33c3947d61e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -188,8 +188,8 @@ struct bio { #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) -#define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio)) #define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) +#define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio)) static inline unsigned int bio_cur_sectors(struct bio *bio) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 727886d25c4..e9eb35c9bf2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -541,7 +541,7 @@ enum { #define blk_noretry_request(rq) ((rq)->cmd_flags & REQ_FAILFAST) #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED) -#define blk_account_rq(rq) (blk_rq_started(rq) && blk_fs_request(rq)) +#define blk_account_rq(rq) (blk_rq_started(rq) && (blk_fs_request(rq) || blk_discard_rq(rq))) #define blk_pm_suspend_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND) #define blk_pm_resume_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_RESUME) @@ -598,7 +598,8 @@ static inline void blk_clear_queue_full(struct request_queue *q, int rw) #define RQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER) #define rq_mergeable(rq) \ - (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq))) + (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ + (blk_discard_rq(rq) || blk_fs_request((rq)))) /* * q->prep_rq_fn return values diff --git a/include/linux/fs.h b/include/linux/fs.h index 88358ca6af2..860689f541b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -87,7 +87,8 @@ extern int dir_notify_enable; #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) #define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC)) #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) -#define WRITE_DISCARD (WRITE | (1 << BIO_RW_DISCARD)) +#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) +#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) #define SEL_IN 1 #define SEL_OUT 2 -- cgit v1.2.3 From 1a8e2bddd5c29008f311613e75925fecbf522c5b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 13 Aug 2008 12:35:09 +0100 Subject: Kill REQ_TYPE_FLUSH It was only used by ps3disk, and it should probably have been REQ_TYPE_LINUX_BLOCK + REQ_LB_OP_FLUSH. Signed-off-by: David Woodhouse Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e9eb35c9bf2..f131776f029 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -54,7 +54,6 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ - REQ_TYPE_FLUSH, /* flush request */ REQ_TYPE_SPECIAL, /* driver defined type */ REQ_TYPE_LINUX_BLOCK, /* generic block layer message */ /* @@ -76,11 +75,8 @@ enum rq_cmd_type_bits { * */ enum { - /* - * just examples for now - */ REQ_LB_OP_EJECT = 0x40, /* eject request */ - REQ_LB_OP_FLUSH = 0x41, /* flush device */ + REQ_LB_OP_FLUSH = 0x41, /* flush request */ REQ_LB_OP_DISCARD = 0x42, /* discard sectors */ }; -- cgit v1.2.3 From 766ca4428d1239a970926856c447310c9c191af2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=C3=A1zquez=20Cao?= Date: Thu, 14 Aug 2008 09:59:13 +0200 Subject: virtio_blk: use a wrapper function to access io context information of IO requests struct request has an ioprio member but it is never updated because currently bios do not hold io context information. The implication of this is that virtio_blk ends up passing useless information to the backend driver. That said, some IO schedulers such as CFQ do store io context information in struct request, but use private members for that, which means that that information cannot be directly accessed in a IO scheduler-independent way. This patch adds a function to obtain the ioprio of a request. We should avoid accessing ioprio directly and use this function instead, so that its users do not have to care about future changes in block layer structures or what the currently active IO controller is. This patch does not introduce any functional changes but paves the way for future clean-ups and enhancements. Signed-off-by: Fernando Luis Vazquez Cao Acked-by: Rusty Russell Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f131776f029..490ce458b03 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -232,6 +232,11 @@ struct request { struct request *next_rq; }; +static inline unsigned short req_get_ioprio(struct request *req) +{ + return req->ioprio; +} + /* * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME * requests. Some step values could eventually be made generic. -- cgit v1.2.3 From b8b3e16cfe6435d961f6aaebcfd52a1ff2a988c5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 15 Aug 2008 10:15:19 +0200 Subject: block: drop virtual merging accounting Remove virtual merge accounting. Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- include/linux/bio.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 33c3947d61e..894d16ce002 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -26,21 +26,8 @@ #ifdef CONFIG_BLOCK -/* Platforms may set this to teach the BIO layer about IOMMU hardware. */ #include -#if defined(BIO_VMERGE_MAX_SIZE) && defined(BIO_VMERGE_BOUNDARY) -#define BIOVEC_VIRT_START_SIZE(x) (bvec_to_phys(x) & (BIO_VMERGE_BOUNDARY - 1)) -#define BIOVEC_VIRT_OVERSIZE(x) ((x) > BIO_VMERGE_MAX_SIZE) -#else -#define BIOVEC_VIRT_START_SIZE(x) 0 -#define BIOVEC_VIRT_OVERSIZE(x) 0 -#endif - -#ifndef BIO_VMERGE_BOUNDARY -#define BIO_VMERGE_BOUNDARY 0 -#endif - #define BIO_DEBUG #ifdef BIO_DEBUG @@ -240,8 +227,6 @@ static inline void *bio_data(struct bio *bio) ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) #endif -#define BIOVEC_VIRT_MERGEABLE(vec1, vec2) \ - ((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0) #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ (((addr1) | (mask)) == (((addr2) - 1) | (mask))) #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ -- cgit v1.2.3 From 5df97b91b5d7ed426034fcc84cb6e7cf682b8838 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 15 Aug 2008 10:20:02 +0200 Subject: drop vmerge accounting Remove hw_segments field from struct bio and struct request. Without virtual merge accounting they have no purpose. Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- include/linux/bio.h | 16 +--------------- include/linux/blkdev.h | 7 ------- 2 files changed, 1 insertion(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 894d16ce002..dfc3556d311 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -77,21 +77,8 @@ struct bio { */ unsigned short bi_phys_segments; - /* Number of segments after physical and DMA remapping - * hardware coalescing is performed. - */ - unsigned short bi_hw_segments; - unsigned int bi_size; /* residual I/O count */ - /* - * To keep track of the max hw size, we account for the - * sizes of the first and last virtually mergeable segments - * in this bio - */ - unsigned int bi_hw_front_size; - unsigned int bi_hw_back_size; - unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ struct bio_vec *bi_io_vec; /* the actual vec list */ @@ -113,7 +100,7 @@ struct bio { #define BIO_UPTODATE 0 /* ok after I/O completion */ #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ #define BIO_EOF 2 /* out-out-bounds error */ -#define BIO_SEG_VALID 3 /* nr_hw_seg valid */ +#define BIO_SEG_VALID 3 /* bi_phys_segments valid */ #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ @@ -324,7 +311,6 @@ extern void bio_free(struct bio *, struct bio_set *); extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); -extern int bio_hw_segments(struct request_queue *, struct bio *); extern void __bio_clone(struct bio *, struct bio *); extern struct bio *bio_clone(struct bio *, gfp_t); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 490ce458b03..1adb03827bd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -189,13 +189,6 @@ struct request { */ unsigned short nr_phys_segments; - /* Number of scatter-gather addr+len pairs after - * physical and DMA remapping hardware coalescing is performed. - * This is the number of scatter-gather entries the driver - * will actually have to deal with after DMA mapping is done. - */ - unsigned short nr_hw_segments; - unsigned short ioprio; void *special; -- cgit v1.2.3 From 5b99c2ffa980528a197f26c7d876cceeccce8dd5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2008 10:56:11 +0200 Subject: block: make bi_phys_segments an unsigned int instead of short raid5 can overflow with more than 255 stripes, and we can increase it to an int for free on both 32 and 64-bit archs due to the padding. Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index dfc3556d311..2c0c09034fd 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -75,7 +75,7 @@ struct bio { /* Number of segments in this BIO after * physical address coalescing is performed. */ - unsigned short bi_phys_segments; + unsigned int bi_phys_segments; unsigned int bi_size; /* residual I/O count */ -- cgit v1.2.3 From a1ed5b0cffe4b16a93a6a3390e8cee0fbef94f86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:50:16 +0200 Subject: klist: don't iterate over deleted entries A klist entry is kept on the list till all its current iterations are finished; however, a new iteration after deletion also iterates over deleted entries as long as their reference count stays above zero. This causes problems for cases where there are users which iterate over the list while synchronized against list manipulations and natuarally expect already deleted entries to not show up during iteration. This patch implements dead flag which gets set on deletion so that iteration can skip already deleted entries. The dead flag piggy backs on the lowest bit of knode->n_klist and only visible to klist implementation proper. While at it, drop klist_iter->i_head as it's redundant and doesn't offer anything in semantics or performance wise as klist_iter->i_klist is dereferenced on every iteration anyway. Signed-off-by: Tejun Heo Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/klist.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/klist.h b/include/linux/klist.h index 06c338ef7f1..8ea98db223e 100644 --- a/include/linux/klist.h +++ b/include/linux/klist.h @@ -38,7 +38,7 @@ extern void klist_init(struct klist *k, void (*get)(struct klist_node *), void (*put)(struct klist_node *)); struct klist_node { - struct klist *n_klist; + void *n_klist; /* never access directly */ struct list_head n_node; struct kref n_ref; struct completion n_removed; @@ -57,7 +57,6 @@ extern int klist_node_attached(struct klist_node *n); struct klist_iter { struct klist *i_klist; - struct list_head *i_head; struct klist_node *i_cur; }; -- cgit v1.2.3 From 5a3ceb861663040f9ef0176df4aaa494bba5e352 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:50:19 +0200 Subject: driver-core: use klist for class device list and implement iterator Iterating over entries using callback usually isn't too fun especially when the entry being iterated over can't be manipulated freely. This patch converts class->p->class_devices to klist and implements class device iterator so that the users can freely build their own control structure. The users are also free to call back into class code without worrying about locking. class_for_each_device() and class_find_device() are converted to use the new iterators, so their users don't have to worry about locking anymore either. Note: This depends on klist-dont-iterate-over-deleted-entries patch because class_intf->add/remove_dev() depends on proper synchronization with device removal. Signed-off-by: Tejun Heo Cc: Greg Kroah-Hartman Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/device.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 4d8372d135d..246937c9cbc 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -199,6 +199,11 @@ struct class { struct class_private *p; }; +struct class_dev_iter { + struct klist_iter ki; + const struct device_type *type; +}; + extern struct kobject *sysfs_dev_block_kobj; extern struct kobject *sysfs_dev_char_kobj; extern int __must_check __class_register(struct class *class, @@ -213,6 +218,13 @@ extern void class_unregister(struct class *class); __class_register(class, &__key); \ }) +extern void class_dev_iter_init(struct class_dev_iter *iter, + struct class *class, + struct device *start, + const struct device_type *type); +extern struct device *class_dev_iter_next(struct class_dev_iter *iter); +extern void class_dev_iter_exit(struct class_dev_iter *iter); + extern int class_for_each_device(struct class *class, struct device *start, void *data, int (*fn)(struct device *dev, void *data)); @@ -396,7 +408,7 @@ struct device { spinlock_t devres_lock; struct list_head devres_head; - struct list_head node; + struct klist_node knode_class; struct class *class; dev_t devt; /* dev_t, creates the sysfs "dev" */ struct attribute_group **groups; /* optional groups */ -- cgit v1.2.3 From 310a2c1012934f590192377f65940cad4aa72b15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:17 +0900 Subject: block: misc updates This patch makes the following misc updates in preparation for disk->part dereference fix and extended block devt support. * implment part_to_disk() * fix comment about gendisk->part indexing * rename get_part() to disk_map_sector() * don't use n which is always zero while printing disk information in diskstats_show() Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index be4f5e5bfe0..c64e659c984 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -116,7 +116,7 @@ struct gendisk { int minors; /* maximum number of minors, =1 for * disks that can't be partitioned. */ char disk_name[32]; /* name of major driver */ - struct hd_struct **part; /* [indexed by minor] */ + struct hd_struct **part; /* [indexed by minor - 1] */ struct block_device_operations *fops; struct request_queue *queue; void *private_data; @@ -145,14 +145,21 @@ struct gendisk { #endif }; +static inline struct gendisk *part_to_disk(struct hd_struct *part) +{ + if (likely(part)) + return dev_to_disk((part)->dev.parent); + return NULL; +} + /* * Macros to operate on percpu disk statistics: * * The __ variants should only be called in critical sections. The full * variants disable/enable preemption. */ -static inline struct hd_struct *get_part(struct gendisk *gendiskp, - sector_t sector) +static inline struct hd_struct *disk_map_sector(struct gendisk *gendiskp, + sector_t sector) { struct hd_struct *part; int i; -- cgit v1.2.3 From cf771cb5a7b716f3f9e532fd42a1e3a0a75adec5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:01:09 +0200 Subject: block: make variable and argument names more consistent In hd_struct, @partno is used to denote partition number and a number of other places use @part to denote hd_struct. Functions use @part and @index instead. This causes confusion and makes it difficult to use consistent variable names for hd_struct. Always use @partno if a variable represents partition number. Also, print out functions use @f or @part for seq_file argument. Use @seqf uniformly instead. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c64e659c984..d1723c0a860 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -365,7 +365,7 @@ extern int get_blkdev_list(char *, int); extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern void unlink_gendisk(struct gendisk *gp); -extern struct gendisk *get_gendisk(dev_t dev, int *part); +extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern void set_device_ro(struct block_device *bdev, int flag); extern void set_disk_ro(struct gendisk *disk, int flag); @@ -534,8 +534,8 @@ struct unixware_disklabel { #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -extern dev_t blk_lookup_devt(const char *name, int part); -extern char *disk_name (struct gendisk *hd, int part, char *buf); +extern dev_t blk_lookup_devt(const char *name, int partno); +extern char *disk_name (struct gendisk *hd, int partno, char *buf); extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); extern int __must_check add_partition(struct gendisk *, int, sector_t, sector_t, int); @@ -553,16 +553,16 @@ extern void blk_register_region(dev_t devt, unsigned long range, void *data); extern void blk_unregister_region(dev_t devt, unsigned long range); -static inline struct block_device *bdget_disk(struct gendisk *disk, int index) +static inline struct block_device *bdget_disk(struct gendisk *disk, int partno) { - return bdget(MKDEV(disk->major, disk->first_minor) + index); + return bdget(MKDEV(disk->major, disk->first_minor) + partno); } #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } -static inline dev_t blk_lookup_devt(const char *name, int part) +static inline dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); return devt; -- cgit v1.2.3 From f331c0296f2a9fee0d396a70598b954062603015 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:01:48 +0200 Subject: block: don't depend on consecutive minor space * Implement disk_devt() and part_devt() and use them to directly access devt instead of computing it from ->major and ->first_minor. Note that all references to ->major and ->first_minor outside of block layer is used to determine devt of the disk (the part0) and as ->major and ->first_minor will continue to represent devt for the disk, converting these users aren't strictly necessary. However, convert them for consistency. * Implement disk_max_parts() to avoid directly deferencing genhd->minors. * Update bdget_disk() such that it doesn't assume consecutive minor space. * Move devt computation from register_disk() to add_disk() and make it the only one (all other usages use the initially determined value). These changes clean up the code and will help disk->part dereference fix and extended block device numbers. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index d1723c0a860..0ff75329199 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -111,10 +111,14 @@ struct hd_struct { #define GENHD_FL_FAIL 64 struct gendisk { + /* major, first_minor and minors are input parameters only, + * don't use directly. Use disk_devt() and disk_max_parts(). + */ int major; /* major number of driver */ int first_minor; int minors; /* maximum number of minors, =1 for * disks that can't be partitioned. */ + char disk_name[32]; /* name of major driver */ struct hd_struct **part; /* [indexed by minor - 1] */ struct block_device_operations *fops; @@ -152,6 +156,21 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part) return NULL; } +static inline int disk_max_parts(struct gendisk *disk) +{ + return disk->minors - 1; +} + +static inline dev_t disk_devt(struct gendisk *disk) +{ + return disk->dev.devt; +} + +static inline dev_t part_devt(struct hd_struct *part) +{ + return part->dev.devt; +} + /* * Macros to operate on percpu disk statistics: * @@ -163,7 +182,7 @@ static inline struct hd_struct *disk_map_sector(struct gendisk *gendiskp, { struct hd_struct *part; int i; - for (i = 0; i < gendiskp->minors - 1; i++) { + for (i = 0; i < disk_max_parts(gendiskp); i++) { part = gendiskp->part[i]; if (part && part->start_sect <= sector && sector < part->start_sect + part->nr_sects) @@ -366,6 +385,7 @@ extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern void unlink_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); +extern struct block_device *bdget_disk(struct gendisk *disk, int partno); extern void set_device_ro(struct block_device *bdev, int flag); extern void set_disk_ro(struct gendisk *disk, int flag); @@ -553,11 +573,6 @@ extern void blk_register_region(dev_t devt, unsigned long range, void *data); extern void blk_unregister_region(dev_t devt, unsigned long range); -static inline struct block_device *bdget_disk(struct gendisk *disk, int partno) -{ - return bdget(MKDEV(disk->major, disk->first_minor) + partno); -} - #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } -- cgit v1.2.3 From e71bf0d0ee89e51b92776391c5634938236977d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:03:02 +0200 Subject: block: fix disk->part[] dereferencing race disk->part[] is protected by its matching bdev's lock. However, non-critical accesses like collecting stats and printing out sysfs and proc information used to be performed without any locking. As partitions can come and go dynamically, partitions can go away underneath those non-critical accesses. As some of those accesses are writes, this theoretically can lead to silent corruption. This patch fixes the race by using RCU for the partition array and dev reference counter to hold partitions. * Rename disk->part[] to disk->__part[] to make sure no one outside genhd layer proper accesses it directly. * Use RCU for disk->__part[] dereferencing. * Implement disk_{get|put}_part() which can be used to get and put partitions from gendisk respectively. * Iterators are implemented to help iterate through all partitions safely. * Functions which require RCU readlock are marked with _rcu suffix. * Use disk_put_part() in __blkdev_put() instead of directly putting the contained kobject. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 53 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0ff75329199..7fbba19e076 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -11,6 +11,7 @@ #include #include +#include #ifdef CONFIG_BLOCK @@ -100,6 +101,7 @@ struct hd_struct { #else struct disk_stats dkstats; #endif + struct rcu_head rcu_head; }; #define GENHD_FL_REMOVABLE 1 @@ -120,7 +122,14 @@ struct gendisk { * disks that can't be partitioned. */ char disk_name[32]; /* name of major driver */ - struct hd_struct **part; /* [indexed by minor - 1] */ + + /* Array of pointers to partitions indexed by partno - 1. + * Protected with matching bdev lock but stat and other + * non-critical accesses use RCU. Always access through + * helpers. + */ + struct hd_struct **__part; + struct block_device_operations *fops; struct request_queue *queue; void *private_data; @@ -171,25 +180,41 @@ static inline dev_t part_devt(struct hd_struct *part) return part->dev.devt; } +extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); + +static inline void disk_put_part(struct hd_struct *part) +{ + if (likely(part)) + put_device(&part->dev); +} + +/* + * Smarter partition iterator without context limits. + */ +#define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */ +#define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */ + +struct disk_part_iter { + struct gendisk *disk; + struct hd_struct *part; + int idx; + unsigned int flags; +}; + +extern void disk_part_iter_init(struct disk_part_iter *piter, + struct gendisk *disk, unsigned int flags); +extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter); +extern void disk_part_iter_exit(struct disk_part_iter *piter); + +extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, + sector_t sector); + /* * Macros to operate on percpu disk statistics: * * The __ variants should only be called in critical sections. The full * variants disable/enable preemption. */ -static inline struct hd_struct *disk_map_sector(struct gendisk *gendiskp, - sector_t sector) -{ - struct hd_struct *part; - int i; - for (i = 0; i < disk_max_parts(gendiskp); i++) { - part = gendiskp->part[i]; - if (part && part->start_sect <= sector - && sector < part->start_sect + part->nr_sects) - return part; - } - return NULL; -} #ifdef CONFIG_SMP #define __disk_stat_add(gendiskp, field, addnd) \ -- cgit v1.2.3 From c9959059161ddd7bf4670cf47367033d6b2f79c4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:21 +0900 Subject: block: fix diskstats access There are two variants of stat functions - ones prefixed with double underbars which don't care about preemption and ones without which disable preemption before manipulating per-cpu counters. It's unclear whether the underbarred ones assume that preemtion is disabled on entry as some callers don't do that. This patch unifies diskstats access by implementing disk_stat_lock() and disk_stat_unlock() which take care of both RCU (for partition access) and preemption (for per-cpu counter access). diskstats access should always be enclosed between the two functions. As such, there's no need for the versions which disables preemption. They're removed and double underbars ones are renamed to drop the underbars. As an extra argument is added, there's no danger of using the old version unconverted. disk_stat_lock() uses get_cpu() and returns the cpu index and all diskstat functions which access per-cpu counters now has @cpu argument to help RT. This change adds RCU or preemption operations at some places but also collapses several preemption ops into one at others. Overall, the performance difference should be negligible as all involved ops are very lightweight per-cpu ones. Signed-off-by: Tejun Heo Cc: Peter Zijlstra Signed-off-by: Jens Axboe --- include/linux/genhd.h | 139 +++++++++++++++++++++----------------------------- 1 file changed, 57 insertions(+), 82 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7fbba19e076..ac8a901f200 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -209,16 +209,24 @@ extern void disk_part_iter_exit(struct disk_part_iter *piter); extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); -/* +/* * Macros to operate on percpu disk statistics: * - * The __ variants should only be called in critical sections. The full - * variants disable/enable preemption. + * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters + * and should be called between disk_stat_lock() and + * disk_stat_unlock(). + * + * part_stat_read() can be called at any time. + * + * part_stat_{add|set_all}() and {init|free}_part_stats are for + * internal use only. */ - #ifdef CONFIG_SMP -#define __disk_stat_add(gendiskp, field, addnd) \ - (per_cpu_ptr(gendiskp->dkstats, smp_processor_id())->field += addnd) +#define disk_stat_lock() ({ rcu_read_lock(); get_cpu(); }) +#define disk_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) + +#define disk_stat_add(cpu, gendiskp, field, addnd) \ + (per_cpu_ptr(gendiskp->dkstats, cpu)->field += addnd) #define disk_stat_read(gendiskp, field) \ ({ \ @@ -229,7 +237,8 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, res; \ }) -static inline void disk_stat_set_all(struct gendisk *gendiskp, int value) { +static inline void disk_stat_set_all(struct gendisk *gendiskp, int value) +{ int i; for_each_possible_cpu(i) @@ -237,14 +246,14 @@ static inline void disk_stat_set_all(struct gendisk *gendiskp, int value) { sizeof(struct disk_stats)); } -#define __part_stat_add(part, field, addnd) \ - (per_cpu_ptr(part->dkstats, smp_processor_id())->field += addnd) +#define part_stat_add(cpu, part, field, addnd) \ + (per_cpu_ptr(part->dkstats, cpu)->field += addnd) -#define __all_stat_add(gendiskp, part, field, addnd, sector) \ -({ \ - if (part) \ - __part_stat_add(part, field, addnd); \ - __disk_stat_add(gendiskp, field, addnd); \ +#define all_stat_add(cpu, gendiskp, part, field, addnd, sector) \ +({ \ + if (part) \ + part_stat_add(cpu, part, field, addnd); \ + disk_stat_add(cpu, gendiskp, field, addnd); \ }) #define part_stat_read(part, field) \ @@ -264,10 +273,13 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) memset(per_cpu_ptr(part->dkstats, i), value, sizeof(struct disk_stats)); } - + #else /* !CONFIG_SMP */ -#define __disk_stat_add(gendiskp, field, addnd) \ - (gendiskp->dkstats.field += addnd) +#define disk_stat_lock() ({ rcu_read_lock(); 0; }) +#define disk_stat_unlock() rcu_read_unlock() + +#define disk_stat_add(cpu, gendiskp, field, addnd) \ + (gendiskp->dkstats.field += addnd) #define disk_stat_read(gendiskp, field) (gendiskp->dkstats.field) static inline void disk_stat_set_all(struct gendisk *gendiskp, int value) @@ -275,14 +287,14 @@ static inline void disk_stat_set_all(struct gendisk *gendiskp, int value) memset(&gendiskp->dkstats, value, sizeof (struct disk_stats)); } -#define __part_stat_add(part, field, addnd) \ +#define part_stat_add(cpu, part, field, addnd) \ (part->dkstats.field += addnd) -#define __all_stat_add(gendiskp, part, field, addnd, sector) \ -({ \ - if (part) \ - part->dkstats.field += addnd; \ - __disk_stat_add(gendiskp, field, addnd); \ +#define all_stat_add(cpu, gendiskp, part, field, addnd, sector) \ +({ \ + if (part) \ + part_stat_add(cpu, part, field, addnd); \ + disk_stat_add(cpu, gendiskp, field, addnd); \ }) #define part_stat_read(part, field) (part->dkstats.field) @@ -294,63 +306,26 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) #endif /* CONFIG_SMP */ -#define disk_stat_add(gendiskp, field, addnd) \ - do { \ - preempt_disable(); \ - __disk_stat_add(gendiskp, field, addnd); \ - preempt_enable(); \ - } while (0) - -#define __disk_stat_dec(gendiskp, field) __disk_stat_add(gendiskp, field, -1) -#define disk_stat_dec(gendiskp, field) disk_stat_add(gendiskp, field, -1) - -#define __disk_stat_inc(gendiskp, field) __disk_stat_add(gendiskp, field, 1) -#define disk_stat_inc(gendiskp, field) disk_stat_add(gendiskp, field, 1) - -#define __disk_stat_sub(gendiskp, field, subnd) \ - __disk_stat_add(gendiskp, field, -subnd) -#define disk_stat_sub(gendiskp, field, subnd) \ - disk_stat_add(gendiskp, field, -subnd) - -#define part_stat_add(gendiskp, field, addnd) \ - do { \ - preempt_disable(); \ - __part_stat_add(gendiskp, field, addnd);\ - preempt_enable(); \ - } while (0) - -#define __part_stat_dec(gendiskp, field) __part_stat_add(gendiskp, field, -1) -#define part_stat_dec(gendiskp, field) part_stat_add(gendiskp, field, -1) - -#define __part_stat_inc(gendiskp, field) __part_stat_add(gendiskp, field, 1) -#define part_stat_inc(gendiskp, field) part_stat_add(gendiskp, field, 1) - -#define __part_stat_sub(gendiskp, field, subnd) \ - __part_stat_add(gendiskp, field, -subnd) -#define part_stat_sub(gendiskp, field, subnd) \ - part_stat_add(gendiskp, field, -subnd) - -#define all_stat_add(gendiskp, part, field, addnd, sector) \ - do { \ - preempt_disable(); \ - __all_stat_add(gendiskp, part, field, addnd, sector); \ - preempt_enable(); \ - } while (0) - -#define __all_stat_dec(gendiskp, field, sector) \ - __all_stat_add(gendiskp, field, -1, sector) -#define all_stat_dec(gendiskp, field, sector) \ - all_stat_add(gendiskp, field, -1, sector) - -#define __all_stat_inc(gendiskp, part, field, sector) \ - __all_stat_add(gendiskp, part, field, 1, sector) -#define all_stat_inc(gendiskp, part, field, sector) \ - all_stat_add(gendiskp, part, field, 1, sector) - -#define __all_stat_sub(gendiskp, part, field, subnd, sector) \ - __all_stat_add(gendiskp, part, field, -subnd, sector) -#define all_stat_sub(gendiskp, part, field, subnd, sector) \ - all_stat_add(gendiskp, part, field, -subnd, sector) +#define disk_stat_dec(cpu, gendiskp, field) \ + disk_stat_add(cpu, gendiskp, field, -1) +#define disk_stat_inc(cpu, gendiskp, field) \ + disk_stat_add(cpu, gendiskp, field, 1) +#define disk_stat_sub(cpu, gendiskp, field, subnd) \ + disk_stat_add(cpu, gendiskp, field, -subnd) + +#define part_stat_dec(cpu, gendiskp, field) \ + part_stat_add(cpu, gendiskp, field, -1) +#define part_stat_inc(cpu, gendiskp, field) \ + part_stat_add(cpu, gendiskp, field, 1) +#define part_stat_sub(cpu, gendiskp, field, subnd) \ + part_stat_add(cpu, gendiskp, field, -subnd) + +#define all_stat_dec(cpu, gendiskp, field, sector) \ + all_stat_add(cpu, gendiskp, field, -1, sector) +#define all_stat_inc(cpu, gendiskp, part, field, sector) \ + all_stat_add(cpu, gendiskp, part, field, 1, sector) +#define all_stat_sub(cpu, gendiskp, part, field, subnd, sector) \ + all_stat_add(cpu, gendiskp, part, field, -subnd, sector) /* Inlines to alloc and free disk stats in struct gendisk */ #ifdef CONFIG_SMP @@ -401,8 +376,8 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ /* drivers/block/ll_rw_blk.c */ -extern void disk_round_stats(struct gendisk *disk); -extern void part_round_stats(struct hd_struct *part); +extern void disk_round_stats(int cpu, struct gendisk *disk); +extern void part_round_stats(int cpu, struct hd_struct *part); /* drivers/block/genhd.c */ extern int get_blkdev_list(char *, int); -- cgit v1.2.3 From bcce3de1be61e424deef35d1e86e86a35c4b6e65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:22 +0900 Subject: block: implement extended dev numbers Implement extended device numbers. A block driver can tell block layer that it wants to use extended device numbers. After the usual minor space is used up, block layer automatically allocates devt's from EXT_BLOCK_MAJOR. Currently only one major number is allocated for this but as the allocation is strictly on-demand, ~1mil minor space under it should suffice unless the system actually has more than ~1mil partitions and if that ever happens adding more majors to the extended devt area is easy. Due to internal implementation issues, the first partition can't be allocated on the extended area. In other words, genhd->minors should at least be 1. This limitation will be lifted by later changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 13 ++++++++++--- include/linux/major.h | 2 ++ 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ac8a901f200..6fc53242406 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -113,13 +113,15 @@ struct hd_struct { #define GENHD_FL_FAIL 64 struct gendisk { - /* major, first_minor and minors are input parameters only, - * don't use directly. Use disk_devt() and disk_max_parts(). + /* major, first_minor, minors and ext_minors are input + * parameters only, don't use directly. Use disk_devt() and + * disk_max_parts(). */ int major; /* major number of driver */ int first_minor; int minors; /* maximum number of minors, =1 for * disks that can't be partitioned. */ + int ext_minors; /* number of extended dynamic minors */ char disk_name[32]; /* name of major driver */ @@ -167,7 +169,7 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part) static inline int disk_max_parts(struct gendisk *disk) { - return disk->minors - 1; + return disk->minors + disk->ext_minors - 1; } static inline dev_t disk_devt(struct gendisk *disk) @@ -554,6 +556,8 @@ struct unixware_disklabel { #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 +extern int blk_alloc_devt(struct hd_struct *part, dev_t *devt); +extern void blk_free_devt(dev_t devt); extern dev_t blk_lookup_devt(const char *name, int partno); extern char *disk_name (struct gendisk *hd, int partno, char *buf); @@ -564,6 +568,9 @@ extern void printk_all_partitions(void); extern struct gendisk *alloc_disk_node(int minors, int node_id); extern struct gendisk *alloc_disk(int minors); +extern struct gendisk *alloc_disk_ext_node(int minors, int ext_minrs, + int node_id); +extern struct gendisk *alloc_disk_ext(int minors, int ext_minors); extern struct kobject *get_disk(struct gendisk *disk); extern void put_disk(struct gendisk *disk); extern void blk_register_region(dev_t devt, unsigned long range, diff --git a/include/linux/major.h b/include/linux/major.h index 53d5fafd85c..88249452b93 100644 --- a/include/linux/major.h +++ b/include/linux/major.h @@ -170,4 +170,6 @@ #define VIOTAPE_MAJOR 230 +#define BLOCK_EXT_MAJOR 259 + #endif -- cgit v1.2.3 From 1f0142905d4812966831613847db38a66da29eb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:23 +0900 Subject: block: adjust formatting for large minors and add ext_range sysfs attr With extended minors and the soon-to-follow debug feature, large minor numbers for block devices will be common. This patch does the followings to make printouts pretty. * Adapt print formats such that large minors don't break the formatting. * For extended MAJ:MIN, %02x%02x for MAJ:MIN used in printk_all_partitions() doesn't cut it anymore. Update it such that %03x:%05x is used if either MAJ or MIN doesn't fit in %02x. * Implement ext_range sysfs attribute which shows total minors the device can use including both conventional minor space and the extended one. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 860689f541b..02a9fb5a830 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1685,6 +1685,7 @@ extern void chrdev_show(struct seq_file *,off_t); /* fs/block_dev.c */ #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ +#define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ #ifdef CONFIG_BLOCK #define BLKDEV_MAJOR_HASH_SIZE 255 -- cgit v1.2.3 From ed9e1982347b36573cd622ee5f4e2a7ccd79b3fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:05 +0900 Subject: block: implement and use {disk|part}_to_dev() Implement {disk|part}_to_dev() and use them to access generic device instead of directly dereferencing {disk|part}->dev. To make sure no user is left behind, rename generic devices fields to __dev. This is in preparation of unifying partition 0 handling with other partitions. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6fc53242406..e4e18c509ac 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -15,9 +15,11 @@ #ifdef CONFIG_BLOCK -#define kobj_to_dev(k) container_of(k, struct device, kobj) -#define dev_to_disk(device) container_of(device, struct gendisk, dev) -#define dev_to_part(device) container_of(device, struct hd_struct, dev) +#define kobj_to_dev(k) container_of((k), struct device, kobj) +#define dev_to_disk(device) container_of((device), struct gendisk, __dev) +#define dev_to_part(device) container_of((device), struct hd_struct, __dev) +#define disk_to_dev(disk) (&((disk)->__dev)) +#define part_to_dev(part) (&((part)->__dev)) extern struct device_type part_type; extern struct kobject *block_depr; @@ -88,7 +90,7 @@ struct disk_stats { struct hd_struct { sector_t start_sect; sector_t nr_sects; - struct device dev; + struct device __dev; struct kobject *holder_dir; int policy, partno; #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -139,7 +141,7 @@ struct gendisk { int flags; struct device *driverfs_dev; // FIXME: remove - struct device dev; + struct device __dev; struct kobject *holder_dir; struct kobject *slave_dir; @@ -163,7 +165,7 @@ struct gendisk { static inline struct gendisk *part_to_disk(struct hd_struct *part) { if (likely(part)) - return dev_to_disk((part)->dev.parent); + return dev_to_disk(part_to_dev(part)->parent); return NULL; } @@ -174,12 +176,12 @@ static inline int disk_max_parts(struct gendisk *disk) static inline dev_t disk_devt(struct gendisk *disk) { - return disk->dev.devt; + return disk_to_dev(disk)->devt; } static inline dev_t part_devt(struct hd_struct *part) { - return part->dev.devt; + return part_to_dev(part)->devt; } extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); @@ -187,7 +189,7 @@ extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); static inline void disk_put_part(struct hd_struct *part) { if (likely(part)) - put_device(&part->dev); + put_device(part_to_dev(part)); } /* -- cgit v1.2.3 From b5d0b9df0ba5d9a044f3a21e7544f53d90bd1465 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:06:42 +0200 Subject: block: introduce partition 0 genhd and partition code handled disk and partitions separately. All information about the whole disk was in struct genhd and partitions in struct hd_struct. However, the whole disk (part0) and other partitions have a lot in common and the data structures end up having good number of common fields and thus separate code paths doing the same thing. Also, the partition array was indexed by partno - 1 which gets pretty confusing at times. This patch introduces partition 0 and makes the partition array indexed by partno. Following patches will unify the handling of disk and parts piece-by-piece. This patch also implements disk_partitionable() which tests whether a disk is partitionable. With coming dynamic partition array change, the most common usage of disk_max_parts() will be testing whether a disk is partitionable and the number of max partitions will become much less important. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e4e18c509ac..9e866a2aee5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -127,12 +127,13 @@ struct gendisk { char disk_name[32]; /* name of major driver */ - /* Array of pointers to partitions indexed by partno - 1. + /* Array of pointers to partitions indexed by partno. * Protected with matching bdev lock but stat and other * non-critical accesses use RCU. Always access through * helpers. */ struct hd_struct **__part; + struct hd_struct part0; struct block_device_operations *fops; struct request_queue *queue; @@ -171,7 +172,12 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part) static inline int disk_max_parts(struct gendisk *disk) { - return disk->minors + disk->ext_minors - 1; + return disk->minors + disk->ext_minors; +} + +static inline bool disk_partitionable(struct gendisk *disk) +{ + return disk_max_parts(disk) > 1; } static inline dev_t disk_devt(struct gendisk *disk) @@ -197,6 +203,7 @@ static inline void disk_put_part(struct hd_struct *part) */ #define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */ #define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */ +#define DISK_PITER_INCL_PART0 (1 << 2) /* include partition 0 */ struct disk_part_iter { struct gendisk *disk; -- cgit v1.2.3 From 80795aefb76d10c5d698e60c7e7750b5330787da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:07 +0900 Subject: block: move capacity from disk to part0 Move disk->capacity to part0->nr_sects and convert all users who directly accessed the field to use {get|set}_capacity(). This is done early to allow the __dev field to be moved. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 9e866a2aee5..1cf828148ec 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -138,7 +138,6 @@ struct gendisk { struct block_device_operations *fops; struct request_queue *queue; void *private_data; - sector_t capacity; int flags; struct device *driverfs_dev; // FIXME: remove @@ -411,11 +410,11 @@ static inline sector_t get_start_sect(struct block_device *bdev) } static inline sector_t get_capacity(struct gendisk *disk) { - return disk->capacity; + return disk->part0.nr_sects; } static inline void set_capacity(struct gendisk *disk, sector_t size) { - disk->capacity = size; + disk->part0.nr_sects = size; } #ifdef CONFIG_SOLARIS_X86_PARTITION -- cgit v1.2.3 From 548b10eb2959c96cef6fc29fc96e0931eeb53bc5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Aug 2008 09:01:47 +0200 Subject: block: move __dev from disk to part0 Move disk->__dev to part0->__dev. This simplifies bdget_disk() and lookup_devt() and allows common sysfs attributes to be unified. part_to_disk() is updated to handle part0 -> disk. Updated to include a fix from Bartlomiej Zolnierkiewicz , he writes: "part0 is a "special" partition and doesn't need to have capacity set - this fixes regression caused by "block: move __dev from disk to part0" commit." Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 1cf828148ec..ff293ec8b3f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -16,9 +16,9 @@ #ifdef CONFIG_BLOCK #define kobj_to_dev(k) container_of((k), struct device, kobj) -#define dev_to_disk(device) container_of((device), struct gendisk, __dev) +#define dev_to_disk(device) container_of((device), struct gendisk, part0.__dev) #define dev_to_part(device) container_of((device), struct hd_struct, __dev) -#define disk_to_dev(disk) (&((disk)->__dev)) +#define disk_to_dev(disk) (&(disk)->part0.__dev) #define part_to_dev(part) (&((part)->__dev)) extern struct device_type part_type; @@ -141,7 +141,6 @@ struct gendisk { int flags; struct device *driverfs_dev; // FIXME: remove - struct device __dev; struct kobject *holder_dir; struct kobject *slave_dir; @@ -164,8 +163,12 @@ struct gendisk { static inline struct gendisk *part_to_disk(struct hd_struct *part) { - if (likely(part)) - return dev_to_disk(part_to_dev(part)->parent); + if (likely(part)) { + if (part->partno) + return dev_to_disk(part_to_dev(part)->parent); + else + return dev_to_disk(part_to_dev(part)); + } return NULL; } -- cgit v1.2.3 From e56105214943ce5f0901d20e972a7cfd0d1d0656 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:09 +0900 Subject: block: unify sysfs size node handling Now that capacity and __dev are moved to part0, part0 and others can share the same method. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ff293ec8b3f..9cb8380cf0e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -591,6 +591,9 @@ extern void blk_register_region(dev_t devt, unsigned long range, void *data); extern void blk_unregister_region(dev_t devt, unsigned long range); +extern ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf); + #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } -- cgit v1.2.3 From b7db9956e57c8151b930d5e5fe5c766e6aad3ff7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:10 +0900 Subject: block: move policy from disk to part0 Move disk->policy to part0->policy. Implement and use get_disk_ro(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 9cb8380cf0e..4411bdd671d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -145,7 +145,6 @@ struct gendisk { struct kobject *slave_dir; struct timer_rand_state *random; - int policy; atomic_t sync_io; /* RAID */ unsigned long stamp; @@ -403,6 +402,11 @@ extern struct block_device *bdget_disk(struct gendisk *disk, int partno); extern void set_device_ro(struct block_device *bdev, int flag); extern void set_disk_ro(struct gendisk *disk, int flag); +static inline int get_disk_ro(struct gendisk *disk) +{ + return disk->part0.policy; +} + /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk); extern void rand_initialize_disk(struct gendisk *disk); -- cgit v1.2.3 From 4c46501d1659475dc6c89554af6ce7fe6ecf615c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:11 +0900 Subject: block: move holder_dir from disk to part0 Move disk->holder_dir to part0->holder_dir. Kill now mostly superflous bdev_get_holder(). While at it, kill superflous kobject_get/put() around holder_dir, slave_dir and cmd_filter creation and collapse disk_sysfs_add_subdirs() into register_disk(). These serve no purpose but obfuscating the code. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 4411bdd671d..2c0e1b597ab 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -141,7 +141,6 @@ struct gendisk { int flags; struct device *driverfs_dev; // FIXME: remove - struct kobject *holder_dir; struct kobject *slave_dir; struct timer_rand_state *random; -- cgit v1.2.3 From 0762b8bde9729f10f8e6249809660ff2ec3ad735 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:12 +0900 Subject: block: always set bdev->bd_part Till now, bdev->bd_part is set only if the bdev was for parts other than part0. This patch makes bdev->bd_part always set so that code paths don't have to differenciate common handling. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 2c0e1b597ab..45a3682b5d8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -412,7 +412,7 @@ extern void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) { - return bdev->bd_contains == bdev ? 0 : bdev->bd_part->start_sect; + return bdev->bd_part->start_sect; } static inline sector_t get_capacity(struct gendisk *disk) { -- cgit v1.2.3 From eddb2e26b5ee3c5da68ba4bf1921ba20e2097bff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:13 +0900 Subject: block: kill GENHD_FL_FAIL and use part0->make_it_fail GENHD_FL_FAIL for disk is what make_it_fail is for parts. Kill it and use part0->make_it_fail. Sysfs node handling is unified too. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 45a3682b5d8..3d15b42dc35 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -112,7 +112,6 @@ struct hd_struct { #define GENHD_FL_CD 8 #define GENHD_FL_UP 16 #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 -#define GENHD_FL_FAIL 64 struct gendisk { /* major, first_minor, minors and ext_minors are input @@ -596,6 +595,13 @@ extern void blk_unregister_region(dev_t devt, unsigned long range); extern ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); +#ifdef CONFIG_FAIL_MAKE_REQUEST +extern ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf); +extern ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +#endif /* CONFIG_FAIL_MAKE_REQUEST */ #else /* CONFIG_BLOCK */ -- cgit v1.2.3 From 074a7aca7afa6f230104e8e65eba3420263714a5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:14 +0900 Subject: block: move stats from disk to part0 Move stats related fields - stamp, in_flight, dkstats - from disk to part0 and unify stat handling such that... * part_stat_*() now updates part0 together if the specified partition is not part0. ie. part_stat_*() are now essentially all_stat_*(). * {disk|all}_stat_*() are gone. * part_round_stats() is updated similary. It handles part0 stats automatically and disk_round_stats() is killed. * part_{inc|dec}_in_fligh() is implemented which automatically updates part0 stats for parts other than part0. * disk_map_sector_rcu() is updated to return part0 if no part matches. Combined with the above changes, this makes NULL special case handling in callers unnecessary. * Separate stats show code paths for disk are collapsed into part stats show code paths. * Rename disk_stat_lock/unlock() to part_stat_lock/unlock() While at it, reposition stat handling macros a bit and add missing parentheses around macro parameters. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 159 ++++++++++++++------------------------------------ 1 file changed, 45 insertions(+), 114 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3d15b42dc35..c90e1b4fbe5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -145,13 +145,6 @@ struct gendisk { struct timer_rand_state *random; atomic_t sync_io; /* RAID */ - unsigned long stamp; - int in_flight; -#ifdef CONFIG_SMP - struct disk_stats *dkstats; -#else - struct disk_stats dkstats; -#endif struct work_struct async_notify; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; @@ -232,46 +225,18 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, * internal use only. */ #ifdef CONFIG_SMP -#define disk_stat_lock() ({ rcu_read_lock(); get_cpu(); }) -#define disk_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) - -#define disk_stat_add(cpu, gendiskp, field, addnd) \ - (per_cpu_ptr(gendiskp->dkstats, cpu)->field += addnd) - -#define disk_stat_read(gendiskp, field) \ -({ \ - typeof(gendiskp->dkstats->field) res = 0; \ - int i; \ - for_each_possible_cpu(i) \ - res += per_cpu_ptr(gendiskp->dkstats, i)->field; \ - res; \ -}) - -static inline void disk_stat_set_all(struct gendisk *gendiskp, int value) -{ - int i; - - for_each_possible_cpu(i) - memset(per_cpu_ptr(gendiskp->dkstats, i), value, - sizeof(struct disk_stats)); -} +#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) +#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) -#define part_stat_add(cpu, part, field, addnd) \ - (per_cpu_ptr(part->dkstats, cpu)->field += addnd) - -#define all_stat_add(cpu, gendiskp, part, field, addnd, sector) \ -({ \ - if (part) \ - part_stat_add(cpu, part, field, addnd); \ - disk_stat_add(cpu, gendiskp, field, addnd); \ -}) +#define __part_stat_add(cpu, part, field, addnd) \ + (per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd)) #define part_stat_read(part, field) \ ({ \ - typeof(part->dkstats->field) res = 0; \ + typeof((part)->dkstats->field) res = 0; \ int i; \ for_each_possible_cpu(i) \ - res += per_cpu_ptr(part->dkstats, i)->field; \ + res += per_cpu_ptr((part)->dkstats, i)->field; \ res; \ }) @@ -284,109 +249,73 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) sizeof(struct disk_stats)); } -#else /* !CONFIG_SMP */ -#define disk_stat_lock() ({ rcu_read_lock(); 0; }) -#define disk_stat_unlock() rcu_read_unlock() - -#define disk_stat_add(cpu, gendiskp, field, addnd) \ - (gendiskp->dkstats.field += addnd) -#define disk_stat_read(gendiskp, field) (gendiskp->dkstats.field) - -static inline void disk_stat_set_all(struct gendisk *gendiskp, int value) +static inline int init_part_stats(struct hd_struct *part) { - memset(&gendiskp->dkstats, value, sizeof (struct disk_stats)); + part->dkstats = alloc_percpu(struct disk_stats); + if (!part->dkstats) + return 0; + return 1; } -#define part_stat_add(cpu, part, field, addnd) \ - (part->dkstats.field += addnd) - -#define all_stat_add(cpu, gendiskp, part, field, addnd, sector) \ -({ \ - if (part) \ - part_stat_add(cpu, part, field, addnd); \ - disk_stat_add(cpu, gendiskp, field, addnd); \ -}) - -#define part_stat_read(part, field) (part->dkstats.field) - -static inline void part_stat_set_all(struct hd_struct *part, int value) +static inline void free_part_stats(struct hd_struct *part) { - memset(&part->dkstats, value, sizeof(struct disk_stats)); + free_percpu(part->dkstats); } -#endif /* CONFIG_SMP */ - -#define disk_stat_dec(cpu, gendiskp, field) \ - disk_stat_add(cpu, gendiskp, field, -1) -#define disk_stat_inc(cpu, gendiskp, field) \ - disk_stat_add(cpu, gendiskp, field, 1) -#define disk_stat_sub(cpu, gendiskp, field, subnd) \ - disk_stat_add(cpu, gendiskp, field, -subnd) - -#define part_stat_dec(cpu, gendiskp, field) \ - part_stat_add(cpu, gendiskp, field, -1) -#define part_stat_inc(cpu, gendiskp, field) \ - part_stat_add(cpu, gendiskp, field, 1) -#define part_stat_sub(cpu, gendiskp, field, subnd) \ - part_stat_add(cpu, gendiskp, field, -subnd) +#else /* !CONFIG_SMP */ +#define part_stat_lock() ({ rcu_read_lock(); 0; }) +#define part_stat_unlock() rcu_read_unlock() -#define all_stat_dec(cpu, gendiskp, field, sector) \ - all_stat_add(cpu, gendiskp, field, -1, sector) -#define all_stat_inc(cpu, gendiskp, part, field, sector) \ - all_stat_add(cpu, gendiskp, part, field, 1, sector) -#define all_stat_sub(cpu, gendiskp, part, field, subnd, sector) \ - all_stat_add(cpu, gendiskp, part, field, -subnd, sector) +#define __part_stat_add(cpu, part, field, addnd) \ + ((part)->dkstats.field += addnd) -/* Inlines to alloc and free disk stats in struct gendisk */ -#ifdef CONFIG_SMP -static inline int init_disk_stats(struct gendisk *disk) -{ - disk->dkstats = alloc_percpu(struct disk_stats); - if (!disk->dkstats) - return 0; - return 1; -} +#define part_stat_read(part, field) ((part)->dkstats.field) -static inline void free_disk_stats(struct gendisk *disk) +static inline void part_stat_set_all(struct hd_struct *part, int value) { - free_percpu(disk->dkstats); + memset(&part->dkstats, value, sizeof(struct disk_stats)); } static inline int init_part_stats(struct hd_struct *part) { - part->dkstats = alloc_percpu(struct disk_stats); - if (!part->dkstats) - return 0; return 1; } static inline void free_part_stats(struct hd_struct *part) { - free_percpu(part->dkstats); } -#else /* CONFIG_SMP */ -static inline int init_disk_stats(struct gendisk *disk) -{ - return 1; -} +#endif /* CONFIG_SMP */ -static inline void free_disk_stats(struct gendisk *disk) -{ -} +#define part_stat_add(cpu, part, field, addnd) do { \ + __part_stat_add((cpu), (part), field, addnd); \ + if ((part)->partno) \ + __part_stat_add((cpu), &part_to_disk((part))->part0, \ + field, addnd); \ +} while (0) -static inline int init_part_stats(struct hd_struct *part) +#define part_stat_dec(cpu, gendiskp, field) \ + part_stat_add(cpu, gendiskp, field, -1) +#define part_stat_inc(cpu, gendiskp, field) \ + part_stat_add(cpu, gendiskp, field, 1) +#define part_stat_sub(cpu, gendiskp, field, subnd) \ + part_stat_add(cpu, gendiskp, field, -subnd) + +static inline void part_inc_in_flight(struct hd_struct *part) { - return 1; + part->in_flight++; + if (part->partno) + part_to_disk(part)->part0.in_flight++; } -static inline void free_part_stats(struct hd_struct *part) +static inline void part_dec_in_flight(struct hd_struct *part) { + part->in_flight--; + if (part->partno) + part_to_disk(part)->part0.in_flight--; } -#endif /* CONFIG_SMP */ /* drivers/block/ll_rw_blk.c */ -extern void disk_round_stats(int cpu, struct gendisk *disk); extern void part_round_stats(int cpu, struct hd_struct *part); /* drivers/block/genhd.c */ @@ -595,6 +524,8 @@ extern void blk_unregister_region(dev_t devt, unsigned long range); extern ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf); #ifdef CONFIG_FAIL_MAKE_REQUEST extern ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); -- cgit v1.2.3 From 540eed5637b766bb1e881ef744c42617760b4815 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:15 +0900 Subject: block: make partition array dynamic disk->__part used to be statically allocated to the maximum possible number of partitions. This patch makes partition array allocation dynamic. The added overhead is minimal as only real change is one memory dereference changed to RCU one. This saves both a bit of memory and cpu cycles iterating through unoccupied slots and makes increasing partition limit easier. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c90e1b4fbe5..ecf649c3dee 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -113,6 +113,21 @@ struct hd_struct { #define GENHD_FL_UP 16 #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 +#define BLK_SCSI_MAX_CMDS (256) +#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) + +struct blk_scsi_cmd_filter { + unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; + unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; + struct kobject kobj; +}; + +struct disk_part_tbl { + struct rcu_head rcu_head; + int len; + struct hd_struct *part[]; +}; + struct gendisk { /* major, first_minor, minors and ext_minors are input * parameters only, don't use directly. Use disk_devt() and @@ -131,7 +146,7 @@ struct gendisk { * non-critical accesses use RCU. Always access through * helpers. */ - struct hd_struct **__part; + struct disk_part_tbl *part_tbl; struct hd_struct part0; struct block_device_operations *fops; @@ -149,6 +164,7 @@ struct gendisk { #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif + int node_id; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) @@ -503,6 +519,7 @@ extern void blk_free_devt(dev_t devt); extern dev_t blk_lookup_devt(const char *name, int partno); extern char *disk_name (struct gendisk *hd, int partno, char *buf); +extern int disk_expand_part_tbl(struct gendisk *disk, int target); extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); extern int __must_check add_partition(struct gendisk *, int, sector_t, sector_t, int); extern void delete_partition(struct gendisk *, int); -- cgit v1.2.3 From 689d6fac40b41c7bf154f362deaf442548e4dc81 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:16 +0900 Subject: block: replace @ext_minors with GENHD_FL_EXT_DEVT With previous changes, it's meaningless to limit the number of partitions. Replace @ext_minors with GENHD_FL_EXT_DEVT such that setting the flag allows the disk to have maximum number of allowed partitions (only limited by the number of entries in parsed_partitions as determined by MAX_PART constant). This kills not-too-pretty alloc_disk_ext[_node]() functions and makes @minors parameter to alloc_disk[_node]() unnecessary. The parameter is left alone to avoid disturbing the users. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ecf649c3dee..04524c213de 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -58,6 +58,8 @@ enum { UNIXWARE_PARTITION = 0x63, /* Same as GNU_HURD and SCO Unix */ }; +#define DISK_MAX_PARTS 256 + #include #include #include @@ -112,6 +114,7 @@ struct hd_struct { #define GENHD_FL_CD 8 #define GENHD_FL_UP 16 #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 +#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) @@ -129,15 +132,13 @@ struct disk_part_tbl { }; struct gendisk { - /* major, first_minor, minors and ext_minors are input - * parameters only, don't use directly. Use disk_devt() and - * disk_max_parts(). + /* major, first_minor and minors are input parameters only, + * don't use directly. Use disk_devt() and disk_max_parts(). */ int major; /* major number of driver */ int first_minor; int minors; /* maximum number of minors, =1 for * disks that can't be partitioned. */ - int ext_minors; /* number of extended dynamic minors */ char disk_name[32]; /* name of major driver */ @@ -180,7 +181,9 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part) static inline int disk_max_parts(struct gendisk *disk) { - return disk->minors + disk->ext_minors; + if (disk->flags & GENHD_FL_EXT_DEVT) + return DISK_MAX_PARTS; + return disk->minors; } static inline bool disk_partitionable(struct gendisk *disk) @@ -527,9 +530,6 @@ extern void printk_all_partitions(void); extern struct gendisk *alloc_disk_node(int minors, int node_id); extern struct gendisk *alloc_disk(int minors); -extern struct gendisk *alloc_disk_ext_node(int minors, int ext_minrs, - int node_id); -extern struct gendisk *alloc_disk_ext(int minors, int ext_minors); extern struct kobject *get_disk(struct gendisk *disk); extern void put_disk(struct gendisk *disk); extern void blk_register_region(dev_t devt, unsigned long range, -- cgit v1.2.3 From 3e1a7ff8a0a7b948f2684930166954f9e8e776fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:17 +0900 Subject: block: allow disk to have extended device number Now that disk and partition handlings are mostly unified, it's easy to allow disk to have extended device number. This patch makes add_disk() use extended device number if disk->minors is zero. Both sd and ide-disk are updated to use this. * sd_format_disk_name() is implemented which can generically determine the drive name. This removes disk number restriction stemming from limited device names. * If sd index goes over SD_MAX_DISKS (which can be increased now BTW), sd simply doesn't initialize minors letting block layer choose extended device number. * If CONFIG_DEBUG_EXT_DEVT is set, both sd and ide-disk always set minors to 0 and use extended device numbers. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 04524c213de..206cdf96c3a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -59,6 +59,7 @@ enum { }; #define DISK_MAX_PARTS 256 +#define DISK_NAME_LEN 32 #include #include @@ -140,7 +141,7 @@ struct gendisk { int minors; /* maximum number of minors, =1 for * disks that can't be partitioned. */ - char disk_name[32]; /* name of major driver */ + char disk_name[DISK_NAME_LEN]; /* name of major driver */ /* Array of pointers to partitions indexed by partno. * Protected with matching bdev lock but stat and other -- cgit v1.2.3 From 18887ad910e56066233a07fd3cfb2fa11338b782 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Jul 2008 13:08:45 +0200 Subject: block: make kblockd_schedule_work() take the queue as parameter Preparatory patch for checking queuing affinity. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1adb03827bd..10aa46c8f17 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -912,7 +912,7 @@ static inline void put_dev_sector(Sector p) } struct work_struct; -int kblockd_schedule_work(struct work_struct *work); +int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); void kblockd_flush_work(struct work_struct *work); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ -- cgit v1.2.3 From c7c22e4d5c1fdebfac4dba76de7d0338c2b0d832 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Sep 2008 20:26:01 +0200 Subject: block: add support for IO CPU affinity This patch adds support for controlling the IO completion CPU of either all requests on a queue, or on a per-request basis. We export a sysfs variable (rq_affinity) which, if set, migrates completions of requests to the CPU that originally submitted it. A bio helper (bio_set_completion_cpu()) is also added, so that queuers can ask for completion on that specific CPU. In testing, this has been show to cut the system time by as much as 20-40% on synthetic workloads where CPU affinity is desired. This requires a little help from the architecture, so it'll only work as designed for archs that are using the new generic smp helper infrastructure. Signed-off-by: Jens Axboe --- include/linux/bio.h | 11 +++++++++++ include/linux/blkdev.h | 5 ++++- include/linux/elevator.h | 8 ++++---- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 2c0c09034fd..13aba20edb2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -81,6 +81,8 @@ struct bio { unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ + unsigned int bi_comp_cpu; /* completion CPU */ + struct bio_vec *bi_io_vec; /* the actual vec list */ bio_end_io_t *bi_end_io; @@ -105,6 +107,7 @@ struct bio { #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ #define BIO_EOPNOTSUPP 7 /* not supported */ +#define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* @@ -342,6 +345,14 @@ void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); extern unsigned int bvec_nr_vecs(unsigned short idx); +/* + * Allow queuer to specify a completion CPU for this bio + */ +static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) +{ + bio->bi_comp_cpu = cpu; +} + /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 10aa46c8f17..93204bf7b29 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -139,7 +140,8 @@ enum rq_flag_bits { */ struct request { struct list_head queuelist; - struct list_head donelist; + struct call_single_data csd; + int cpu; struct request_queue *q; @@ -420,6 +422,7 @@ struct request_queue #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ +#define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ static inline int queue_is_locked(struct request_queue *q) { diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 639624b55fb..bb791c311a5 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -173,15 +173,15 @@ enum { #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) /* - * Hack to reuse the donelist list_head as the fifo time holder while + * Hack to reuse the csd.list list_head as the fifo time holder while * the request is in the io scheduler. Saves an unsigned long in rq. */ -#define rq_fifo_time(rq) ((unsigned long) (rq)->donelist.next) -#define rq_set_fifo_time(rq,exp) ((rq)->donelist.next = (void *) (exp)) +#define rq_fifo_time(rq) ((unsigned long) (rq)->csd.list.next) +#define rq_set_fifo_time(rq,exp) ((rq)->csd.list.next = (void *) (exp)) #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) do { \ list_del_init(&(rq)->queuelist); \ - INIT_LIST_HEAD(&(rq)->donelist); \ + INIT_LIST_HEAD(&(rq)->csd.list); \ } while (0) /* -- cgit v1.2.3 From ab780f1ece0dc8d5e8e8e85435acc5e4747ccda3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 Aug 2008 10:25:02 +0200 Subject: block: inherit CPU completion on bio->rq and rq->rq merges Somewhat incomplete, as we do allow merges of requests and bios that have different completion CPUs given. This is done on the assumption that a larger IO is still more beneficial than CPU locality. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 93204bf7b29..12df8efeef1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -545,6 +545,7 @@ enum { #define blk_pm_request(rq) \ (blk_pm_suspend_request(rq) || blk_pm_resume_request(rq)) +#define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED) #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER) #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) -- cgit v1.2.3 From a3bce90edd8f6cafe3f63b1a943800792e830178 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 28 Aug 2008 16:17:05 +0900 Subject: block: add gfp_mask argument to blk_rq_map_user and blk_rq_map_user_iov Currently, blk_rq_map_user and blk_rq_map_user_iov always do GFP_KERNEL allocation. This adds gfp_mask argument to blk_rq_map_user and blk_rq_map_user_iov so sg can use it (sg always does GFP_ATOMIC allocation). Signed-off-by: FUJITA Tomonori Signed-off-by: Douglas Gilbert Cc: Mike Christie Cc: James Bottomley Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 +++++---- include/linux/blkdev.h | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 13aba20edb2..200b185c3e8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -325,11 +325,11 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); extern int bio_get_nr_vecs(struct block_device *); extern struct bio *bio_map_user(struct request_queue *, struct block_device *, - unsigned long, unsigned int, int); + unsigned long, unsigned int, int, gfp_t); struct sg_iovec; extern struct bio *bio_map_user_iov(struct request_queue *, struct block_device *, - struct sg_iovec *, int, int); + struct sg_iovec *, int, int, gfp_t); extern void bio_unmap_user(struct bio *); extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, gfp_t); @@ -337,9 +337,10 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, gfp_t, int); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int); +extern struct bio *bio_copy_user(struct request_queue *, unsigned long, + unsigned int, int, gfp_t); extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, - int, int); + int, int, gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12df8efeef1..00e388d0e22 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -710,11 +710,12 @@ extern void __blk_stop_queue(struct request_queue *q); extern void __blk_run_queue(struct request_queue *); extern void blk_run_queue(struct request_queue *); extern void blk_start_queueing(struct request_queue *); -extern int blk_rq_map_user(struct request_queue *, struct request *, void __user *, unsigned long); +extern int blk_rq_map_user(struct request_queue *, struct request *, + void __user *, unsigned long, gfp_t); extern int blk_rq_unmap_user(struct bio *); extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); extern int blk_rq_map_user_iov(struct request_queue *, struct request *, - struct sg_iovec *, int, unsigned int); + struct sg_iovec *, int, unsigned int, gfp_t); extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, -- cgit v1.2.3 From 152e283fdfea0cd11e297d982378b55937842dde Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 28 Aug 2008 16:17:06 +0900 Subject: block: introduce struct rq_map_data to use reserved pages This patch introduces struct rq_map_data to enable bio_copy_use_iov() use reserved pages. Currently, bio_copy_user_iov allocates bounce pages but drivers/scsi/sg.c wants to allocate pages by itself and use them. struct rq_map_data can be used to pass allocated pages to bio_copy_user_iov. The current users of bio_copy_user_iov simply passes NULL (they don't want to use pre-allocated pages). Signed-off-by: FUJITA Tomonori Cc: Jens Axboe Cc: Douglas Gilbert Cc: Mike Christie Cc: James Bottomley Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 +++++--- include/linux/blkdev.h | 12 ++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 200b185c3e8..bc386cd5e99 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -327,6 +327,7 @@ extern int bio_get_nr_vecs(struct block_device *); extern struct bio *bio_map_user(struct request_queue *, struct block_device *, unsigned long, unsigned int, int, gfp_t); struct sg_iovec; +struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, struct block_device *, struct sg_iovec *, int, int, gfp_t); @@ -337,9 +338,10 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, gfp_t, int); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -extern struct bio *bio_copy_user(struct request_queue *, unsigned long, - unsigned int, int, gfp_t); -extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, +extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, + unsigned long, unsigned int, int, gfp_t); +extern struct bio *bio_copy_user_iov(struct request_queue *, + struct rq_map_data *, struct sg_iovec *, int, int, gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 00e388d0e22..358ac423ed2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -642,6 +642,12 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) } #endif /* CONFIG_MMU */ +struct rq_map_data { + struct page **pages; + int page_order; + int nr_entries; +}; + struct req_iterator { int i; struct bio *bio; @@ -711,11 +717,13 @@ extern void __blk_run_queue(struct request_queue *); extern void blk_run_queue(struct request_queue *); extern void blk_start_queueing(struct request_queue *); extern int blk_rq_map_user(struct request_queue *, struct request *, - void __user *, unsigned long, gfp_t); + struct rq_map_data *, void __user *, unsigned long, + gfp_t); extern int blk_rq_unmap_user(struct bio *); extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); extern int blk_rq_map_user_iov(struct request_queue *, struct request *, - struct sg_iovec *, int, unsigned int, gfp_t); + struct rq_map_data *, struct sg_iovec *, int, + unsigned int, gfp_t); extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, -- cgit v1.2.3 From 879040742cf09f2360a9ac41846288707e4e567c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 28 Aug 2008 15:05:58 +0900 Subject: block: add blk_rq_aligned helper function This adds blk_rq_aligned helper function to see if alignment and padding requirement is satisfied for DMA transfer. This also converts blk_rq_map_kern and __blk_rq_map_user to use the helper function. Signed-off-by: FUJITA Tomonori Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 358ac423ed2..9c254926042 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -899,6 +899,13 @@ static inline int queue_dma_alignment(struct request_queue *q) return q ? q->dma_alignment : 511; } +static inline int blk_rq_aligned(struct request_queue *q, void *addr, + unsigned int len) +{ + unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; + return !((unsigned long)addr & alignment) && !(len & alignment); +} + /* assumes size > 256 */ static inline unsigned int blksize_bits(unsigned int size) { -- cgit v1.2.3 From 818827669d85b84241696ffef2de485db46b0b5e Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 2 Sep 2008 16:20:19 +0900 Subject: block: make blk_rq_map_user take a NULL user-space buffer This patch changes blk_rq_map_user to accept a NULL user-space buffer with a READ command if rq_map_data is not NULL. Thus a caller can pass page frames to lk_rq_map_user to just set up a request and bios with page frames propely. bio_uncopy_user (called via blk_rq_unmap_user) doesn't copy data to user space with such request. Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index bc386cd5e99..7af373f253d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -108,6 +108,7 @@ struct bio { #define BIO_USER_MAPPED 6 /* contains user pages */ #define BIO_EOPNOTSUPP 7 /* not supported */ #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ +#define BIO_NULL_MAPPED 9 /* contains invalid user pages */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* -- cgit v1.2.3 From 0c002c2f74e10baa9021d3ecc50585c6eafea568 Mon Sep 17 00:00:00 2001 From: Andrew Patterson Date: Thu, 4 Sep 2008 14:27:20 -0600 Subject: Wrapper for lower-level revalidate_disk routines. This is a wrapper for the lower-level revalidate_disk call-backs such as sd_revalidate_disk(). It allows us to perform pre and post operations when calling them. We will use this wrapper in a later patch to adjust block device sizes after an online resize (a _post_ operation). Signed-off-by: Andrew Patterson Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 02a9fb5a830..d63461f9798 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1722,6 +1722,7 @@ extern int fs_may_remount_ro(struct super_block *); */ #define bio_data_dir(bio) ((bio)->bi_rw & 1) +extern int revalidate_disk(struct gendisk *); extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); -- cgit v1.2.3 From c3279d1454cdfed02a557d789d8a6d08ab4cbe70 Mon Sep 17 00:00:00 2001 From: Andrew Patterson Date: Thu, 4 Sep 2008 14:27:25 -0600 Subject: Adjust block device size after an online resize of a disk. The revalidate_disk routine now checks if a disk has been resized by comparing the gendisk capacity to the bdev inode size. If they are different (usually because the disk has been resized underneath the kernel) the bdev inode size is adjusted to match the capacity. Signed-off-by: Andrew Patterson Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index d63461f9798..32477e8872d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1722,6 +1722,8 @@ extern int fs_may_remount_ro(struct super_block *); */ #define bio_data_dir(bio) ((bio)->bi_rw & 1) +extern void check_disk_size_change(struct gendisk *disk, + struct block_device *bdev); extern int revalidate_disk(struct gendisk *); extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *); -- cgit v1.2.3 From 242f9dcb8ba6f68fcd217a119a7648a4f69290e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 14 Sep 2008 05:55:09 -0700 Subject: block: unify request timeout handling Right now SCSI and others do their own command timeout handling. Move those bits to the block layer. Instead of having a timer per command, we try to be a bit more clever and simply have one per-queue. This avoids the overhead of having to tear down and setup a timer for each command, so it will result in a lot less timer fiddling. Signed-off-by: Mike Anderson Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 20 ++++++++++++++++++++ include/scsi/scsi_cmnd.h | 3 --- include/scsi/scsi_host.h | 9 +-------- include/scsi/scsi_transport.h | 3 ++- 4 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9c254926042..067f28b8007 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -147,6 +147,7 @@ struct request { unsigned int cmd_flags; enum rq_cmd_type_bits cmd_type; + unsigned long atomic_flags; /* Maintain bio traversal state for part by part I/O submission. * hard_* are block layer internals, no driver should touch them! @@ -214,6 +215,8 @@ struct request { void *data; void *sense; + unsigned long deadline; + struct list_head timeout_list; unsigned int timeout; int retries; @@ -266,6 +269,14 @@ typedef void (prepare_flush_fn) (struct request_queue *, struct request *); typedef void (softirq_done_fn)(struct request *); typedef int (dma_drain_needed_fn)(struct request *); +enum blk_eh_timer_return { + BLK_EH_NOT_HANDLED, + BLK_EH_HANDLED, + BLK_EH_RESET_TIMER, +}; + +typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *); + enum blk_queue_state { Queue_down, Queue_up, @@ -311,6 +322,7 @@ struct request_queue merge_bvec_fn *merge_bvec_fn; prepare_flush_fn *prepare_flush_fn; softirq_done_fn *softirq_done_fn; + rq_timed_out_fn *rq_timed_out_fn; dma_drain_needed_fn *dma_drain_needed; /* @@ -386,6 +398,10 @@ struct request_queue unsigned int nr_sorted; unsigned int in_flight; + unsigned int rq_timeout; + struct timer_list timeout; + struct list_head timeout_list; + /* * sg stuff */ @@ -770,6 +786,8 @@ extern int blk_end_request_callback(struct request *rq, int error, unsigned int nr_bytes, int (drv_callback)(struct request *)); extern void blk_complete_request(struct request *); +extern void __blk_complete_request(struct request *); +extern void blk_abort_request(struct request *); /* * blk_end_request() takes bytes instead of sectors as a complete size. @@ -811,6 +829,8 @@ extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *); +extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); +extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); extern int blk_do_ordered(struct request_queue *, struct request **); diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index f9f6e793575..855bf95963e 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -75,7 +75,6 @@ struct scsi_cmnd { int retries; int allowed; - int timeout_per_command; unsigned char prot_op; unsigned char prot_type; @@ -86,7 +85,6 @@ struct scsi_cmnd { /* These elements define the operation we are about to perform */ unsigned char *cmnd; - struct timer_list eh_timeout; /* Used to time out the command. */ /* These elements define the operation we ultimately want to perform */ struct scsi_data_buffer sdb; @@ -139,7 +137,6 @@ extern void scsi_put_command(struct scsi_cmnd *); extern void __scsi_put_command(struct Scsi_Host *, struct scsi_cmnd *, struct device *); extern void scsi_finish_command(struct scsi_cmnd *cmd); -extern void scsi_req_abort_cmd(struct scsi_cmnd *cmd); extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 44a55d1bf53..d123ca84e73 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -43,13 +43,6 @@ struct blk_queue_tags; #define DISABLE_CLUSTERING 0 #define ENABLE_CLUSTERING 1 -enum scsi_eh_timer_return { - EH_NOT_HANDLED, - EH_HANDLED, - EH_RESET_TIMER, -}; - - struct scsi_host_template { struct module *module; const char *name; @@ -347,7 +340,7 @@ struct scsi_host_template { * * Status: OPTIONAL */ - enum scsi_eh_timer_return (* eh_timed_out)(struct scsi_cmnd *); + enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *); /* * Name of proc directory diff --git a/include/scsi/scsi_transport.h b/include/scsi/scsi_transport.h index 490bd13a634..0de32cd4e8a 100644 --- a/include/scsi/scsi_transport.h +++ b/include/scsi/scsi_transport.h @@ -21,6 +21,7 @@ #define SCSI_TRANSPORT_H #include +#include #include #include @@ -64,7 +65,7 @@ struct scsi_transport_template { * begin counting again * EH_NOT_HANDLED Begin normal error recovery */ - enum scsi_eh_timer_return (* eh_timed_out)(struct scsi_cmnd *); + enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *); /* * Used as callback for the completion of i_t_nexus request -- cgit v1.2.3 From 11914a53d2ec2974a565311af327b8983d8c820d Mon Sep 17 00:00:00 2001 From: Mike Anderson Date: Sat, 13 Sep 2008 20:31:27 +0200 Subject: block: Add interface to abort queued requests Signed-off-by: Mike Anderson Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + include/linux/blktrace_api.h | 2 ++ include/linux/elevator.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 067f28b8007..37781d6fe04 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -788,6 +788,7 @@ extern int blk_end_request_callback(struct request *rq, int error, extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); +extern void blk_abort_queue(struct request_queue *); /* * blk_end_request() takes bytes instead of sectors as a complete size. diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 27da2cc682e..dcaf2452ed1 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -48,6 +48,7 @@ enum blktrace_act { __BLK_TA_SPLIT, /* bio was split */ __BLK_TA_BOUNCE, /* bio was bounced */ __BLK_TA_REMAP, /* bio was remapped */ + __BLK_TA_ABORT, /* request aborted */ }; /* @@ -78,6 +79,7 @@ enum blktrace_notify { #define BLK_TA_SPLIT (__BLK_TA_SPLIT) #define BLK_TA_BOUNCE (__BLK_TA_BOUNCE) #define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_ABORT (__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE)) #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index bb791c311a5..92f6f634e3e 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -112,6 +112,7 @@ extern struct request *elv_latter_request(struct request_queue *, struct request extern int elv_register_queue(struct request_queue *q); extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, int); +extern void elv_abort_queue(struct request_queue *); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *, struct request *, gfp_t); extern void elv_put_request(struct request_queue *, struct request *); -- cgit v1.2.3 From 3e6053d76dcbd92b2f9f4ad5ece9bce83149523e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 11 Sep 2008 10:57:55 +0200 Subject: block: adjust blkdev_issue_discard for swap Two mods to blkdev_issue_discard(), thinking ahead to its use on swap: 1. Add gfp_mask argument, so swap allocation can use it where GFP_KERNEL might deadlock but GFP_NOIO is safe. 2. Enlarge nr_sects argument from unsigned to sector_t: unsigned long is enough to cover a whole swap area, but sector_t suits any partition. Change sb_issue_discard()'s nr_blocks to sector_t too; but no need seen for a gfp_mask there, just pass GFP_KERNEL down to blkdev_issue_discard(). Signed-off-by: Hugh Dickins Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 37781d6fe04..b47767c72ce 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -873,15 +874,15 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, } extern int blkdev_issue_flush(struct block_device *, sector_t *); -extern int blkdev_issue_discard(struct block_device *, sector_t sector, - unsigned nr_sects); +extern int blkdev_issue_discard(struct block_device *, + sector_t sector, sector_t nr_sects, gfp_t); static inline int sb_issue_discard(struct super_block *sb, - sector_t block, unsigned nr_blocks) + sector_t block, sector_t nr_blocks) { block <<= (sb->s_blocksize_bits - 9); nr_blocks <<= (sb->s_blocksize_bits - 9); - return blkdev_issue_discard(sb->s_bdev, block, nr_blocks); + return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL); } /* -- cgit v1.2.3 From 0a0d96b03a1f3bfd6bc3ea08008699e8e59fccd9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Sep 2008 13:17:37 +0200 Subject: block: add bio_kmalloc() Not all callers need (or want!) the mempool backing guarentee, it essentially means that you can only use bio_alloc() for short allocations and not for preallocating some bio's at setup or init time. So add bio_kmalloc() which does the same thing as bio_alloc(), except it just uses kmalloc() as the backing instead of the bio mempools. Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 7af373f253d..6520ee1a3f6 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -308,6 +308,7 @@ extern struct bio_set *bioset_create(int, int); extern void bioset_free(struct bio_set *); extern struct bio *bio_alloc(gfp_t, int); +extern struct bio *bio_kmalloc(gfp_t, int); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); extern void bio_free(struct bio *, struct bio_set *); -- cgit v1.2.3 From 581d4e28d9195aa8b2231383dbabc288988d615e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 14 Sep 2008 05:56:33 -0700 Subject: block: add fault injection mechanism for faking request timeouts Only works for the generic request timer handling. Allows one to sporadically ignore request completions, thus exercising the timeout handling. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b47767c72ce..e34999d14c1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -440,6 +440,7 @@ struct request_queue #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ +#define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ static inline int queue_is_locked(struct request_queue *q) { -- cgit v1.2.3 From 9c02f2b02e29a2244e36c6e1f246080d8afc6cff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 18 Sep 2008 09:31:53 -0700 Subject: block: cleanup some of the integrity stuff in blkdev.h Don't put functions that are only used in fs/bio-integrity.c in blkdev.h, it's much cleaner to just keep it in there. Also kill completely unused bdev_get_tag_size() Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e34999d14c1..e23b838825b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1004,47 +1004,6 @@ extern int blk_integrity_compare(struct block_device *, struct block_device *); extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); extern int blk_rq_count_integrity_sg(struct request *); -static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) -{ - if (bi) - return bi->tuple_size; - - return 0; -} - -static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev) -{ - return bdev->bd_disk->integrity; -} - -static inline unsigned int bdev_get_tag_size(struct block_device *bdev) -{ - struct blk_integrity *bi = bdev_get_integrity(bdev); - - if (bi) - return bi->tag_size; - - return 0; -} - -static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) -{ - struct blk_integrity *bi = bdev_get_integrity(bdev); - - if (bi == NULL) - return 0; - - if (rw == READ && bi->verify_fn != NULL && - (bi->flags & INTEGRITY_FLAG_READ)) - return 1; - - if (rw == WRITE && bi->generate_fn != NULL && - (bi->flags & INTEGRITY_FLAG_WRITE)) - return 1; - - return 0; -} - static inline int blk_integrity_rq(struct request *rq) { if (rq->bio == NULL) @@ -1058,8 +1017,6 @@ static inline int blk_integrity_rq(struct request *rq) #define blk_integrity_rq(rq) (0) #define blk_rq_count_integrity_sg(a) (0) #define blk_rq_map_integrity_sg(a, b) (0) -#define bdev_get_integrity(a) (0) -#define bdev_get_tag_size(a) (0) #define blk_integrity_compare(a, b) (0) #define blk_integrity_register(a, b) (0) #define blk_integrity_unregister(a) do { } while (0); -- cgit v1.2.3 From 32fab448e5e86694beade415e750363538ea5f49 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 18 Sep 2008 10:45:09 -0400 Subject: block: add request update interface This patch adds blk_update_request(), which updates struct request with completing its data part, but doesn't complete the struct request itself. Though it looks like end_that_request_first() of older kernels, blk_update_request() should be used only by request stacking drivers. Request-based dm will use it in bio->bi_end_io callback to update the original request when a data part of a cloned request completes. Followings are additional background information of why request-based dm needs this interface. - Request stacking drivers can't use blk_end_request() directly from the lower driver's completion context (bio->bi_end_io or rq->end_io), because some device drivers (e.g. ide) may try to complete their request with queue lock held, and it may cause deadlock. See below for detailed description of possible deadlock: - To solve that, request-based dm offloads the completion of cloned struct request to softirq context (i.e. using blk_complete_request() from rq->end_io). - Though it is possible to use the same solution from bio->bi_end_io, it will delay the notification of bio completion to the original submitter. Also, it will cause inefficient partial completion, because the lower driver can't perform the cloned request anymore and request-based dm needs to requeue and redispatch it to the lower driver again later. That's not good. - So request-based dm needs blk_update_request() to perform the bio completion in the lower driver's completion context, which is more efficient. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e23b838825b..e82a84c9f37 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -791,6 +791,8 @@ extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); extern void blk_abort_queue(struct request_queue *); +extern void blk_update_request(struct request *rq, int error, + unsigned int nr_bytes); /* * blk_end_request() takes bytes instead of sectors as a complete size. -- cgit v1.2.3 From 82124d60354846623a4b94af335717a5e142a074 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 18 Sep 2008 10:45:38 -0400 Subject: block: add request submission interface This patch adds blk_insert_cloned_request(), a generic request submission interface for request stacking drivers. Request-based dm will use it to submit their clones to underlying devices. blk_rq_check_limits() is also added because it is possible that the lower queue has stronger limitations than the upper queue if multiple drivers are stacking at request-level. Not only for blk_insert_cloned_request()'s internal use, the function will be used by request-based dm when the queue limitation is modified (e.g. by replacing dm's table). Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e82a84c9f37..964c246bc27 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -693,6 +693,9 @@ extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); extern void blk_insert_request(struct request_queue *, struct request *, int, void *); extern void blk_requeue_request(struct request_queue *, struct request *); +extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); +extern int blk_insert_cloned_request(struct request_queue *q, + struct request *rq); extern void blk_plug_device(struct request_queue *); extern void blk_plug_device_unlocked(struct request_queue *); extern int blk_remove_plug(struct request_queue *); -- cgit v1.2.3 From 4ee5eaf4516a60f8ef64d3c246c64c6be0cf8c3a Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 18 Sep 2008 10:46:13 -0400 Subject: block: add a queue flag for request stacking support This patch adds a queue flag to indicate the block device can be used for request stacking. Request stacking drivers need to stack their devices on top of only devices of which q->request_fn is functional. Since bio stacking drivers (e.g. md, loop) basically initialize their queue using blk_alloc_queue() and don't set q->request_fn, the check of (q->request_fn == NULL) looks enough for that purpose. However, dm will become both types of stacking driver (bio-based and request-based). And dm will always set q->request_fn even if the dm device is bio-based of which q->request_fn is not functional actually. So we need something else to distinguish the type of the device. Adding a queue flag is a solution for that. The reason why dm always sets q->request_fn is to keep the compatibility of dm user-space tools. Currently, all dm user-space tools are using bio-based dm without specifying the type of the dm device they use. To use request-based dm without changing such tools, the kernel must decide the type of the dm device automatically. The automatic type decision can't be done at the device creation time and needs to be deferred until such tools load a mapping table, since the actual type is decided by dm target type included in the mapping table. So a dm device has to be initialized using blk_init_queue() so that we can load either type of table. Then, all queue stuffs are set (e.g. q->request_fn) and we have no element to distinguish that it is bio-based or request-based, even after a table is loaded and the type of the device is decided. By the way, some stuffs of the queue (e.g. request_list, elevator) are needless when the dm device is used as bio-based. But the memory size is not so large (about 20[KB] per queue on ia64), so I hope the memory loss can be acceptable for bio-based dm users. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 964c246bc27..86f77ef127f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -441,6 +441,7 @@ struct request_queue #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ #define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ +#define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ static inline int queue_is_locked(struct request_queue *q) { @@ -547,6 +548,8 @@ enum { #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_flushing(q) ((q)->ordseq) +#define blk_queue_stackable(q) \ + test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC) -- cgit v1.2.3 From 9e49184c82e9ec3ab4d45f9ea5a17ccaf43869f0 Mon Sep 17 00:00:00 2001 From: Keith Wansbrough Date: Mon, 22 Sep 2008 14:57:17 -0700 Subject: floppy: support arbitrary first-sector numbers The current floppy_struct allows floppies to number sectors starting from 0 or 1. This patch allows arbitrary first-sector numbers - for example, 0xC1 for Amstrad CPC disks. This extends the existing 1-bit field (FD_ZEROBASED, bit 2 of stretch) to 8 bits (FD_SECTMASK, bits 2 to 9). Currently 0x00 denotes a first sector number of 1, and 0x01 denotes a first sector number of 0. We extend this by interpreting FD_SECTMASK as the first sector number with the LSB flipped. Signed-off-by: Keith Wansbrough Cc: Alain Knaff Cc: Michael Kerrisk Cc: Karel Zak Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- include/linux/fd.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fd.h b/include/linux/fd.h index b6bd41d2b46..f5d194af07a 100644 --- a/include/linux/fd.h +++ b/include/linux/fd.h @@ -15,10 +15,16 @@ struct floppy_struct { sect, /* sectors per track */ head, /* nr of heads */ track, /* nr of tracks */ - stretch; /* !=0 means double track steps */ + stretch; /* bit 0 !=0 means double track steps */ + /* bit 1 != 0 means swap sides */ + /* bits 2..9 give the first sector */ + /* number (the LSB is flipped) */ #define FD_STRETCH 1 #define FD_SWAPSIDES 2 #define FD_ZEROBASED 4 +#define FD_SECTBASEMASK 0x3FC +#define FD_MKSECTBASE(s) (((s) ^ 1) << 2) +#define FD_SECTBASE(floppy) ((((floppy)->stretch & FD_SECTBASEMASK) >> 2) ^ 1) unsigned char gap, /* gap1 size */ -- cgit v1.2.3 From a68bbddba486020c9c74825ce90c4c1ec463e0e8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Sep 2008 13:03:33 +0200 Subject: block: add queue flag for SSD/non-rotational devices We don't want to idle in AS/CFQ if the device doesn't have a seek penalty. So add a QUEUE_FLAG_NONROT to indicate a non-rotational device, low level drivers should set this flag upon discovery of an SSD or similar device type. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 86f77ef127f..0cf3e619fb2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -442,6 +442,7 @@ struct request_queue #define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ #define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ #define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ +#define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ static inline int queue_is_locked(struct request_queue *q) { @@ -547,6 +548,7 @@ enum { #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) +#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_flushing(q) ((q)->ordseq) #define blk_queue_stackable(q) \ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) -- cgit v1.2.3 From 8bff7c6b0f63c7ee9c5e3a076338d74125b8debb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Sep 2008 13:05:10 +0200 Subject: libata: set queue SSD flag for SSD devices SSD devices should give an RPM setting of 1 in word 217 of the ID page. If we see such a device, tell the block layer about it. Signed-off-by: Jens Axboe --- include/linux/ata.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/ata.h b/include/linux/ata.h index 8a12d718c16..c1c8b4a4ba2 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -88,6 +88,7 @@ enum { ATA_ID_DLF = 128, ATA_ID_CSFO = 129, ATA_ID_CFA_POWER = 160, + ATA_ID_ROT_SPEED = 217, ATA_ID_PIO4 = (1 << 1), ATA_ID_SERNO_LEN = 20, @@ -691,6 +692,11 @@ static inline int ata_id_is_cfa(const u16 *id) return 0; } +static inline int ata_id_is_ssd(const u16 *id) +{ + return id[ATA_ID_ROT_SPEED] == 0x01; +} + static inline int ata_drive_40wire(const u16 *dev_id) { if (ata_id_is_sata(dev_id)) -- cgit v1.2.3 From c0ddffa84a7d12da9943a94d04dadbfb1883b904 Mon Sep 17 00:00:00 2001 From: Sven Schuetz Date: Fri, 26 Sep 2008 10:58:02 +0200 Subject: include blktrace_api.h in headers_install This header file is of interest for user space programming, i.e. for tools that process blktrace data. We would like to use it for a tool on-top of blktrace which processes data provided by blktrace. For this purpose, it would be helpful if the blktrace API would make it to /usr/include/linux. The git tree for the blktrace tools comes with its own copy of this header file. I didn't manage to replace that copy with the file generated by the patch below yet. A few more cleanups would be needed. For example, the blktrace ioctl numbers, which are currently defined in usr/include/fs.h, might need to be moved. Should be feasible, though. Signed-off-by: Sven Schuetz Signed-off-by: Martin Peschke Signed-off-by: Jens Axboe --- include/linux/Kbuild | 1 + include/linux/blktrace_api.h | 58 ++++++++++++++++++++++++-------------------- 2 files changed, 33 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index b68ec09399b..31474e89c59 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -180,6 +180,7 @@ unifdef-y += audit.h unifdef-y += auto_fs.h unifdef-y += auxvec.h unifdef-y += binfmts.h +unifdef-y += blktrace_api.h unifdef-y += capability.h unifdef-y += capi.h unifdef-y += cciss_ioctl.h diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index dcaf2452ed1..a2a7d0ca275 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -1,8 +1,10 @@ #ifndef BLKTRACE_H #define BLKTRACE_H +#ifdef __KERNEL__ #include #include +#endif /* * Trace categories @@ -92,17 +94,17 @@ enum blktrace_notify { * The trace itself */ struct blk_io_trace { - u32 magic; /* MAGIC << 8 | version */ - u32 sequence; /* event number */ - u64 time; /* in microseconds */ - u64 sector; /* disk offset */ - u32 bytes; /* transfer length */ - u32 action; /* what happened */ - u32 pid; /* who did it */ - u32 device; /* device number */ - u32 cpu; /* on what cpu did it happen */ - u16 error; /* completion error */ - u16 pdu_len; /* length of data after this trace */ + __u32 magic; /* MAGIC << 8 | version */ + __u32 sequence; /* event number */ + __u64 time; /* in microseconds */ + __u64 sector; /* disk offset */ + __u32 bytes; /* transfer length */ + __u32 action; /* what happened */ + __u32 pid; /* who did it */ + __u32 device; /* device number */ + __u32 cpu; /* on what cpu did it happen */ + __u16 error; /* completion error */ + __u16 pdu_len; /* length of data after this trace */ }; /* @@ -120,6 +122,25 @@ enum { Blktrace_stopped, }; +/* + * User setup structure passed with BLKTRACESTART + */ +struct blk_user_trace_setup { +#ifdef __KERNEL__ + char name[BDEVNAME_SIZE]; /* output */ +#else + char name[32]; /* output */ +#endif + __u16 act_mask; /* input */ + __u32 buf_size; /* input */ + __u32 buf_nr; /* input */ + __u64 start_lba; + __u64 end_lba; + __u32 pid; +}; + +#ifdef __KERNEL__ +#if defined(CONFIG_BLK_DEV_IO_TRACE) struct blk_trace { int trace_state; struct rchan *rchan; @@ -136,21 +157,6 @@ struct blk_trace { atomic_t dropped; }; -/* - * User setup structure passed with BLKTRACESTART - */ -struct blk_user_trace_setup { - char name[BDEVNAME_SIZE]; /* output */ - u16 act_mask; /* input */ - u32 buf_size; /* input */ - u32 buf_nr; /* input */ - u64 start_lba; - u64 end_lba; - u32 pid; -}; - -#ifdef __KERNEL__ -#if defined(CONFIG_BLK_DEV_IO_TRACE) extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); -- cgit v1.2.3 From ef9e3facdf1fe1228721a7c295a76d1b7a0e57ec Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Wed, 1 Oct 2008 16:12:15 +0200 Subject: block: add lld busy state exporting interface This patch adds an new interface, blk_lld_busy(), to check lld's busy state from the block layer. blk_lld_busy() calls down into low-level drivers for the checking if the drivers set q->lld_busy_fn() using blk_queue_lld_busy(). This resolves a performance problem on request stacking devices below. Some drivers like scsi mid layer stop dispatching request when they detect busy state on its low-level device like host/target/device. It allows other requests to stay in the I/O scheduler's queue for a chance of merging. Request stacking drivers like request-based dm should follow the same logic. However, there is no generic interface for the stacked device to check if the underlying device(s) are busy. If the request stacking driver dispatches and submits requests to the busy underlying device, the requests will stay in the underlying device's queue without a chance of merging. This causes performance problem on burst I/O load. With this patch, busy state of the underlying device is exported via q->lld_busy_fn(). So the request stacking driver can check it and stop dispatching requests if busy. The underlying device driver must return the busy state appropriately: 1: when the device driver can't process requests immediately. 0: when the device driver can process requests immediately, including abnormal situations where the device driver needs to kill all requests. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Andrew Morton Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0cf3e619fb2..9e0ee1a8254 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -269,6 +269,7 @@ typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *, typedef void (prepare_flush_fn) (struct request_queue *, struct request *); typedef void (softirq_done_fn)(struct request *); typedef int (dma_drain_needed_fn)(struct request *); +typedef int (lld_busy_fn) (struct request_queue *q); enum blk_eh_timer_return { BLK_EH_NOT_HANDLED, @@ -325,6 +326,7 @@ struct request_queue softirq_done_fn *softirq_done_fn; rq_timed_out_fn *rq_timed_out_fn; dma_drain_needed_fn *dma_drain_needed; + lld_busy_fn *lld_busy_fn; /* * Dispatch queue sorting @@ -699,6 +701,7 @@ extern struct request *blk_get_request(struct request_queue *, int, gfp_t); extern void blk_insert_request(struct request_queue *, struct request *, int, void *); extern void blk_requeue_request(struct request_queue *, struct request *); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); +extern int blk_lld_busy(struct request_queue *q); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern void blk_plug_device(struct request_queue *); @@ -835,6 +838,7 @@ extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size); +extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn); extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); -- cgit v1.2.3 From 0497b345e7d067109e0dd9bf9f4978a6847ee13b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Oct 2008 16:16:25 +0200 Subject: blktrace: use BLKTRACE_BDEV_SIZE as the name size for setup structure Define as 32, which is is what BDEVNAME_SIZE is/was as well. This keeps the user interface the same and gets rid of the difference between kernel and user api here. Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index a2a7d0ca275..3a31eb50616 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -122,15 +122,13 @@ enum { Blktrace_stopped, }; +#define BLKTRACE_BDEV_SIZE 32 + /* * User setup structure passed with BLKTRACESTART */ struct blk_user_trace_setup { -#ifdef __KERNEL__ - char name[BDEVNAME_SIZE]; /* output */ -#else - char name[32]; /* output */ -#endif + char name[BLKTRACE_BDEV_SIZE]; /* output */ __u16 act_mask; /* input */ __u32 buf_size; /* input */ __u32 buf_nr; /* input */ -- cgit v1.2.3 From d00e29fd99dd63d1c51917604e35dee824ed567f Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Wed, 1 Oct 2008 10:14:46 -0400 Subject: block: remove end_{queued|dequeued}_request() This patch removes end_queued_request() and end_dequeued_request(), which are no longer used. As a results, users of __end_request() became only end_request(). So the actual code in __end_request() is moved to end_request() and __end_request() is removed. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9e0ee1a8254..bfc18e497c7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -793,8 +793,6 @@ extern int __blk_end_request(struct request *rq, int error, extern int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); extern void end_request(struct request *, int); -extern void end_queued_request(struct request *, int); -extern void end_dequeued_request(struct request *, int); extern int blk_end_request_callback(struct request *rq, int error, unsigned int nr_bytes, int (drv_callback)(struct request *)); -- cgit v1.2.3 From 8deaf7210728c453295dc1cb2a5b66c68183ac85 Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Thu, 2 Oct 2008 12:46:53 +0200 Subject: bio.h: Remove unused conditional code The whole bio_integrity() definition is inside an #ifdef CONFIG_BLK_DEV_INTEGRITY, there's no need for the conditional code. Signed-off-by: Alberto Bertogli Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 6520ee1a3f6..98c2d057065 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -457,14 +457,7 @@ static inline int bio_has_data(struct bio *bio) #define bip_for_each_vec(bvl, bip, i) \ __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) -static inline int bio_integrity(struct bio *bio) -{ -#if defined(CONFIG_BLK_DEV_INTEGRITY) - return bio->bi_integrity != NULL; -#else - return 0; -#endif -} +#define bio_integrity(bio) (bio->bi_integrity != NULL) extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); -- cgit v1.2.3 From b04accc425d52ca59699290661e0dfd09b0feeeb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 2 Oct 2008 12:53:22 +0200 Subject: block: revert part of d7533ad0e132f92e75c1b2eb7c26387b25a583c1 We need bdev_get_integrity() to support the pending md/dm patches. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bfc18e497c7..bc693f5c388 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1016,6 +1016,12 @@ extern int blk_integrity_compare(struct block_device *, struct block_device *); extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); extern int blk_rq_count_integrity_sg(struct request *); +static inline +struct blk_integrity *bdev_get_integrity(struct block_device *bdev) +{ + return bdev->bd_disk->integrity; +} + static inline int blk_integrity_rq(struct request *rq) { if (rq->bio == NULL) @@ -1029,6 +1035,7 @@ static inline int blk_integrity_rq(struct request *rq) #define blk_integrity_rq(rq) (0) #define blk_rq_count_integrity_sg(a) (0) #define blk_rq_map_integrity_sg(a, b) (0) +#define bdev_get_integrity(a) (0) #define blk_integrity_compare(a, b) (0) #define blk_integrity_register(a, b) (0) #define blk_integrity_unregister(a) do { } while (0); -- cgit v1.2.3 From 74aa8c2cc010035a7eef2b4ca4d6430e0dae206a Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Oct 2008 03:38:37 -0400 Subject: block: Introduce integrity data ownership flag A filesystem might supply its own integrity metadata. Introduce a flag that indicates whether the filesystem or the block layer owns the integrity buffer. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 98c2d057065..d86d39d490e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -109,6 +109,7 @@ struct bio { #define BIO_EOPNOTSUPP 7 /* not supported */ #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ +#define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* -- cgit v1.2.3 From ad7fce93147d32ae53d25d9ea1a8ba31a239deee Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Oct 2008 03:38:39 -0400 Subject: block: Switch blk_integrity_compare from bdev to gendisk The DM and MD integrity support now depends on being able to use gendisks instead of block_devices when comparing integrity profiles. Change function parameters accordingly. Also update comparison logic so that two NULL profiles are a valid configuration. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc693f5c388..00d340b0f75 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1012,7 +1012,7 @@ struct blk_integrity { extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); extern void blk_integrity_unregister(struct gendisk *); -extern int blk_integrity_compare(struct block_device *, struct block_device *); +extern int blk_integrity_compare(struct gendisk *, struct gendisk *); extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); extern int blk_rq_count_integrity_sg(struct request *); -- cgit v1.2.3 From b02739b01c5309d74a59859f2ce92c931d1f1955 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 2 Oct 2008 18:47:49 +0200 Subject: block: gendisk integrity wrapper This is a wrapper for accessing a gendisk's integrity bits. It allows the integrity support in MD to be compiled with BLK_DEV_INTEGRITY off. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 00d340b0f75..a92d9e4ea96 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1022,6 +1022,11 @@ struct blk_integrity *bdev_get_integrity(struct block_device *bdev) return bdev->bd_disk->integrity; } +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +{ + return disk->integrity; +} + static inline int blk_integrity_rq(struct request *rq) { if (rq->bio == NULL) @@ -1036,6 +1041,7 @@ static inline int blk_integrity_rq(struct request *rq) #define blk_rq_count_integrity_sg(a) (0) #define blk_rq_map_integrity_sg(a, b) (0) #define bdev_get_integrity(a) (0) +#define blk_get_integrity(a) (0) #define blk_integrity_compare(a, b) (0) #define blk_integrity_register(a, b) (0) #define blk_integrity_unregister(a) do { } while (0); -- cgit v1.2.3 From ad3316bf4eeb53c89164f759767f911072b56203 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Oct 2008 22:42:53 -0400 Subject: block: Find bio sector offset given idx and offset Helper function to find the sector offset in a bio given bvec index and page offset. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index d86d39d490e..fe12d0f9eba 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -327,6 +327,7 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); extern int bio_get_nr_vecs(struct block_device *); +extern sector_t bio_sector_offset(struct bio *, unsigned short, unsigned int); extern struct bio *bio_map_user(struct request_queue *, struct block_device *, unsigned long, unsigned int, int, gfp_t); struct sg_iovec; -- cgit v1.2.3 From 6feef531f55cf4a20fd9eb39f5352e5745203603 Mon Sep 17 00:00:00 2001 From: Denis ChengRq Date: Thu, 9 Oct 2008 08:57:05 +0200 Subject: block: mark bio_split_pool static Since all bio_split calls refer the same single bio_split_pool, the bio_split function can use bio_split_pool directly instead of the mempool_t parameter; then the mempool_t parameter can be removed from bio_split param list, and bio_split_pool is only referred in fs/bio.c file, can be marked static. Signed-off-by: Denis ChengRq Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index fe12d0f9eba..fb97221d7c3 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -300,9 +300,7 @@ struct bio_pair { atomic_t cnt; int error; }; -extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, - int first_sectors); -extern mempool_t *bio_split_pool; +extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); extern void bio_pair_release(struct bio_pair *dbio); extern struct bio_set *bioset_create(int, int); -- cgit v1.2.3 From af5639424008ffe96f89b059bea1aec15e0115a9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Oct 2008 09:01:10 +0200 Subject: block: add some comments around the bio read-write flags Signed-off-by: Jens Axboe --- include/linux/bio.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index fb97221d7c3..ff5b4cf9e2d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -123,13 +123,23 @@ struct bio { /* * bio bi_rw flags * - * bit 0 -- read (not set) or write (set) + * bit 0 -- data direction + * If not set, bio is a read from device. If set, it's a write to device. * bit 1 -- rw-ahead when set * bit 2 -- barrier + * Insert a serialization point in the IO queue, forcing previously + * submitted IO to be completed before this oen is issued. * bit 3 -- fail fast, don't want low level driver retries * bit 4 -- synchronous I/O hint: the block layer will unplug immediately + * Note that this does NOT indicate that the IO itself is sync, just + * that the block layer will not postpone issue of this IO by plugging. * bit 5 -- metadata request + * Used for tracing to differentiate metadata and data IO. May also + * get some preferential treatment in the IO scheduler * bit 6 -- discard sectors + * Informs the lower level device that this range of sectors is no longer + * used by the file system and may thus be freed by the device. Used + * for flash based storage. */ #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ -- cgit v1.2.3