aboutsummaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorJames Morris <jmorris@namei.org>2009-05-08 17:56:47 +1000
committerJames Morris <jmorris@namei.org>2009-05-08 17:56:47 +1000
commitd254117099d711f215e62427f55dfb8ebd5ad011 (patch)
tree0848ff8dd74314fec14a86497f8d288c86ba7c65 /block
parent07ff7a0b187f3951788f64ae1f30e8109bc8e9eb (diff)
parent8c9ed899b44c19e81859fbb0e9d659fe2f8630fc (diff)
Merge branch 'master' into next
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig16
-rw-r--r--block/Makefile1
-rw-r--r--block/as-iosched.c116
-rw-r--r--block/blk-barrier.c3
-rw-r--r--block/blk-core.c107
-rw-r--r--block/blk-merge.c32
-rw-r--r--block/blk-settings.c22
-rw-r--r--block/blk-softirq.c2
-rw-r--r--block/blk-sysfs.c40
-rw-r--r--block/blk-timeout.c13
-rw-r--r--block/blk.h13
-rw-r--r--block/blktrace.c860
-rw-r--r--block/cfq-iosched.c492
-rw-r--r--block/elevator.c44
-rw-r--r--block/genhd.c12
-rw-r--r--block/ioctl.c2
-rw-r--r--block/scsi_ioctl.c19
17 files changed, 592 insertions, 1202 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 0cbb3b88b59..e7d12782bcf 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -44,22 +44,6 @@ config LBD
If unsure, say N.
-config BLK_DEV_IO_TRACE
- bool "Support for tracing block io actions"
- depends on SYSFS
- select RELAY
- select DEBUG_FS
- select TRACEPOINTS
- help
- Say Y here if you want to be able to trace the block layer actions
- on a given queue. Tracing allows you to see any traffic happening
- on a block device queue. For more information (and the userspace
- support tools needed), fetch the blktrace tools from:
-
- git://git.kernel.dk/blktrace.git
-
- If unsure, say N.
-
config BLK_DEV_BSG
bool "Block layer SG support v4 (EXPERIMENTAL)"
depends on EXPERIMENTAL
diff --git a/block/Makefile b/block/Makefile
index bfe73049f93..e9fa4dd690f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -13,6 +13,5 @@ obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
-obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 631f6f44460..c48fa670d22 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -17,9 +17,6 @@
#include <linux/rbtree.h>
#include <linux/interrupt.h>
-#define REQ_SYNC 1
-#define REQ_ASYNC 0
-
/*
* See Documentation/block/as-iosched.txt
*/
@@ -93,7 +90,7 @@ struct as_data {
struct list_head fifo_list[2];
struct request *next_rq[2]; /* next in sort order */
- sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */
+ sector_t last_sector[2]; /* last SYNC & ASYNC sectors */
unsigned long exit_prob; /* probability a task will exit while
being waited on */
@@ -109,7 +106,7 @@ struct as_data {
unsigned long last_check_fifo[2];
int changed_batch; /* 1: waiting for old batch to end */
int new_batch; /* 1: waiting on first read complete */
- int batch_data_dir; /* current batch REQ_SYNC / REQ_ASYNC */
+ int batch_data_dir; /* current batch SYNC / ASYNC */
int write_batch_count; /* max # of reqs in a write batch */
int current_write_count; /* how many requests left this batch */
int write_batch_idled; /* has the write batch gone idle? */
@@ -554,7 +551,7 @@ static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
if (aic == NULL)
return;
- if (data_dir == REQ_SYNC) {
+ if (data_dir == BLK_RW_SYNC) {
unsigned long in_flight = atomic_read(&aic->nr_queued)
+ atomic_read(&aic->nr_dispatched);
spin_lock(&aic->lock);
@@ -811,7 +808,7 @@ static void as_update_rq(struct as_data *ad, struct request *rq)
*/
static void update_write_batch(struct as_data *ad)
{
- unsigned long batch = ad->batch_expire[REQ_ASYNC];
+ unsigned long batch = ad->batch_expire[BLK_RW_ASYNC];
long write_time;
write_time = (jiffies - ad->current_batch_expires) + batch;
@@ -855,7 +852,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
kblockd_schedule_work(q, &ad->antic_work);
ad->changed_batch = 0;
- if (ad->batch_data_dir == REQ_SYNC)
+ if (ad->batch_data_dir == BLK_RW_SYNC)
ad->new_batch = 1;
}
WARN_ON(ad->nr_dispatched == 0);
@@ -869,7 +866,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) {
update_write_batch(ad);
ad->current_batch_expires = jiffies +
- ad->batch_expire[REQ_SYNC];
+ ad->batch_expire[BLK_RW_SYNC];
ad->new_batch = 0;
}
@@ -960,7 +957,7 @@ static inline int as_batch_expired(struct as_data *ad)
if (ad->changed_batch || ad->new_batch)
return 0;
- if (ad->batch_data_dir == REQ_SYNC)
+ if (ad->batch_data_dir == BLK_RW_SYNC)
/* TODO! add a check so a complete fifo gets written? */
return time_after(jiffies, ad->current_batch_expires);
@@ -986,7 +983,7 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
*/
ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;
- if (data_dir == REQ_SYNC) {
+ if (data_dir == BLK_RW_SYNC) {
struct io_context *ioc = RQ_IOC(rq);
/* In case we have to anticipate after this */
copy_io_context(&ad->io_context, &ioc);
@@ -1025,41 +1022,41 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
static int as_dispatch_request(struct request_queue *q, int force)
{
struct as_data *ad = q->elevator->elevator_data;
- const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
- const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);
+ const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]);
+ const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]);
struct request *rq;
if (unlikely(force)) {
/*
* Forced dispatch, accounting is useless. Reset
* accounting states and dump fifo_lists. Note that
- * batch_data_dir is reset to REQ_SYNC to avoid
+ * batch_data_dir is reset to BLK_RW_SYNC to avoid
* screwing write batch accounting as write batch
* accounting occurs on W->R transition.
*/
int dispatched = 0;
- ad->batch_data_dir = REQ_SYNC;
+ ad->batch_data_dir = BLK_RW_SYNC;
ad->changed_batch = 0;
ad->new_batch = 0;
- while (ad->next_rq[REQ_SYNC]) {
- as_move_to_dispatch(ad, ad->next_rq[REQ_SYNC]);
+ while (ad->next_rq[BLK_RW_SYNC]) {
+ as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]);
dispatched++;
}
- ad->last_check_fifo[REQ_SYNC] = jiffies;
+ ad->last_check_fifo[BLK_RW_SYNC] = jiffies;
- while (ad->next_rq[REQ_ASYNC]) {
- as_move_to_dispatch(ad, ad->next_rq[REQ_ASYNC]);
+ while (ad->next_rq[BLK_RW_ASYNC]) {
+ as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]);
dispatched++;
}
- ad->last_check_fifo[REQ_ASYNC] = jiffies;
+ ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
return dispatched;
}
/* Signal that the write batch was uncontended, so we can't time it */
- if (ad->batch_data_dir == REQ_ASYNC && !reads) {
+ if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) {
if (ad->current_write_count == 0 || !writes)
ad->write_batch_idled = 1;
}
@@ -1076,8 +1073,8 @@ static int as_dispatch_request(struct request_queue *q, int force)
*/
rq = ad->next_rq[ad->batch_data_dir];
- if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) {
- if (as_fifo_expired(ad, REQ_SYNC))
+ if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) {
+ if (as_fifo_expired(ad, BLK_RW_SYNC))
goto fifo_expired;
if (as_can_anticipate(ad, rq)) {
@@ -1090,7 +1087,7 @@ static int as_dispatch_request(struct request_queue *q, int force)
/* we have a "next request" */
if (reads && !writes)
ad->current_batch_expires =
- jiffies + ad->batch_expire[REQ_SYNC];
+ jiffies + ad->batch_expire[BLK_RW_SYNC];
goto dispatch_request;
}
}
@@ -1101,20 +1098,20 @@ static int as_dispatch_request(struct request_queue *q, int force)
*/
if (reads) {
- BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_SYNC]));
+ BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC]));
- if (writes && ad->batch_data_dir == REQ_SYNC)
+ if (writes && ad->batch_data_dir == BLK_RW_SYNC)
/*
* Last batch was a read, switch to writes
*/
goto dispatch_writes;
- if (ad->batch_data_dir == REQ_ASYNC) {
+ if (ad->batch_data_dir == BLK_RW_ASYNC) {
WARN_ON(ad->new_batch);
ad->changed_batch = 1;
}
- ad->batch_data_dir = REQ_SYNC;
- rq = rq_entry_fifo(ad->fifo_list[REQ_SYNC].next);
+ ad->batch_data_dir = BLK_RW_SYNC;
+ rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next);
ad->last_check_fifo[ad->batch_data_dir] = jiffies;
goto dispatch_request;
}
@@ -1125,9 +1122,9 @@ static int as_dispatch_request(struct request_queue *q, int force)
if (writes) {
dispatch_writes:
- BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_ASYNC]));
+ BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC]));
- if (ad->batch_data_dir == REQ_SYNC) {
+ if (ad->batch_data_dir == BLK_RW_SYNC) {
ad->changed_batch = 1;
/*
@@ -1137,11 +1134,11 @@ dispatch_writes:
*/
ad->new_batch = 0;
}
- ad->batch_data_dir = REQ_ASYNC;
+ ad->batch_data_dir = BLK_RW_ASYNC;
ad->current_write_count = ad->write_batch_count;
ad->write_batch_idled = 0;
- rq = rq_entry_fifo(ad->fifo_list[REQ_ASYNC].next);
- ad->last_check_fifo[REQ_ASYNC] = jiffies;
+ rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next);
+ ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
goto dispatch_request;
}
@@ -1164,9 +1161,9 @@ fifo_expired:
if (ad->nr_dispatched)
return 0;
- if (ad->batch_data_dir == REQ_ASYNC)
+ if (ad->batch_data_dir == BLK_RW_ASYNC)
ad->current_batch_expires = jiffies +
- ad->batch_expire[REQ_ASYNC];
+ ad->batch_expire[BLK_RW_ASYNC];
else
ad->new_batch = 1;
@@ -1238,8 +1235,8 @@ static int as_queue_empty(struct request_queue *q)
{
struct as_data *ad = q->elevator->elevator_data;
- return list_empty(&ad->fifo_list[REQ_ASYNC])
- && list_empty(&ad->fifo_list[REQ_SYNC]);
+ return list_empty(&ad->fifo_list[BLK_RW_ASYNC])
+ && list_empty(&ad->fifo_list[BLK_RW_SYNC]);
}
static int
@@ -1346,8 +1343,8 @@ static void as_exit_queue(struct elevator_queue *e)
del_timer_sync(&ad->antic_timer);
cancel_work_sync(&ad->antic_work);
- BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
- BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
+ BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC]));
+ BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC]));
put_io_context(ad->io_context);
kfree(ad);
@@ -1372,18 +1369,18 @@ static void *as_init_queue(struct request_queue *q)
init_timer(&ad->antic_timer);
INIT_WORK(&ad->antic_work, as_work_handler);
- INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]);
- INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]);
- ad->sort_list[REQ_SYNC] = RB_ROOT;
- ad->sort_list[REQ_ASYNC] = RB_ROOT;
- ad->fifo_expire[REQ_SYNC] = default_read_expire;
- ad->fifo_expire[REQ_ASYNC] = default_write_expire;
+ INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]);
+ INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]);
+ ad->sort_list[BLK_RW_SYNC] = RB_ROOT;
+ ad->sort_list[BLK_RW_ASYNC] = RB_ROOT;
+ ad->fifo_expire[BLK_RW_SYNC] = default_read_expire;
+ ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire;
ad->antic_expire = default_antic_expire;
- ad->batch_expire[REQ_SYNC] = default_read_batch_expire;
- ad->batch_expire[REQ_ASYNC] = default_write_batch_expire;
+ ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire;
+ ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire;
- ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC];
- ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10;
+ ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC];
+ ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10;
if (ad->write_batch_count < 2)
ad->write_batch_count = 2;
@@ -1432,11 +1429,11 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \
struct as_data *ad = e->elevator_data; \
return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
}
-SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]);
-SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]);
+SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]);
+SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]);
SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
-SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]);
-SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]);
+SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]);
+SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
@@ -1451,13 +1448,14 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
*(__PTR) = msecs_to_jiffies(*(__PTR)); \
return ret; \
}
-STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
-STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
+STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX);
+STORE_FUNCTION(as_write_expire_store,
+ &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX);
STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
STORE_FUNCTION(as_read_batch_expire_store,
- &ad->batch_expire[REQ_SYNC], 0, INT_MAX);
+ &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX);
STORE_FUNCTION(as_write_batch_expire_store,
- &ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
+ &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX);
#undef STORE_FUNCTION
#define AS_ATTR(name) \
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index f7dae57e6ca..20b4111fa05 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -319,9 +319,6 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
return -ENXIO;
bio = bio_alloc(GFP_KERNEL, 0);
- if (!bio)
- return -ENOMEM;
-
bio->bi_end_io = bio_end_empty_barrier;
bio->bi_private = &wait;
bio->bi_bdev = bdev;
diff --git a/block/blk-core.c b/block/blk-core.c
index 996ed906d8c..2998fe3a237 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,12 +64,11 @@ static struct workqueue_struct *kblockd_workqueue;
static void drive_stat_acct(struct request *rq, int new_io)
{
- struct gendisk *disk = rq->rq_disk;
struct hd_struct *part;
int rw = rq_data_dir(rq);
int cpu;
- if (!blk_fs_request(rq) || !disk || !blk_do_io_stat(disk->queue))
+ if (!blk_fs_request(rq) || !blk_do_io_stat(rq))
return;
cpu = part_stat_lock();
@@ -132,6 +131,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->cmd = rq->__cmd;
+ rq->cmd_len = BLK_MAX_CDB;
rq->tag = -1;
rq->ref_count = 1;
}
@@ -484,11 +484,11 @@ static int blk_init_free_list(struct request_queue *q)
{
struct request_list *rl = &q->rq;
- rl->count[READ] = rl->count[WRITE] = 0;
- rl->starved[READ] = rl->starved[WRITE] = 0;
+ rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
+ rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
rl->elvpriv = 0;
- init_waitqueue_head(&rl->wait[READ]);
- init_waitqueue_head(&rl->wait[WRITE]);
+ init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
+ init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
mempool_free_slab, request_cachep, q->node);
@@ -643,7 +643,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq)
}
static struct request *
-blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
+blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
{
struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
@@ -652,7 +652,7 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
blk_rq_init(q, rq);
- rq->cmd_flags = rw | REQ_ALLOCED;
+ rq->cmd_flags = flags | REQ_ALLOCED;
if (priv) {
if (unlikely(elv_set_request(q, rq, gfp_mask))) {
@@ -699,18 +699,18 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
ioc->last_waited = jiffies;
}
-static void __freed_request(struct request_queue *q, int rw)
+static void __freed_request(struct request_queue *q, int sync)
{
struct request_list *rl = &q->rq;
- if (rl->count[rw] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, rw);
+ if (rl->count[sync] < queue_congestion_off_threshold(q))
+ blk_clear_queue_congested(q, sync);
- if (rl->count[rw] + 1 <= q->nr_requests) {
- if (waitqueue_active(&rl->wait[rw]))
- wake_up(&rl->wait[rw]);
+ if (rl->count[sync] + 1 <= q->nr_requests) {
+ if (waitqueue_active(&rl->wait[sync]))
+ wake_up(&rl->wait[sync]);
- blk_clear_queue_full(q, rw);
+ blk_clear_queue_full(q, sync);
}
}
@@ -718,18 +718,18 @@ static void __freed_request(struct request_queue *q, int rw)
* A request has just been released. Account for it, update the full and
* congestion status, wake up any waiters. Called under q->queue_lock.
*/
-static void freed_request(struct request_queue *q, int rw, int priv)
+static void freed_request(struct request_queue *q, int sync, int priv)
{
struct request_list *rl = &q->rq;
- rl->count[rw]--;
+ rl->count[sync]--;
if (priv)
rl->elvpriv--;
- __freed_request(q, rw);
+ __freed_request(q, sync);
- if (unlikely(rl->starved[rw ^ 1]))
- __freed_request(q, rw ^ 1);
+ if (unlikely(rl->starved[sync ^ 1]))
+ __freed_request(q, sync ^ 1);
}
/*
@@ -743,15 +743,15 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
struct request *rq = NULL;
struct request_list *rl = &q->rq;
struct io_context *ioc = NULL;
- const int rw = rw_flags & 0x01;
+ const bool is_sync = rw_is_sync(rw_flags) != 0;
int may_queue, priv;
may_queue = elv_may_queue(q, rw_flags);
if (may_queue == ELV_MQUEUE_NO)
goto rq_starved;
- if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
- if (rl->count[rw]+1 >= q->nr_requests) {
+ if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
+ if (rl->count[is_sync]+1 >= q->nr_requests) {
ioc = current_io_context(GFP_ATOMIC, q->node);
/*
* The queue will fill after this allocation, so set
@@ -759,9 +759,9 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
* This process will be allowed to complete a batch of
* requests, others will be blocked.
*/
- if (!blk_queue_full(q, rw)) {
+ if (!blk_queue_full(q, is_sync)) {
ioc_set_batching(q, ioc);
- blk_set_queue_full(q, rw);
+ blk_set_queue_full(q, is_sync);
} else {
if (may_queue != ELV_MQUEUE_MUST
&& !ioc_batching(q, ioc)) {
@@ -774,7 +774,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
}
}
}
- blk_set_queue_congested(q, rw);
+ blk_set_queue_congested(q, is_sync);
}
/*
@@ -782,16 +782,18 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
* limit of requests, otherwise we could have thousands of requests
* allocated with any setting of ->nr_requests
*/
- if (rl->count[rw] >= (3 * q->nr_requests / 2))
+ if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
goto out;
- rl->count[rw]++;
- rl->starved[rw] = 0;
+ rl->count[is_sync]++;
+ rl->starved[is_sync] = 0;
priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
if (priv)
rl->elvpriv++;
+ if (blk_queue_io_stat(q))
+ rw_flags |= REQ_IO_STAT;
spin_unlock_irq(q->queue_lock);
rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -804,7 +806,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
* wait queue, but this is pretty rare.
*/
spin_lock_irq(q->queue_lock);
- freed_request(q, rw, priv);
+ freed_request(q, is_sync, priv);
/*
* in the very unlikely event that allocation failed and no
@@ -814,8 +816,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
* rq mempool into READ and WRITE
*/
rq_starved:
- if (unlikely(rl->count[rw] == 0))
- rl->starved[rw] = 1;
+ if (unlikely(rl->count[is_sync] == 0))
+ rl->starved[is_sync] = 1;
goto out;
}
@@ -829,7 +831,7 @@ rq_starved:
if (ioc_batching(q, ioc))
ioc->nr_batch_requests--;
- trace_block_getrq(q, bio, rw);
+ trace_block_getrq(q, bio, rw_flags & 1);
out:
return rq;
}
@@ -843,7 +845,7 @@ out:
static struct request *get_request_wait(struct request_queue *q, int rw_flags,
struct bio *bio)
{
- const int rw = rw_flags & 0x01;
+ const bool is_sync = rw_is_sync(rw_flags) != 0;
struct request *rq;
rq = get_request(q, rw_flags, bio, GFP_NOIO);
@@ -852,10 +854,10 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
struct io_context *ioc;
struct request_list *rl = &q->rq;
- prepare_to_wait_exclusive(&rl->wait[rw], &wait,
+ prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
TASK_UNINTERRUPTIBLE);
- trace_block_sleeprq(q, bio, rw);
+ trace_block_sleeprq(q, bio, rw_flags & 1);
__generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
@@ -871,7 +873,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
ioc_set_batching(q, ioc);
spin_lock_irq(q->queue_lock);
- finish_wait(&rl->wait[rw], &wait);
+ finish_wait(&rl->wait[is_sync], &wait);
rq = get_request(q, rw_flags, bio, GFP_NOIO);
};
@@ -1070,14 +1072,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
* it didn't come out of our reserved rq pools
*/
if (req->cmd_flags & REQ_ALLOCED) {
- int rw = rq_data_dir(req);
+ int is_sync = rq_is_sync(req) != 0;
int priv = req->cmd_flags & REQ_ELVPRIV;
BUG_ON(!list_empty(&req->queuelist));
BUG_ON(!hlist_unhashed(&req->hash));
blk_free_request(q, req);
- freed_request(q, rw, priv);
+ freed_request(q, is_sync, priv);
}
}
EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1124,10 +1126,10 @@ void init_request_from_bio(struct request *req, struct bio *bio)
if (bio_sync(bio))
req->cmd_flags |= REQ_RW_SYNC;
- if (bio_unplug(bio))
- req->cmd_flags |= REQ_UNPLUG;
if (bio_rw_meta(bio))
req->cmd_flags |= REQ_RW_META;
+ if (bio_noidle(bio))
+ req->cmd_flags |= REQ_NOIDLE;
req->errors = 0;
req->hard_sector = req->sector = bio->bi_sector;
@@ -1136,6 +1138,15 @@ void init_request_from_bio(struct request *req, struct bio *bio)
blk_rq_bio_prep(req->q, req, bio);
}
+/*
+ * Only disabling plugging for non-rotational devices if it does tagging
+ * as well, otherwise we do need the proper merging
+ */
+static inline bool queue_should_plug(struct request_queue *q)
+{
+ return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
+}
+
static int __make_request(struct request_queue *q, struct bio *bio)
{
struct request *req;
@@ -1242,11 +1253,11 @@ get_rq:
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
bio_flagged(bio, BIO_CPU_AFFINE))
req->cpu = blk_cpu_to_group(smp_processor_id());
- if (!blk_queue_nonrot(q) && elv_queue_empty(q))
+ if (queue_should_plug(q) && elv_queue_empty(q))
blk_plug_device(q);
add_request(q, req);
out:
- if (unplug || blk_queue_nonrot(q))
+ if (unplug || !queue_should_plug(q))
__generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
return 0;
@@ -1664,9 +1675,7 @@ EXPORT_SYMBOL(blkdev_dequeue_request);
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
- struct gendisk *disk = req->rq_disk;
-
- if (!disk || !blk_do_io_stat(disk->queue))
+ if (!blk_do_io_stat(req))
return;
if (blk_fs_request(req)) {
@@ -1683,9 +1692,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
static void blk_account_io_done(struct request *req)
{
- struct gendisk *disk = req->rq_disk;
-
- if (!disk || !blk_do_io_stat(disk->queue))
+ if (!blk_do_io_stat(req))
return;
/*
@@ -1700,7 +1707,7 @@ static void blk_account_io_done(struct request *req)
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(disk, req->sector);
+ part = disk_map_sector_rcu(req->rq_disk, req->sector);
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, ticks[rw], duration);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e39cb24b767..23d2a6fe34a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -338,6 +338,22 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
return 1;
}
+static void blk_account_io_merge(struct request *req)
+{
+ if (blk_do_io_stat(req)) {
+ struct hd_struct *part;
+ int cpu;
+
+ cpu = part_stat_lock();
+ part = disk_map_sector_rcu(req->rq_disk, req->sector);
+
+ part_round_stats(cpu, part);
+ part_dec_in_flight(part);
+
+ part_stat_unlock();
+ }
+}
+
/*
* Has to be called with the request spinlock acquired
*/
@@ -386,18 +402,10 @@ static int attempt_merge(struct request_queue *q, struct request *req,
elv_merge_requests(q, req, next);
- if (req->rq_disk) {
- struct hd_struct *part;
- int cpu;
-
- cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, req->sector);
-
- part_round_stats(cpu, part);
- part_dec_in_flight(part);
-
- part_stat_unlock();
- }
+ /*
+ * 'next' is going away, so update stats accordingly
+ */
+ blk_account_io_merge(next);
req->ioprio = ioprio_best(req->ioprio, next->ioprio);
if (blk_rq_cpu_valid(next))
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 59fd05d9f1d..57af728d94b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -156,26 +156,28 @@ EXPORT_SYMBOL(blk_queue_make_request);
/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
- * @q: the request queue for the device
- * @dma_addr: bus address limit
+ * @q: the request queue for the device
+ * @dma_mask: the maximum address the device can handle
*
* Description:
* Different hardware can have different requirements as to what pages
* it can do I/O directly to. A low level driver can call
* blk_queue_bounce_limit to have lower memory pages allocated as bounce
- * buffers for doing I/O to pages residing above @dma_addr.
+ * buffers for doing I/O to pages residing above @dma_mask.
**/
-void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
+void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
{
- unsigned long b_pfn = dma_addr >> PAGE_SHIFT;
+ unsigned long b_pfn = dma_mask >> PAGE_SHIFT;
int dma = 0;
q->bounce_gfp = GFP_NOIO;
#if BITS_PER_LONG == 64
- /* Assume anything <= 4GB can be handled by IOMMU.
- Actually some IOMMUs can handle everything, but I don't
- know of a way to test this here. */
- if (b_pfn < (min_t(u64, 0x100000000UL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
+ /*
+ * Assume anything <= 4GB can be handled by IOMMU. Actually
+ * some IOMMUs can handle everything, but I don't know of a
+ * way to test this here.
+ */
+ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
dma = 1;
q->bounce_pfn = max_low_pfn;
#else
@@ -431,7 +433,7 @@ EXPORT_SYMBOL(blk_queue_segment_boundary);
*
* description:
* set required memory and length alignment for direct dma transactions.
- * this is used when buiding direct io requests for the queue.
+ * this is used when building direct io requests for the queue.
*
**/
void blk_queue_dma_alignment(struct request_queue *q, int mask)
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ce0efc6b26d..ee9c2160222 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -64,7 +64,7 @@ static int raise_blk_irq(int cpu, struct request *rq)
data->info = rq;
data->flags = 0;
- __smp_call_function_single(cpu, data);
+ __smp_call_function_single(cpu, data, 0);
return 0;
}
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e29ddfc73cf..3ff9bba3379 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -48,28 +48,28 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
q->nr_requests = nr;
blk_queue_congestion_threshold(q);
- if (rl->count[READ] >= queue_congestion_on_threshold(q))
- blk_set_queue_congested(q, READ);
- else if (rl->count[READ] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, READ);
-
- if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
- blk_set_queue_congested(q, WRITE);
- else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, WRITE);
-
- if (rl->count[READ] >= q->nr_requests) {
- blk_set_queue_full(q, READ);
- } else if (rl->count[READ]+1 <= q->nr_requests) {
- blk_clear_queue_full(q, READ);
- wake_up(&rl->wait[READ]);
+ if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
+ blk_set_queue_congested(q, BLK_RW_SYNC);
+ else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
+ blk_clear_queue_congested(q, BLK_RW_SYNC);
+
+ if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
+ blk_set_queue_congested(q, BLK_RW_ASYNC);
+ else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
+ blk_clear_queue_congested(q, BLK_RW_ASYNC);
+
+ if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+ blk_set_queue_full(q, BLK_RW_SYNC);
+ } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) {
+ blk_clear_queue_full(q, BLK_RW_SYNC);
+ wake_up(&rl->wait[BLK_RW_SYNC]);
}
- if (rl->count[WRITE] >= q->nr_requests) {
- blk_set_queue_full(q, WRITE);
- } else if (rl->count[WRITE]+1 <= q->nr_requests) {
- blk_clear_queue_full(q, WRITE);
- wake_up(&rl->wait[WRITE]);
+ if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+ blk_set_queue_full(q, BLK_RW_ASYNC);
+ } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
+ blk_clear_queue_full(q, BLK_RW_ASYNC);
+ wake_up(&rl->wait[BLK_RW_ASYNC]);
}
spin_unlock_irq(q->queue_lock);
return ret;
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index bbbdc4b8ccf..1ec0d503cac 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -211,6 +211,12 @@ void blk_abort_queue(struct request_queue *q)
struct request *rq, *tmp;
LIST_HEAD(list);
+ /*
+ * Not a request based block device, nothing to abort
+ */
+ if (!q->request_fn)
+ return;
+
spin_lock_irqsave(q->queue_lock, flags);
elv_abort_queue(q);
@@ -224,6 +230,13 @@ void blk_abort_queue(struct request_queue *q)
list_for_each_entry_safe(rq, tmp, &list, timeout_list)
blk_abort_request(rq);
+ /*
+ * Occasionally, blk_abort_request() will return without
+ * deleting the element from the list. Make sure we add those back
+ * instead of leaving them on the local stack list.
+ */
+ list_splice(&list, &q->timeout_list);
+
spin_unlock_irqrestore(q->queue_lock, flags);
}
diff --git a/block/blk.h b/block/blk.h
index 0dce92c3749..79c85f7c9ff 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -70,6 +70,10 @@ void blk_queue_congestion_threshold(struct request_queue *q);
int blk_dev_init(void);
+void elv_quiesce_start(struct request_queue *q);
+void elv_quiesce_end(struct request_queue *q);
+
+
/*
* Return the threshold (number of used requests) at which the queue is
* considered to be congested. It include a little hysteresis to keep the
@@ -102,18 +106,15 @@ static inline int blk_cpu_to_group(int cpu)
const struct cpumask *mask = cpu_coregroup_mask(cpu);
return cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
- return first_cpu(per_cpu(cpu_sibling_map, cpu));
+ return cpumask_first(topology_thread_cpumask(cpu));
#else
return cpu;
#endif
}
-static inline int blk_do_io_stat(struct request_queue *q)
+static inline int blk_do_io_stat(struct request *rq)
{
- if (q)
- return blk_queue_io_stat(q);
-
- return 0;
+ return rq->rq_disk && blk_rq_io_stat(rq);
}
#endif
diff --git a/block/blktrace.c b/block/blktrace.c
deleted file mode 100644
index 028120a0965..00000000000
--- a/block/blktrace.c
+++ /dev/null
@@ -1,860 +0,0 @@
-/*
- * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- */
-#include <linux/kernel.h>
-#include <linux/blkdev.h>
-#include <linux/blktrace_api.h>
-#include <linux/percpu.h>
-#include <linux/init.h>
-#include <linux/mutex.h>
-#include <linux/debugfs.h>
-#include <linux/time.h>
-#include <trace/block.h>
-#include <asm/uaccess.h>
-
-static unsigned int blktrace_seq __read_mostly = 1;
-
-/* Global reference count of probes */
-static DEFINE_MUTEX(blk_probe_mutex);
-static atomic_t blk_probes_ref = ATOMIC_INIT(0);
-
-static int blk_register_tracepoints(void);
-static void blk_unregister_tracepoints(void);
-
-/*
- * Send out a notify message.
- */
-static void trace_note(struct blk_trace *bt, pid_t pid, int action,
- const void *data, size_t len)
-{
- struct blk_io_trace *t;
-
- t = relay_reserve(bt->rchan, sizeof(*t) + len);
- if (t) {
- const int cpu = smp_processor_id();
-
- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->time = ktime_to_ns(ktime_get());
- t->device = bt->dev;
- t->action = action;
- t->pid = pid;
- t->cpu = cpu;
- t->pdu_len = len;
- memcpy((void *) t + sizeof(*t), data, len);
- }
-}
-
-/*
- * Send out a notify for this process, if we haven't done so since a trace
- * started
- */
-static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
-{
- tsk->btrace_seq = blktrace_seq;
- trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
-}
-
-static void trace_note_time(struct blk_trace *bt)
-{
- struct timespec now;
- unsigned long flags;
- u32 words[2];
-
- getnstimeofday(&now);
- words[0] = now.tv_sec;
- words[1] = now.tv_nsec;
-
- local_irq_save(flags);
- trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
- local_irq_restore(flags);
-}
-
-void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
-{
- int n;
- va_list args;
- unsigned long flags;
- char *buf;
-
- local_irq_save(flags);
- buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
- va_start(args, fmt);
- n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
- va_end(args);
-
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(__trace_note_message);
-
-static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
- pid_t pid)
-{
- if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
- return 1;
- if (sector < bt->start_lba || sector > bt->end_lba)
- return 1;
- if (bt->pid && pid != bt->pid)
- return 1;
-
- return 0;
-}
-
-/*
- * Data direction bit lookup
- */
-static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
-
-/* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \
- (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) )
-
-/*
- * The worker for the various blk_add_trace*() types. Fills out a
- * blk_io_trace structure and places it in a per-cpu subbuffer.
- */
-static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- int rw, u32 what, int error, int pdu_len, void *pdu_data)
-{
- struct task_struct *tsk = current;
- struct blk_io_trace *t;
- unsigned long flags;
- unsigned long *sequence;
- pid_t pid;
- int cpu;
-
- if (unlikely(bt->trace_state != Blktrace_running))
- return;
-
- what |= ddir_act[rw & WRITE];
- what |= MASK_TC_BIT(rw, BARRIER);
- what |= MASK_TC_BIT(rw, SYNCIO);
- what |= MASK_TC_BIT(rw, AHEAD);
- what |= MASK_TC_BIT(rw, META);
- what |= MASK_TC_BIT(rw, DISCARD);
-
- pid = tsk->pid;
- if (unlikely(act_log_check(bt, what, sector, pid)))
- return;
-
- /*
- * A word about the locking here - we disable interrupts to reserve
- * some space in the relay per-cpu buffer, to prevent an irq
- * from coming in and stepping on our toes.
- */
- local_irq_save(flags);
-
- if (unlikely(tsk->btrace_seq != blktrace_seq))
- trace_note_tsk(bt, tsk);
-
- t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
- if (t) {
- cpu = smp_processor_id();
- sequence = per_cpu_ptr(bt->sequence, cpu);
-
- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->sequence = ++(*sequence);
- t->time = ktime_to_ns(ktime_get());
- t->sector = sector;
- t->bytes = bytes;
- t->action = what;
- t->pid = pid;
- t->device = bt->dev;
- t->cpu = cpu;
- t->error = error;
- t->pdu_len = pdu_len;
-
- if (pdu_len)
- memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
- }
-
- local_irq_restore(flags);
-}
-
-static struct dentry *blk_tree_root;
-static DEFINE_MUTEX(blk_tree_mutex);
-
-static void blk_trace_cleanup(struct blk_trace *bt)
-{
- debugfs_remove(bt->msg_file);
- debugfs_remove(bt->dropped_file);
- relay_close(bt->rchan);
- free_percpu(bt->sequence);
- free_percpu(bt->msg_data);
- kfree(bt);
- mutex_lock(&blk_probe_mutex);
- if (atomic_dec_and_test(&blk_probes_ref))
- blk_unregister_tracepoints();
- mutex_unlock(&blk_probe_mutex);
-}
-
-int blk_trace_remove(struct request_queue *q)
-{
- struct blk_trace *bt;
-
- bt = xchg(&q->blk_trace, NULL);
- if (!bt)
- return -EINVAL;
-
- if (bt->trace_state == Blktrace_setup ||
- bt->trace_state == Blktrace_stopped)
- blk_trace_cleanup(bt);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(blk_trace_remove);
-
-static int blk_dropped_open(struct inode *inode, struct file *filp)
-{
- filp->private_data = inode->i_private;
-
- return 0;
-}
-
-static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct blk_trace *bt = filp->private_data;
- char buf[16];
-
- snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
-
- return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
-}
-
-static const struct file_operations blk_dropped_fops = {
- .owner = THIS_MODULE,
- .open = blk_dropped_open,
- .read = blk_dropped_read,
-};
-
-static int blk_msg_open(struct inode *inode, struct file *filp)
-{
- filp->private_data = inode->i_private;
-
- return 0;
-}
-
-static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- char *msg;
- struct blk_trace *bt;
-
- if (count > BLK_TN_MAX_MSG)
- return -EINVAL;
-
- msg = kmalloc(count, GFP_KERNEL);
- if (msg == NULL)
- return -ENOMEM;
-
- if (copy_from_user(msg, buffer, count)) {
- kfree(msg);
- return -EFAULT;
- }
-
- bt = filp->private_data;
- __trace_note_message(bt, "%s", msg);
- kfree(msg);
-
- return count;
-}
-
-static const struct file_operations blk_msg_fops = {
- .owner = THIS_MODULE,
- .open = blk_msg_open,
- .write = blk_msg_write,
-};
-
-/*
- * Keep track of how many times we encountered a full subbuffer, to aid
- * the user space app in telling how many lost events there were.
- */
-static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
- void *prev_subbuf, size_t prev_padding)
-{
- struct blk_trace *bt;
-
- if (!relay_buf_full(buf))
- return 1;
-
- bt = buf->chan->private_data;
- atomic_inc(&bt->dropped);
- return 0;
-}
-
-static int blk_remove_buf_file_callback(struct dentry *dentry)
-{
- struct dentry *parent = dentry->d_parent;
- debugfs_remove(dentry);
-
- /*
- * this will fail for all but the last file, but that is ok. what we
- * care about is the top level buts->name directory going away, when
- * the last trace file is gone. Then we don't have to rmdir() that
- * manually on trace stop, so it nicely solves the issue with
- * force killing of running traces.
- */
-
- debugfs_remove(parent);
- return 0;
-}
-
-static struct dentry *blk_create_buf_file_callback(const char *filename,
- struct dentry *parent,
- int mode,
- struct rchan_buf *buf,
- int *is_global)
-{
- return debugfs_create_file(filename, mode, parent, buf,
- &relay_file_operations);
-}
-
-static struct rchan_callbacks blk_relay_callbacks = {
- .subbuf_start = blk_subbuf_start_callback,
- .create_buf_file = blk_create_buf_file_callback,
- .remove_buf_file = blk_remove_buf_file_callback,
-};
-
-/*
- * Setup everything required to start tracing
- */
-int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct blk_user_trace_setup *buts)
-{
- struct blk_trace *old_bt, *bt = NULL;
- struct dentry *dir = NULL;
- int ret, i;
-
- if (!buts->buf_size || !buts->buf_nr)
- return -EINVAL;
-
- strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
- buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
-
- /*
- * some device names have larger paths - convert the slashes
- * to underscores for this to work as expected
- */
- for (i = 0; i < strlen(buts->name); i++)
- if (buts->name[i] == '/')
- buts->name[i] = '_';
-
- ret = -ENOMEM;
- bt = kzalloc(sizeof(*bt), GFP_KERNEL);
- if (!bt)
- goto err;
-
- bt->sequence = alloc_percpu(unsigned long);
- if (!bt->sequence)
- goto err;
-
- bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
- if (!bt->msg_data)
- goto err;
-
- ret = -ENOENT;
-
- if (!blk_tree_root) {
- blk_tree_root = debugfs_create_dir("block", NULL);
- if (!blk_tree_root)
- return -ENOMEM;
- }
-
- dir = debugfs_create_dir(buts->name, blk_tree_root);
-
- if (!dir)
- goto err;
-
- bt->dir = dir;
- bt->dev = dev;
- atomic_set(&bt->dropped, 0);
-
- ret = -EIO;
- bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
- if (!bt->dropped_file)
- goto err;
-
- bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
- if (!bt->msg_file)
- goto err;
-
- bt->rchan = relay_open("trace", dir, buts->buf_size,
- buts->buf_nr, &blk_relay_callbacks, bt);
- if (!bt->rchan)
- goto err;
-
- bt->act_mask = buts->act_mask;
- if (!bt->act_mask)
- bt->act_mask = (u16) -1;
-
- bt->start_lba = buts->start_lba;
- bt->end_lba = buts->end_lba;
- if (!bt->end_lba)
- bt->end_lba = -1ULL;
-
- bt->pid = buts->pid;
- bt->trace_state = Blktrace_setup;
-
- mutex_lock(&blk_probe_mutex);
- if (atomic_add_return(1, &blk_probes_ref) == 1) {
- ret = blk_register_tracepoints();
- if (ret)
- goto probe_err;
- }
- mutex_unlock(&blk_probe_mutex);
-
- ret = -EBUSY;
- old_bt = xchg(&q->blk_trace, bt);
- if (old_bt) {
- (void) xchg(&q->blk_trace, old_bt);
- goto err;
- }
-
- return 0;
-probe_err:
- atomic_dec(&blk_probes_ref);
- mutex_unlock(&blk_probe_mutex);
-err:
- if (bt) {
- if (bt->msg_file)
- debugfs_remove(bt->msg_file);
- if (bt->dropped_file)
- debugfs_remove(bt->dropped_file);
- free_percpu(bt->sequence);
- free_percpu(bt->msg_data);
- if (bt->rchan)
- relay_close(bt->rchan);
- kfree(bt);
- }
- return ret;
-}
-
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- char __user *arg)
-{
- struct blk_user_trace_setup buts;
- int ret;
-
- ret = copy_from_user(&buts, arg, sizeof(buts));
- if (ret)
- return -EFAULT;
-
- ret = do_blk_trace_setup(q, name, dev, &buts);
- if (ret)
- return ret;
-
- if (copy_to_user(arg, &buts, sizeof(buts)))
- return -EFAULT;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(blk_trace_setup);
-
-int blk_trace_startstop(struct request_queue *q, int start)
-{
- struct blk_trace *bt;
- int ret;
-
- if ((bt = q->blk_trace) == NULL)
- return -EINVAL;
-
- /*
- * For starting a trace, we can transition from a setup or stopped
- * trace. For stopping a trace, the state must be running
- */
- ret = -EINVAL;
- if (start) {
- if (bt->trace_state == Blktrace_setup ||
- bt->trace_state == Blktrace_stopped) {
- blktrace_seq++;
- smp_mb();
- bt->trace_state = Blktrace_running;
-
- trace_note_time(bt);
- ret = 0;
- }
- } else {
- if (bt->trace_state == Blktrace_running) {
- bt->trace_state = Blktrace_stopped;
- relay_flush(bt->rchan);
- ret = 0;
- }
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(blk_trace_startstop);
-
-/**
- * blk_trace_ioctl: - handle the ioctls associated with tracing
- * @bdev: the block device
- * @cmd: the ioctl cmd
- * @arg: the argument data, if any
- *
- **/
-int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
-{
- struct request_queue *q;
- int ret, start = 0;
- char b[BDEVNAME_SIZE];
-
- q = bdev_get_queue(bdev);
- if (!q)
- return -ENXIO;
-
- mutex_lock(&bdev->bd_mutex);
-
- switch (cmd) {
- case BLKTRACESETUP:
- bdevname(bdev, b);
- ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
- break;
- case BLKTRACESTART:
- start = 1;
- case BLKTRACESTOP:
- ret = blk_trace_startstop(q, start);
- break;
- case BLKTRACETEARDOWN:
- ret = blk_trace_remove(q);
- break;
- default:
- ret = -ENOTTY;
- break;
- }
-
- mutex_unlock(&bdev->bd_mutex);
- return ret;
-}
-
-/**
- * blk_trace_shutdown: - stop and cleanup trace structures
- * @q: the request queue associated with the device
- *
- **/
-void blk_trace_shutdown(struct request_queue *q)
-{
- if (q->blk_trace) {
- blk_trace_startstop(q, 0);
- blk_trace_remove(q);
- }
-}
-
-/*
- * blktrace probes
- */
-
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q: queue the io is for
- * @rq: the source request
- * @what: the action
- *
- * Description:
- * Records an action against a request. Will log the bio offset + size.
- *
- **/
-static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
- u32 what)
-{
- struct blk_trace *bt = q->blk_trace;
- int rw = rq->cmd_flags & 0x03;
-
- if (likely(!bt))
- return;
-
- if (blk_discard_rq(rq))
- rw |= (1 << BIO_RW_DISCARD);
-
- if (blk_pc_request(rq)) {
- what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
- sizeof(rq->cmd), rq->cmd);
- } else {
- what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
- rw, what, rq->errors, 0, NULL);
- }
-}
-
-static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
-{
- blk_add_trace_rq(q, rq, BLK_TA_ABORT);
-}
-
-static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
-{
- blk_add_trace_rq(q, rq, BLK_TA_INSERT);
-}
-
-static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
-{
- blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
-}
-
-static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq)
-{
- blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
-}
-
-static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq)
-{
- blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q: queue the io is for
- * @bio: the source bio
- * @what: the action
- *
- * Description:
- * Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what)
-{
- struct blk_trace *bt = q->blk_trace;
-
- if (likely(!bt))
- return;
-
- __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
- !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
-{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
-}
-
-static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
-{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
-}
-
-static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio)
-{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
-}
-
-static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio)
-{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
-}
-
-static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
-{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
-}
-
-static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw)
-{
- if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
- else {
- struct blk_trace *bt = q->blk_trace;
-
- if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
- }
-}
-
-
-static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw)
-{
- if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
- else {
- struct blk_trace *bt = q->blk_trace;
-
- if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL);
- }
-}
-
-static void blk_add_trace_plug(struct request_queue *q)
-{
- struct blk_trace *bt = q->blk_trace;
-
- if (bt)
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
-}
-
-static void blk_add_trace_unplug_io(struct request_queue *q)
-{
- struct blk_trace *bt = q->blk_trace;
-
- if (bt) {
- unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
- __be64 rpdu = cpu_to_be64(pdu);
-
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
- sizeof(rpdu), &rpdu);
- }
-}
-
-static void blk_add_trace_unplug_timer(struct request_queue *q)
-{
- struct blk_trace *bt = q->blk_trace;
-
- if (bt) {
- unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
- __be64 rpdu = cpu_to_be64(pdu);
-
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
- sizeof(rpdu), &rpdu);
- }
-}
-
-static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
- unsigned int pdu)
-{
- struct blk_trace *bt = q->blk_trace;
-
- if (bt) {
- __be64 rpdu = cpu_to_be64(pdu);
-
- __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
- BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
- sizeof(rpdu), &rpdu);
- }
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q: queue the io is for
- * @bio: the source bio
- * @dev: target device
- * @from: source sector
- * @to: target sector
- *
- * Description:
- * Device mapper or raid target sometimes need to split a bio because
- * it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from, sector_t to)
-{
- struct blk_trace *bt = q->blk_trace;
- struct blk_io_trace_remap r;
-
- if (likely(!bt))
- return;
-
- r.device = cpu_to_be32(dev);
- r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
- r.sector = cpu_to_be64(to);
-
- __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
- !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
-/**
- * blk_add_driver_data - Add binary message with driver-specific data
- * @q: queue the io is for
- * @rq: io request
- * @data: driver-specific data
- * @len: length of driver-specific data
- *
- * Description:
- * Some drivers might want to write driver-specific data per request.
- *
- **/
-void blk_add_driver_data(struct request_queue *q,
- struct request *rq,
- void *data, size_t len)
-{
- struct blk_trace *bt = q->blk_trace;
-
- if (likely(!bt))
- return;
-
- if (blk_pc_request(rq))
- __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
- rq->errors, len, data);
- else
- __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
- 0, BLK_TA_DRV_DATA, rq->errors, len, data);
-}
-EXPORT_SYMBOL_GPL(blk_add_driver_data);
-
-static int blk_register_tracepoints(void)
-{
- int ret;
-
- ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
- WARN_ON(ret);
- ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
- WARN_ON(ret);
- ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
- WARN_ON(ret);
- ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
- WARN_ON(ret);
- ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
- WARN_ON(ret);
- ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
- WARN_ON(ret);
- ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
- WARN_ON(ret);
- ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
- WARN_ON(ret);
- ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
- WARN_ON(ret);
- ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
- WARN_ON(ret);
- ret = register_trace_block_getrq(blk_add_trace_getrq);
- WARN_ON(ret);
- ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
- WARN_ON(ret);
- ret = register_trace_block_plug(blk_add_trace_plug);
- WARN_ON(ret);
- ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
- WARN_ON(ret);
- ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
- WARN_ON(ret);
- ret = register_trace_block_split(blk_add_trace_split);
- WARN_ON(ret);
- ret = register_trace_block_remap(blk_add_trace_remap);
- WARN_ON(ret);
- return 0;
-}
-
-static void blk_unregister_tracepoints(void)
-{
- unregister_trace_block_remap(blk_add_trace_remap);
- unregister_trace_block_split(blk_add_trace_split);
- unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
- unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
- unregister_trace_block_plug(blk_add_trace_plug);
- unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
- unregister_trace_block_getrq(blk_add_trace_getrq);
- unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
- unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
- unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
- unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
- unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
- unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
- unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
- unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
- unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
- unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
-
- tracepoint_synchronize_unregister();
-}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 664ebfd092e..a55a9bd75bd 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -56,9 +56,6 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
-#define ASYNC (0)
-#define SYNC (1)
-
#define sample_valid(samples) ((samples) > 80)
/*
@@ -83,6 +80,14 @@ struct cfq_data {
* rr list of queues with requests and the count of them
*/
struct cfq_rb_root service_tree;
+
+ /*
+ * Each priority tree is sorted by next_request position. These
+ * trees are used when determining if two or more queues are
+ * interleaving requests (see cfq_close_cooperator).
+ */
+ struct rb_root prio_trees[CFQ_PRIO_LISTS];
+
unsigned int busy_queues;
/*
* Used to track any pending rt requests so we can pre-empt current
@@ -147,6 +152,10 @@ struct cfq_queue {
struct rb_node rb_node;
/* service_tree key */
unsigned long rb_key;
+ /* prio tree member */
+ struct rb_node p_node;
+ /* prio tree root we belong to, if any */
+ struct rb_root *p_root;
/* sorted list of pending requests */
struct rb_root sort_list;
/* if fifo isn't expired, next request to serve */
@@ -160,6 +169,7 @@ struct cfq_queue {
unsigned long slice_end;
long slice_resid;
+ unsigned int slice_dispatch;
/* pending metadata requests */
int meta_pending;
@@ -176,15 +186,15 @@ struct cfq_queue {
enum cfqq_state_flags {
CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
+ CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
- CFQ_CFQQ_FLAG_must_dispatch, /* must dispatch, even if expired */
CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
- CFQ_CFQQ_FLAG_queue_new, /* queue never been serviced */
CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
CFQ_CFQQ_FLAG_sync, /* synchronous queue */
+ CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */
};
#define CFQ_CFQQ_FNS(name) \
@@ -203,15 +213,15 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
+CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc);
CFQ_CFQQ_FNS(must_alloc_slice);
-CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
-CFQ_CFQQ_FNS(queue_new);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
+CFQ_CFQQ_FNS(coop);
#undef CFQ_CFQQ_FNS
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
@@ -420,13 +430,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
return NULL;
}
+static void rb_erase_init(struct rb_node *n, struct rb_root *root)
+{
+ rb_erase(n, root);
+ RB_CLEAR_NODE(n);
+}
+
static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
if (root->left == n)
root->left = NULL;
-
- rb_erase(n, &root->rb);
- RB_CLEAR_NODE(n);
+ rb_erase_init(n, &root->rb);
}
/*
@@ -471,8 +485,8 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
* requests waiting to be processed. It is sorted in the order that
* we will service the queues.
*/
-static void cfq_service_tree_add(struct cfq_data *cfqd,
- struct cfq_queue *cfqq, int add_front)
+static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ int add_front)
{
struct rb_node **p, *parent;
struct cfq_queue *__cfqq;
@@ -545,6 +559,67 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
}
+static struct cfq_queue *
+cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
+ sector_t sector, struct rb_node **ret_parent,
+ struct rb_node ***rb_link)
+{
+ struct rb_node **p, *parent;
+ struct cfq_queue *cfqq = NULL;
+
+ parent = NULL;
+ p = &root->rb_node;
+ while (*p) {
+ struct rb_node **n;
+
+ parent = *p;
+ cfqq = rb_entry(parent, struct cfq_queue, p_node);
+
+ /*
+ * Sort strictly based on sector. Smallest to the left,
+ * largest to the right.
+ */
+ if (sector > cfqq->next_rq->sector)
+ n = &(*p)->rb_right;
+ else if (sector < cfqq->next_rq->sector)
+ n = &(*p)->rb_left;
+ else
+ break;
+ p = n;
+ cfqq = NULL;
+ }
+
+ *ret_parent = parent;
+ if (rb_link)
+ *rb_link = p;
+ return cfqq;
+}
+
+static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+ struct rb_node **p, *parent;
+ struct cfq_queue *__cfqq;
+
+ if (cfqq->p_root) {
+ rb_erase(&cfqq->p_node, cfqq->p_root);
+ cfqq->p_root = NULL;
+ }
+
+ if (cfq_class_idle(cfqq))
+ return;
+ if (!cfqq->next_rq)
+ return;
+
+ cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
+ __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, cfqq->next_rq->sector,
+ &parent, &p);
+ if (!__cfqq) {
+ rb_link_node(&cfqq->p_node, parent, p);
+ rb_insert_color(&cfqq->p_node, cfqq->p_root);
+ } else
+ cfqq->p_root = NULL;
+}
+
/*
* Update cfqq's position in the service tree.
*/
@@ -553,8 +628,10 @@ static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
/*
* Resorting requires the cfqq to be on the RR list already.
*/
- if (cfq_cfqq_on_rr(cfqq))
+ if (cfq_cfqq_on_rr(cfqq)) {
cfq_service_tree_add(cfqd, cfqq, 0);
+ cfq_prio_tree_add(cfqd, cfqq);
+ }
}
/*
@@ -585,6 +662,10 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
if (!RB_EMPTY_NODE(&cfqq->rb_node))
cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+ if (cfqq->p_root) {
+ rb_erase(&cfqq->p_node, cfqq->p_root);
+ cfqq->p_root = NULL;
+ }
BUG_ON(!cfqd->busy_queues);
cfqd->busy_queues--;
@@ -614,7 +695,7 @@ static void cfq_add_rq_rb(struct request *rq)
{
struct cfq_queue *cfqq = RQ_CFQQ(rq);
struct cfq_data *cfqd = cfqq->cfqd;
- struct request *__alias;
+ struct request *__alias, *prev;
cfqq->queued[rq_is_sync(rq)]++;
@@ -631,7 +712,15 @@ static void cfq_add_rq_rb(struct request *rq)
/*
* check if this request is a better next-serve candidate
*/
+ prev = cfqq->next_rq;
cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
+
+ /*
+ * adjust priority tree position, if ->next_rq changes
+ */
+ if (prev != cfqq->next_rq)
+ cfq_prio_tree_add(cfqd, cfqq);
+
BUG_ON(!cfqq->next_rq);
}
@@ -774,10 +863,15 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
if (cfqq) {
cfq_log_cfqq(cfqd, cfqq, "set_active");
cfqq->slice_end = 0;
+ cfqq->slice_dispatch = 0;
+
+ cfq_clear_cfqq_wait_request(cfqq);
+ cfq_clear_cfqq_must_dispatch(cfqq);
cfq_clear_cfqq_must_alloc_slice(cfqq);
cfq_clear_cfqq_fifo_expire(cfqq);
cfq_mark_cfqq_slice_new(cfqq);
- cfq_clear_cfqq_queue_new(cfqq);
+
+ del_timer(&cfqd->idle_slice_timer);
}
cfqd->active_queue = cfqq;
@@ -795,7 +889,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (cfq_cfqq_wait_request(cfqq))
del_timer(&cfqd->idle_slice_timer);
- cfq_clear_cfqq_must_dispatch(cfqq);
cfq_clear_cfqq_wait_request(cfqq);
/*
@@ -840,11 +933,15 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
/*
* Get and set a new active queue for service.
*/
-static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
+static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
+ struct cfq_queue *cfqq)
{
- struct cfq_queue *cfqq;
+ if (!cfqq) {
+ cfqq = cfq_get_next_queue(cfqd);
+ if (cfqq)
+ cfq_clear_cfqq_coop(cfqq);
+ }
- cfqq = cfq_get_next_queue(cfqd);
__cfq_set_active_queue(cfqd, cfqq);
return cfqq;
}
@@ -858,28 +955,100 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
return cfqd->last_position - rq->sector;
}
+#define CIC_SEEK_THR 8 * 1024
+#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR)
+
static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
{
struct cfq_io_context *cic = cfqd->active_cic;
+ sector_t sdist = cic->seek_mean;
if (!sample_valid(cic->seek_samples))
- return 0;
+ sdist = CIC_SEEK_THR;
- return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
+ return cfq_dist_from_last(cfqd, rq) <= sdist;
}
-static int cfq_close_cooperator(struct cfq_data *cfq_data,
- struct cfq_queue *cfqq)
+static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
+ struct cfq_queue *cur_cfqq)
{
+ struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
+ struct rb_node *parent, *node;
+ struct cfq_queue *__cfqq;
+ sector_t sector = cfqd->last_position;
+
+ if (RB_EMPTY_ROOT(root))
+ return NULL;
+
+ /*
+ * First, if we find a request starting at the end of the last
+ * request, choose it.
+ */
+ __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
+ if (__cfqq)
+ return __cfqq;
+
+ /*
+ * If the exact sector wasn't found, the parent of the NULL leaf
+ * will contain the closest sector.
+ */
+ __cfqq = rb_entry(parent, struct cfq_queue, p_node);
+ if (cfq_rq_close(cfqd, __cfqq->next_rq))
+ return __cfqq;
+
+ if (__cfqq->next_rq->sector < sector)
+ node = rb_next(&__cfqq->p_node);
+ else
+ node = rb_prev(&__cfqq->p_node);
+ if (!node)
+ return NULL;
+
+ __cfqq = rb_entry(node, struct cfq_queue, p_node);
+ if (cfq_rq_close(cfqd, __cfqq->next_rq))
+ return __cfqq;
+
+ return NULL;
+}
+
+/*
+ * cfqd - obvious
+ * cur_cfqq - passed in so that we don't decide that the current queue is
+ * closely cooperating with itself.
+ *
+ * So, basically we're assuming that that cur_cfqq has dispatched at least
+ * one request, and that cfqd->last_position reflects a position on the disk
+ * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
+ * assumption.
+ */
+static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
+ struct cfq_queue *cur_cfqq,
+ int probe)
+{
+ struct cfq_queue *cfqq;
+
+ /*
+ * A valid cfq_io_context is necessary to compare requests against
+ * the seek_mean of the current cfqq.
+ */
+ if (!cfqd->active_cic)
+ return NULL;
+
/*
* We should notice if some of the queues are cooperating, eg
* working closely on the same area of the disk. In that case,
* we can group them together and don't waste time idling.
*/
- return 0;
-}
+ cfqq = cfqq_close(cfqd, cur_cfqq);
+ if (!cfqq)
+ return NULL;
-#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
+ if (cfq_cfqq_coop(cfqq))
+ return NULL;
+
+ if (!probe)
+ cfq_mark_cfqq_coop(cfqq);
+ return cfqq;
+}
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
@@ -917,14 +1086,6 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
if (!cic || !atomic_read(&cic->ioc->nr_tasks))
return;
- /*
- * See if this prio level has a good candidate
- */
- if (cfq_close_cooperator(cfqd, cfqq) &&
- (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
- return;
-
- cfq_mark_cfqq_must_dispatch(cfqq);
cfq_mark_cfqq_wait_request(cfqq);
/*
@@ -937,7 +1098,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
- cfq_log(cfqd, "arm_idle: %lu", sl);
+ cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
}
/*
@@ -1001,7 +1162,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
*/
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
{
- struct cfq_queue *cfqq;
+ struct cfq_queue *cfqq, *new_cfqq = NULL;
cfqq = cfqd->active_queue;
if (!cfqq)
@@ -1010,7 +1171,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
/*
* The active queue has run out of time, expire it and select new.
*/
- if (cfq_slice_used(cfqq))
+ if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
goto expire;
/*
@@ -1035,6 +1196,16 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
goto keep_queue;
/*
+ * If another queue has a request waiting within our mean seek
+ * distance, let it run. The expire code will check for close
+ * cooperators and put the close queue at the front of the service
+ * tree.
+ */
+ new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0);
+ if (new_cfqq)
+ goto expire;
+
+ /*
* No requests pending. If the active queue still has requests in
* flight or is idling for a new request, allow either of these
* conditions to happen (or time out) before selecting a new queue.
@@ -1048,71 +1219,11 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
expire:
cfq_slice_expired(cfqd, 0);
new_queue:
- cfqq = cfq_set_active_queue(cfqd);
+ cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
return cfqq;
}
-/*
- * Dispatch some requests from cfqq, moving them to the request queue
- * dispatch list.
- */
-static int
-__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
- int max_dispatch)
-{
- int dispatched = 0;
-
- BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
-
- do {
- struct request *rq;
-
- /*
- * follow expired path, else get first next available
- */
- rq = cfq_check_fifo(cfqq);
- if (rq == NULL)
- rq = cfqq->next_rq;
-
- /*
- * finally, insert request into driver dispatch list
- */
- cfq_dispatch_insert(cfqd->queue, rq);
-
- dispatched++;
-
- if (!cfqd->active_cic) {
- atomic_inc(&RQ_CIC(rq)->ioc->refcount);
- cfqd->active_cic = RQ_CIC(rq);
- }
-
- if (RB_EMPTY_ROOT(&cfqq->sort_list))
- break;
-
- /*
- * If there is a non-empty RT cfqq waiting for current
- * cfqq's timeslice to complete, pre-empt this cfqq
- */
- if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues)
- break;
-
- } while (dispatched < max_dispatch);
-
- /*
- * expire an async queue immediately if it has used up its slice. idle
- * queue always expire after 1 dispatch round.
- */
- if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
- dispatched >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
- cfq_class_idle(cfqq))) {
- cfqq->slice_end = jiffies + 1;
- cfq_slice_expired(cfqd, 0);
- }
-
- return dispatched;
-}
-
static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
{
int dispatched = 0;
@@ -1146,11 +1257,45 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
return dispatched;
}
+/*
+ * Dispatch a request from cfqq, moving them to the request queue
+ * dispatch list.
+ */
+static void cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+ struct request *rq;
+
+ BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
+
+ /*
+ * follow expired path, else get first next available
+ */
+ rq = cfq_check_fifo(cfqq);
+ if (!rq)
+ rq = cfqq->next_rq;
+
+ /*
+ * insert request into driver dispatch list
+ */
+ cfq_dispatch_insert(cfqd->queue, rq);
+
+ if (!cfqd->active_cic) {
+ struct cfq_io_context *cic = RQ_CIC(rq);
+
+ atomic_inc(&cic->ioc->refcount);
+ cfqd->active_cic = cic;
+ }
+}
+
+/*
+ * Find the cfqq that we need to service and move a request from that to the
+ * dispatch list
+ */
static int cfq_dispatch_requests(struct request_queue *q, int force)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
struct cfq_queue *cfqq;
- int dispatched;
+ unsigned int max_dispatch;
if (!cfqd->busy_queues)
return 0;
@@ -1158,29 +1303,63 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
if (unlikely(force))
return cfq_forced_dispatch(cfqd);
- dispatched = 0;
- while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
- int max_dispatch;
+ cfqq = cfq_select_queue(cfqd);
+ if (!cfqq)
+ return 0;
- max_dispatch = cfqd->cfq_quantum;
+ /*
+ * If this is an async queue and we have sync IO in flight, let it wait
+ */
+ if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
+ return 0;
+
+ max_dispatch = cfqd->cfq_quantum;
+ if (cfq_class_idle(cfqq))
+ max_dispatch = 1;
+
+ /*
+ * Does this cfqq already have too much IO in flight?
+ */
+ if (cfqq->dispatched >= max_dispatch) {
+ /*
+ * idle queue must always only have a single IO in flight
+ */
if (cfq_class_idle(cfqq))
- max_dispatch = 1;
+ return 0;
- if (cfqq->dispatched >= max_dispatch && cfqd->busy_queues > 1)
- break;
+ /*
+ * We have other queues, don't allow more IO from this one
+ */
+ if (cfqd->busy_queues > 1)
+ return 0;
- if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
- break;
+ /*
+ * we are the only queue, allow up to 4 times of 'quantum'
+ */
+ if (cfqq->dispatched >= 4 * max_dispatch)
+ return 0;
+ }
- cfq_clear_cfqq_must_dispatch(cfqq);
- cfq_clear_cfqq_wait_request(cfqq);
- del_timer(&cfqd->idle_slice_timer);
+ /*
+ * Dispatch a request from this cfqq
+ */
+ cfq_dispatch_request(cfqd, cfqq);
+ cfqq->slice_dispatch++;
+ cfq_clear_cfqq_must_dispatch(cfqq);
- dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
+ /*
+ * expire an async queue immediately if it has used up its slice. idle
+ * queue always expire after 1 dispatch round.
+ */
+ if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
+ cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
+ cfq_class_idle(cfqq))) {
+ cfqq->slice_end = jiffies + 1;
+ cfq_slice_expired(cfqd, 0);
}
- cfq_log(cfqd, "dispatched=%d", dispatched);
- return dispatched;
+ cfq_log(cfqd, "dispatched a request");
+ return 1;
}
/*
@@ -1323,14 +1502,14 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
if (ioc->ioc_data == cic)
rcu_assign_pointer(ioc->ioc_data, NULL);
- if (cic->cfqq[ASYNC]) {
- cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
- cic->cfqq[ASYNC] = NULL;
+ if (cic->cfqq[BLK_RW_ASYNC]) {
+ cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
+ cic->cfqq[BLK_RW_ASYNC] = NULL;
}
- if (cic->cfqq[SYNC]) {
- cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]);
- cic->cfqq[SYNC] = NULL;
+ if (cic->cfqq[BLK_RW_SYNC]) {
+ cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
+ cic->cfqq[BLK_RW_SYNC] = NULL;
}
}
@@ -1439,17 +1618,18 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
spin_lock_irqsave(cfqd->queue->queue_lock, flags);
- cfqq = cic->cfqq[ASYNC];
+ cfqq = cic->cfqq[BLK_RW_ASYNC];
if (cfqq) {
struct cfq_queue *new_cfqq;
- new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc, GFP_ATOMIC);
+ new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
+ GFP_ATOMIC);
if (new_cfqq) {
- cic->cfqq[ASYNC] = new_cfqq;
+ cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
cfq_put_queue(cfqq);
}
}
- cfqq = cic->cfqq[SYNC];
+ cfqq = cic->cfqq[BLK_RW_SYNC];
if (cfqq)
cfq_mark_cfqq_prio_changed(cfqq);
@@ -1500,13 +1680,13 @@ retry:
}
RB_CLEAR_NODE(&cfqq->rb_node);
+ RB_CLEAR_NODE(&cfqq->p_node);
INIT_LIST_HEAD(&cfqq->fifo);
atomic_set(&cfqq->ref, 0);
cfqq->cfqd = cfqd;
cfq_mark_cfqq_prio_changed(cfqq);
- cfq_mark_cfqq_queue_new(cfqq);
cfq_init_prio_data(cfqq, ioc);
@@ -1736,7 +1916,9 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
sector_t sdist;
u64 total;
- if (cic->last_request_pos < rq->sector)
+ if (!cic->last_request_pos)
+ sdist = 0;
+ else if (cic->last_request_pos < rq->sector)
sdist = rq->sector - cic->last_request_pos;
else
sdist = cic->last_request_pos - rq->sector;
@@ -1893,14 +2075,22 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (cfqq == cfqd->active_queue) {
/*
- * if we are waiting for a request for this queue, let it rip
- * immediately and flag that we must not expire this queue
- * just now
+ * Remember that we saw a request from this process, but
+ * don't start queuing just yet. Otherwise we risk seeing lots
+ * of tiny requests, because we disrupt the normal plugging
+ * and merging. If the request is already larger than a single
+ * page, let it rip immediately. For that case we assume that
+ * merging is already done. Ditto for a busy system that
+ * has other work pending, don't risk delaying until the
+ * idle timer unplug to continue working.
*/
if (cfq_cfqq_wait_request(cfqq)) {
+ if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
+ cfqd->busy_queues > 1) {
+ del_timer(&cfqd->idle_slice_timer);
+ blk_start_queueing(cfqd->queue);
+ }
cfq_mark_cfqq_must_dispatch(cfqq);
- del_timer(&cfqd->idle_slice_timer);
- blk_start_queueing(cfqd->queue);
}
} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
/*
@@ -1910,7 +2100,6 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
* this new queue is RT and the current one is BE
*/
cfq_preempt_queue(cfqd, cfqq);
- cfq_mark_cfqq_must_dispatch(cfqq);
blk_start_queueing(cfqd->queue);
}
}
@@ -1986,13 +2175,23 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
* or if we want to idle in case it has no pending requests.
*/
if (cfqd->active_queue == cfqq) {
+ const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
+
if (cfq_cfqq_slice_new(cfqq)) {
cfq_set_prio_slice(cfqd, cfqq);
cfq_clear_cfqq_slice_new(cfqq);
}
+ /*
+ * If there are no requests waiting in this queue, and
+ * there are other queues ready to issue requests, AND
+ * those other queues are issuing requests within our
+ * mean seek distance, give them a chance to run instead
+ * of idling.
+ */
if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
cfq_slice_expired(cfqd, 1);
- else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
+ else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
+ sync && !rq_noidle(rq))
cfq_arm_slice_timer(cfqd);
}
@@ -2054,7 +2253,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
if (!cic)
return ELV_MQUEUE_MAY;
- cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC);
+ cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
if (cfqq) {
cfq_init_prio_data(cfqq, cic->ioc);
cfq_prio_boost(cfqq);
@@ -2144,11 +2343,10 @@ static void cfq_kick_queue(struct work_struct *work)
struct cfq_data *cfqd =
container_of(work, struct cfq_data, unplug_work);
struct request_queue *q = cfqd->queue;
- unsigned long flags;
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irq(q->queue_lock);
blk_start_queueing(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irq(q->queue_lock);
}
/*
@@ -2170,6 +2368,12 @@ static void cfq_idle_slice_timer(unsigned long data)
timed_out = 0;
/*
+ * We saw a request before the queue expired, let it through
+ */
+ if (cfq_cfqq_must_dispatch(cfqq))
+ goto out_kick;
+
+ /*
* expired
*/
if (cfq_slice_used(cfqq))
@@ -2185,10 +2389,8 @@ static void cfq_idle_slice_timer(unsigned long data)
/*
* not expired and it has a request pending, let it dispatch
*/
- if (!RB_EMPTY_ROOT(&cfqq->sort_list)) {
- cfq_mark_cfqq_must_dispatch(cfqq);
+ if (!RB_EMPTY_ROOT(&cfqq->sort_list))
goto out_kick;
- }
}
expire:
cfq_slice_expired(cfqd, timed_out);
@@ -2251,12 +2453,22 @@ static void cfq_exit_queue(struct elevator_queue *e)
static void *cfq_init_queue(struct request_queue *q)
{
struct cfq_data *cfqd;
+ int i;
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!cfqd)
return NULL;
cfqd->service_tree = CFQ_RB_ROOT;
+
+ /*
+ * Not strictly needed (since RB_ROOT just clears the node and we
+ * zeroed cfqd on alloc), but better be safe in case someone decides
+ * to add magic to the rb code
+ */
+ for (i = 0; i < CFQ_PRIO_LISTS; i++)
+ cfqd->prio_trees[i] = RB_ROOT;
+
INIT_LIST_HEAD(&cfqd->cic_list);
cfqd->queue = q;
diff --git a/block/elevator.c b/block/elevator.c
index 98259eda0ef..7073a907257 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -573,7 +573,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
}
-static void elv_drain_elevator(struct request_queue *q)
+void elv_drain_elevator(struct request_queue *q)
{
static int printed;
while (q->elevator->ops->elevator_dispatch_fn(q, 1))
@@ -587,6 +587,31 @@ static void elv_drain_elevator(struct request_queue *q)
}
}
+/*
+ * Call with queue lock held, interrupts disabled
+ */
+void elv_quiesce_start(struct request_queue *q)
+{
+ queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
+
+ /*
+ * make sure we don't have any requests in flight
+ */
+ elv_drain_elevator(q);
+ while (q->rq.elvpriv) {
+ blk_start_queueing(q);
+ spin_unlock_irq(q->queue_lock);
+ msleep(10);
+ spin_lock_irq(q->queue_lock);
+ elv_drain_elevator(q);
+ }
+}
+
+void elv_quiesce_end(struct request_queue *q)
+{
+ queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
+}
+
void elv_insert(struct request_queue *q, struct request *rq, int where)
{
struct list_head *pos;
@@ -677,7 +702,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
}
if (unplug_it && blk_queue_plugged(q)) {
- int nrq = q->rq.count[READ] + q->rq.count[WRITE]
+ int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
- q->in_flight;
if (nrq >= q->unplug_thresh)
@@ -1101,18 +1126,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
* Turn on BYPASS and drain all requests w/ elevator private data
*/
spin_lock_irq(q->queue_lock);
-
- queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
-
- elv_drain_elevator(q);
-
- while (q->rq.elvpriv) {
- blk_start_queueing(q);
- spin_unlock_irq(q->queue_lock);
- msleep(10);
- spin_lock_irq(q->queue_lock);
- elv_drain_elevator(q);
- }
+ elv_quiesce_start(q);
/*
* Remember old elevator.
@@ -1136,7 +1150,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
*/
elevator_exit(old_elevator);
spin_lock_irq(q->queue_lock);
- queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
+ elv_quiesce_end(q);
spin_unlock_irq(q->queue_lock);
blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
diff --git a/block/genhd.c b/block/genhd.c
index a9ec910974c..1a4916e0173 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -98,7 +98,7 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
if (flags & DISK_PITER_REVERSE)
piter->idx = ptbl->len - 1;
- else if (flags & DISK_PITER_INCL_PART0)
+ else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
piter->idx = 0;
else
piter->idx = 1;
@@ -134,7 +134,8 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
/* determine iteration parameters */
if (piter->flags & DISK_PITER_REVERSE) {
inc = -1;
- if (piter->flags & DISK_PITER_INCL_PART0)
+ if (piter->flags & (DISK_PITER_INCL_PART0 |
+ DISK_PITER_INCL_EMPTY_PART0))
end = -1;
else
end = 0;
@@ -150,7 +151,10 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
part = rcu_dereference(ptbl->part[piter->idx]);
if (!part)
continue;
- if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects)
+ if (!part->nr_sects &&
+ !(piter->flags & DISK_PITER_INCL_EMPTY) &&
+ !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
+ piter->idx == 0))
continue;
get_device(part_to_dev(part));
@@ -1011,7 +1015,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
"\n\n");
*/
- disk_part_iter_init(&piter, gp, DISK_PITER_INCL_PART0);
+ disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
while ((hd = disk_part_iter_next(&piter))) {
cpu = part_stat_lock();
part_round_stats(cpu, hd);
diff --git a/block/ioctl.c b/block/ioctl.c
index 0f22e629b13..ad474d4bbcc 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -146,8 +146,6 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
struct bio *bio;
bio = bio_alloc(GFP_KERNEL, 0);
- if (!bio)
- return -ENOMEM;
bio->bi_end_io = blk_ioc_discard_endio;
bio->bi_bdev = bdev;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 626ee274c5c..82a0ca2f672 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -217,7 +217,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
struct bio *bio)
{
- int ret = 0;
+ int r, ret = 0;
/*
* fill in all the output members
@@ -242,7 +242,9 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
ret = -EFAULT;
}
- blk_rq_unmap_user(bio);
+ r = blk_rq_unmap_user(bio);
+ if (!ret)
+ ret = r;
blk_put_request(rq);
return ret;
@@ -288,6 +290,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
if (hdr->iovec_count) {
const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
+ size_t iov_data_len;
struct sg_iovec *iov;
iov = kmalloc(size, GFP_KERNEL);
@@ -302,8 +305,18 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
goto out;
}
+ /* SG_IO howto says that the shorter of the two wins */
+ iov_data_len = iov_length((struct iovec *)iov,
+ hdr->iovec_count);
+ if (hdr->dxfer_len < iov_data_len) {
+ hdr->iovec_count = iov_shorten((struct iovec *)iov,
+ hdr->iovec_count,
+ hdr->dxfer_len);
+ iov_data_len = hdr->dxfer_len;
+ }
+
ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count,
- hdr->dxfer_len, GFP_KERNEL);
+ iov_data_len, GFP_KERNEL);
kfree(iov);
} else if (hdr->dxfer_len)
ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,