aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Chris Mason <chris.mason@oracle.com> 2008-11-07 18:22:45 -0500
committer: Chris Mason <chris.mason@oracle.com> 2008-11-07 18:22:45 -0500
commit: 5f2cc086ccab27ac5252b3883ac004347860b4c7 (patch)
tree: 5d9d1a5ebce044fabf6491e454af60289895bba5
parent: 42e70e7a2f9d96fd843723fa46d5121cb3e551d0 (diff)
Btrfs: Avoid unplug storms during commit
While doing a commit, btrfs makes sure all the metadata blocks were properly written to disk, calling wait_on_page_writeback for each page. This writeback happens after allowing another transaction to start, so it competes for the disk with other processes in the FS.

If the page writeback bit is still set, each wait_on_page_writeback might trigger an unplug, even though the page might be waiting for checksumming to finish or might be waiting for the async work queue to submit the bio.

This trades wait_on_page_writeback for waiting on the extent writeback bits. It won't trigger any unplugs and substantially improves performance in a number of workloads.

This also changes the async bio submission to avoid requeueing if there is only one device. The requeue just wastes CPU time because there are no other devices to service.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--  fs/btrfs/transaction.c | 30
-rw-r--r--  fs/btrfs/volumes.c     |  3
2 files changed, 31 insertions, 2 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e72a013d24b..202c1b6df4a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -20,6 +20,7 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
+#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -331,6 +332,7 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
int werr = 0;
struct page *page;
struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
u64 start = 0;
u64 end;
unsigned long index;
@@ -371,6 +373,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
page_cache_release(page);
}
}
+ /*
+ * we unplug once and then use the wait_on_extent_bit for
+ * everything else
+ */
+ blk_run_address_space(btree_inode->i_mapping);
while(1) {
ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
EXTENT_DIRTY);
@@ -391,7 +398,28 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
if (err)
werr = err;
}
- wait_on_page_writeback(page);
+ if (PageWriteback(page)) {
+ /*
+ * we don't wait on the page writeback bit
+ * because that triggers a lot of unplugs.
+ * The extent bits are much nicer to
+ * the disks, but come with a slightly
+ * higher latency because we aren't forcing
+ * unplugs.
+ */
+ wait_on_extent_writeback(io_tree,
+ page_offset(page),
+ page_offset(page) +
+ PAGE_CACHE_SIZE - 1);
+ }
+ if (PageWriteback(page)) {
+ /*
+ * the state bits get cleared before the
+ * page bits, lets add some extra
+ * paranoia here
+ */
+ wait_on_page_writeback(page);
+ }
page_cache_release(page);
cond_resched();
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cbb9bb31431..80a27284dbf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -200,7 +200,8 @@ loop:
* is now congested. Back off and let other work structs
* run instead
*/
- if (pending && bdi_write_congested(bdi)) {
+ if (pending && bdi_write_congested(bdi) &&
+ fs_info->fs_devices->open_devices > 1) {
struct bio *old_head;
spin_lock(&device->io_lock);