From 4722607db6a78bd7748c51fa4c8d7371da797254 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 12:55:09 -0400 Subject: Btrfs: only write one super copy during fsync During a tree-log commit for fsync, we've been writing at least two copies of the super block and forcing them to disk. The other filesystems write only one, and this change brings us on par with them. A full transaction commit will write all the super copies, so we still have redundant info written on a regular basis. Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 78f6254ac2d..6d9ec285644 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2092,7 +2092,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - write_ctree_super(trans, root->fs_info->tree_root, 2); + write_ctree_super(trans, root->fs_info->tree_root, 1); ret = 0; out_wake_log_root: -- cgit v1.2.3 From 257c62e1bce03e5b9f3f069fd52ad73a56de71fd Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 13:21:08 -0400 Subject: Btrfs: avoid tree log commit when there are no changes rpm has a habit of running fdatasync when the file hasn't changed. We already detect if a file hasn't been changed in the current transaction but it might have been sent to the tree-log in this transaction and not changed since the last call to fsync. In this case, we want to avoid a tree log sync, which includes a number of synchronous writes and barriers. This commit extends the existing tracking of the last transaction to change a file to also track the last sub-transaction. The end result is that rpm -ivh and -Uvh are roughly twice as fast, and on par with ext3. Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 6 ++++++ fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 ++ fs/btrfs/file.c | 41 ++++++++++++++++++++++++++--------------- fs/btrfs/inode.c | 6 +++++- fs/btrfs/transaction.h | 1 + fs/btrfs/tree-log.c | 27 +++++++++++++++++++++++++++ fs/btrfs/tree-log.h | 3 +++ 8 files changed, 71 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c71abec0ab9..f6783a42f01 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -86,6 +86,12 @@ struct btrfs_inode { * transid of the trans_handle that last modified this inode */ u64 last_trans; + + /* + * log transid when this inode was last modified + */ + u64 last_sub_trans; + /* * transid that last logged this inode */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 36a19cd43e0..d0cede5ff25 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1009,6 +1009,7 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; unsigned long log_transid; + unsigned long last_log_commit; unsigned long log_batch; pid_t log_start_pid; bool log_multiple_pids; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ac8927bdc33..d4132aad9ea 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -919,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_writers, 0); root->log_batch = 0; root->log_transid = 0; + root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -1089,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->last_log_commit = 0; return 0; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 53fb1c997f0..4599113ed72 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1087,8 +1087,10 @@ out_nolock: btrfs_end_transaction(trans, root); else btrfs_commit_transaction(trans, root); - } else { + } else if (ret != BTRFS_NO_LOG_SYNC) { btrfs_commit_transaction(trans, root); + } else { + btrfs_end_transaction(trans, root); } } if (file->f_flags & O_DIRECT) { @@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) int ret = 0; struct btrfs_trans_handle *trans; + + /* we wait first, since the writeback may change the inode */ + root->log_batch++; + /* the VFS called filemap_fdatawrite for us */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->log_batch++; + /* * check the transaction that last modified this inode * and see if its already been committed @@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (!BTRFS_I(inode)->last_trans) goto out; + /* + * if the last transaction that changed this file was before + * the current transaction, we can bail out now without any + * syncing + */ mutex_lock(&root->fs_info->trans_mutex); if (BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { @@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) } mutex_unlock(&root->fs_info->trans_mutex); - root->log_batch++; - filemap_fdatawrite(inode->i_mapping); - btrfs_wait_ordered_range(inode, 0, (u64)-1); - root->log_batch++; - - if (datasync && !(inode->i_state & I_DIRTY_PAGES)) - goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ @@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) */ mutex_unlock(&dentry->d_inode->i_mutex); - if (ret > 0) { - ret = btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - ret = btrfs_end_transaction(trans, root); - else + if (ret != BTRFS_NO_LOG_SYNC) { + if (ret > 0) { ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + } else { + ret = btrfs_end_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ef399a7794f..5b9567caba0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3480,6 +3480,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->generation = 0; bi->sequence = 0; bi->last_trans = 0; + bi->last_sub_trans = 0; bi->logged_trans = 0; bi->delalloc_bytes = 0; bi->reserved_bytes = 0; @@ -4980,7 +4981,9 @@ again: set_page_dirty(page); SetPageUptodate(page); - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: @@ -5100,6 +5103,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) if (!ei) return NULL; ei->last_trans = 0; + ei->last_sub_trans = 0; ei->logged_trans = 0; ei->outstanding_extents = 0; ei->reserved_extents = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 663c6740491..f68cbbe61e5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { BTRFS_I(inode)->last_trans = trans->transaction->transid; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6d9ec285644..0a1bde26896 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1980,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + u64 log_transid = 0; mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; @@ -2018,6 +2019,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; + log_transid = root->log_transid; root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; @@ -2095,6 +2097,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, write_ctree_super(trans, root->fs_info->tree_root, 1); ret = 0; + mutex_lock(&root->log_mutex); + if (root->last_log_commit < log_transid) + root->last_log_commit = log_transid; + mutex_unlock(&root->log_mutex); + out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); @@ -2862,6 +2869,21 @@ out: return ret; } +static int inode_in_log(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == trans->transid && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -2901,6 +2923,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; + if (inode_in_log(trans, inode)) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + start_log_trans(trans, root); ret = btrfs_log_inode(trans, root, inode, inode_only); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index d09c7609e16..0776eacb508 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,9 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); -- cgit v1.2.3 From 690587d109ffe19d6743e4cc80c18b0906b7f9ff Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 13:29:19 -0400 Subject: Btrfs: streamline tree-log btree block writeout Syncing the tree log is a 3 phase operation. 1) write and wait for all the tree log blocks for a given root. 2) write and wait for all the tree log blocks for the tree of tree log roots. 3) write and wait for the super blocks (barriers here) This isn't as efficient as it could be because there is no requirement to wait for the blocks from step one to hit the disk before we start writing the blocks from step two. This commit changes the sequence so that we don't start waiting until all the tree blocks from both steps one and two have been sent to disk. We do this by breaking up btrfs_write_wait_marked_extents into two functions, which is trivial because it was already broken up into two parts. Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/transaction.h | 4 ++++ fs/btrfs/tree-log.c | 8 +++++++- 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0b8f36d4400..bca82a4ca8e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -344,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of - * those extents are on disk for transaction or log commit + * those extents are sent to disk but does not wait on them */ -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) { int ret; int err = 0; @@ -394,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, page_cache_release(page); } } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + while (1) { ret = find_first_extent_bit(dirty_pages, 0, &start, &end, EXTENT_DIRTY); @@ -424,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return werr; } +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int ret2; + + ret = btrfs_write_marked_extents(root, dirty_pages); + ret2 = btrfs_wait_marked_extents(root, dirty_pages); + return ret || ret2; +} + int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index f68cbbe61e5..d4e3e7a6938 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -108,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0a1bde26896..4aff766d171 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2013,7 +2013,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); btrfs_set_root_node(&log->root_item, log->node); @@ -2048,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2067,6 +2071,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -2075,6 +2080,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); BUG_ON(ret); + btrfs_wait_marked_extents(log, &log->dirty_log_pages); btrfs_set_super_log_root(&root->fs_info->super_for_commit, log_root_tree->node->start); -- cgit v1.2.3 From 0eda294dfc980c1cbe4f8a0564bf543f86a01ddb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 13:50:18 -0400 Subject: Btrfs: fix btrfs acl #ifdef checks The btrfs acl code was #ifdefing for a define that didn't exist. This correctly matches it to the values used by the Kconfig file. Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 6 +++--- fs/btrfs/ctree.h | 2 +- fs/btrfs/super.c | 2 +- fs/btrfs/xattr.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 69b355ae7f4..36160424427 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -27,7 +27,7 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { @@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { .set = btrfs_xattr_acl_access_set, }; -#else /* CONFIG_BTRFS_POSIX_ACL */ +#else /* CONFIG_BTRFS_FS_POSIX_ACL */ int btrfs_acl_chmod(struct inode *inode) { @@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -#endif /* CONFIG_BTRFS_POSIX_ACL */ +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d0cede5ff25..dfd7e6fc66d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2374,7 +2374,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e0a64328080..3fbbf0761ac 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -344,7 +344,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b0fc93f95fd..b6dd5967c48 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -260,7 +260,7 @@ err: * attributes are handled directly. */ struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, #endif -- cgit v1.2.3 From 5d5e103a70f74ae98e3965a4add1ab951d0651d1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 13 Oct 2009 16:46:49 -0400 Subject: Btrfs: fix possible ENOSPC problems with truncate There's a problem where we don't do any space reservation for truncates, which can cause you to OOPs because you will be allowed to go off in the weeds a bit since we don't account for the delalloc bytes that are created as a result of the truncate. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b9567caba0..78139efe41f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3032,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) + goto out; ret = -ENOMEM; again: page = grab_cache_page(mapping, index); - if (!page) + if (!page) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); goto out; + } page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; @@ -3070,6 +3080,10 @@ again: goto again; } + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); if (ret) { unlock_extent(io_tree, page_start, page_end, GFP_NOFS); @@ -3088,6 +3102,9 @@ again: unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + if (ret) + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); unlock_page(page); page_cache_release(page); out: @@ -3111,7 +3128,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - btrfs_truncate_page(inode->i_mapping, inode->i_size); + err = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (err) + return err; while (1) { struct btrfs_ordered_extent *ordered; @@ -5008,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - btrfs_truncate_page(inode->i_mapping, inode->i_size); + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (ret) + return; btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); -- cgit v1.2.3 From 86df7eb921a009515285e7171363fa57dd2d7d31 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 14 Oct 2009 09:24:59 -0400 Subject: Btrfs: properly wait log writers during log sync A recently fsync optimization make btrfs_sync_log skip calling wait_for_writer in the single log writer case. This is incorrect since the writer count can also be increased by btrfs_pin_log. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4aff766d171..f51bf13125c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1995,12 +1995,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); - while (root->log_multiple_pids) { + while (1) { unsigned long batch = root->log_batch; - mutex_unlock(&root->log_mutex); - schedule_timeout_uninterruptible(1); - mutex_lock(&root->log_mutex); - + if (root->log_multiple_pids) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } wait_for_writer(trans, root); if (batch == root->log_batch) break; -- cgit v1.2.3 From e244a0aeb6a599c19a7c802cda6e2d67c847b154 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Oct 2009 09:24:59 -0400 Subject: Btrfs: add -o discard option Enable discard by default is not a good idea given the the trim speed of SSD prototypes we've seen, and the carecteristics for many high-end arrays. Turn of discards by default and require the -o discard option to enable them on. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 3 +++ fs/btrfs/super.c | 7 ++++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dfd7e6fc66d..e5dd628a526 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1153,6 +1153,7 @@ struct btrfs_root { #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4aedbff36b8..bf7782fc593 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1584,6 +1584,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; + if (!btrfs_test_opt(root, DISCARD)) + return 0; + /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, bytenr, &map_length, &multi, 0); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3fbbf0761ac..939b68f0612 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,7 +66,8 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, + Opt_discard, Opt_err, }; static match_table_t tokens = { @@ -88,6 +89,7 @@ static match_table_t tokens = { {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, + {Opt_discard, "discard"}, {Opt_err, NULL}, }; @@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->metadata_ratio); } break; + case Opt_discard: + btrfs_set_opt(info->mount_opt, DISCARD); + break; default: break; } -- cgit v1.2.3 From 0634857488ec6e28fa22920cd0bee3c2ac07ccfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Oct 2009 09:24:59 -0400 Subject: Btrfs: enable discard support The discard support code in btrfs currently is guarded by ifdefs for BIO_RW_DISCARD, which is never defines as it's the name of an enum memeber. Just remove the useless ifdefs to actually enable the code. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bf7782fc593..d4a35283d99 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1568,18 +1568,15 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#ifdef BIO_RW_DISCARD static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); } -#endif static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { -#ifdef BIO_RW_DISCARD int ret; u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; @@ -1606,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } return ret; -#else - return 0; -#endif } int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From 444528b3e614f7f2391488d9bca8e0b872db909b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 14 Oct 2009 09:38:28 -0400 Subject: Btrfs: always pin metadata in discard mode We have an optimization in btrfs to allow blocks to be immediately freed if they were allocated in this transaction and never written. Otherwise they are pinned and freed when the transaction commits. This isn't optimal for discard mode because immediately freeing them means immediately discarding them. It is better to give the block to the pinning code and letting the (slow) discard happen later. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d4a35283d99..c56f91639dc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3686,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, if (is_data) goto pinit; + /* + * discard is sloooow, and so triggering discards on + * individual btree blocks isn't a good plan. Just + * pin everything in discard mode. + */ + if (btrfs_test_opt(root, DISCARD)) + goto pinit; + buf = btrfs_find_tree_block(root, bytenr, num_bytes); if (!buf) goto pinit; -- cgit v1.2.3