-rw-r--r--  fs/btrfs/btrfs_inode.h  |  18
-rw-r--r--  fs/btrfs/ctree.h        |  35
-rw-r--r--  fs/btrfs/disk-io.c      |   2
-rw-r--r--  fs/btrfs/file.c         |  26
-rw-r--r--  fs/btrfs/inode.c        |  81
-rw-r--r--  fs/btrfs/ordered-data.c | 118
-rw-r--r--  fs/btrfs/ordered-data.h |   4
-rw-r--r--  fs/btrfs/transaction.c  |  11
8 files changed, 288 insertions(+), 7 deletions(-)
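Everything in this diffstat serves one rule: before a transaction commit is allowed to finish, any file that was recently the target of a rename or a truncate-to-zero must have its data on disk, which is essentially ext3's data=ordered behavior applied to those two operations. Below is a minimal userspace sketch of that rule, not btrfs code; every name in it (toy_fs, toy_inode, toy_commit, toy_flush) is hypothetical.

#include <stdio.h>

struct toy_inode {
	const char *name;
	int dirty;			/* data not yet on disk */
	struct toy_inode *next;		/* link in the ordered list */
};

struct toy_fs {
	struct toy_inode *ordered;	/* flush these before commit */
};

/* stands in for filemap_flush()/btrfs_wait_ordered_range() */
static void toy_flush(struct toy_inode *inode)
{
	printf("flushing %s\n", inode->name);
	inode->dirty = 0;
}

/* a commit may not finish until the ordered list is drained */
static void toy_commit(struct toy_fs *fs)
{
	while (fs->ordered) {
		struct toy_inode *inode = fs->ordered;

		fs->ordered = inode->next;
		if (inode->dirty)
			toy_flush(inode);
	}
	printf("commit done\n");
}

int main(void)
{
	struct toy_inode renamed = { "replacement-file", 1, NULL };
	struct toy_fs fs = { &renamed };

	toy_commit(&fs);	/* data hits disk before the commit ends */
	return 0;
}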
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3af4cfb5654..b30986f00b9 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
 	 */
 	struct list_head delalloc_inodes;
 
+	/*
+	 * list for tracking inodes that must be sent to disk before a
+	 * rename or truncate commit
+	 */
+	struct list_head ordered_operations;
+
 	/* the space_info for where this inode's data allocations are done */
 	struct btrfs_space_info *space_info;
 
@@ -122,6 +128,18 @@ struct btrfs_inode {
 	 */
 	u64 last_unlink_trans;
 
+	/*
+	 * ordered_data_close is set by truncate when a file that used
+	 * to have good data has been truncated to zero. When it is set
+	 * the btrfs file release call will add this inode to the
+	 * ordered operations list so that we make sure to flush out any
+	 * new data the application may have written before commit.
+	 *
+	 * yes, it's silly to have a single bitflag, but we might grow more
+	 * of these.
+	 */
+	unsigned ordered_data_close:1;
+
 	struct inode vfs_inode;
 };
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2737facbd34..f48905ee524 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAX_LEVEL 8
 
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list. That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
+
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 
@@ -727,6 +734,15 @@ struct btrfs_fs_info {
 	struct mutex volume_mutex;
 	struct mutex tree_reloc_mutex;
 
+	/*
+	 * this protects the ordered operations list only while we are
+	 * processing all of the entries on it. This way we make
+	 * sure the commit code doesn't find the list temporarily empty
+	 * because another function happens to be doing a non-waiting preflush
+	 * before jumping into the main commit.
+	 */
+	struct mutex ordered_operations_mutex;
+
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -741,10 +757,29 @@ struct btrfs_fs_info {
 	 * ordered extents
 	 */
 	spinlock_t ordered_extent_lock;
+
+	/*
+	 * all of the data=ordered extents pending writeback;
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
 	struct list_head ordered_extents;
+
+	/*
+	 * all of the inodes that have delalloc bytes. It is possible for
+	 * this list to be empty even when there are still dirty data=ordered
+	 * extents waiting to finish IO.
+	 */
 	struct list_head delalloc_inodes;
 
 	/*
+	 * special rename and truncate targets that must be on disk before
+	 * we're allowed to commit. This is basically the ext3 style
+	 * data=ordered list.
+	 */
+	struct list_head ordered_operations;
+
+	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads. This is because readers
 	 * can run with FS locks held, and the writers may be waiting for
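Both new fields lean on the kernel's intrusive list idiom: the list_head embedded in btrfs_inode is the very object that gets linked onto fs_info->ordered_operations, and a node that points at itself reads as "not on any list", so list_empty() on the entry doubles as a cheap membership test. A compact userspace sketch of that idiom, simplified from <linux/list.h> with locking omitted:

#include <stdbool.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h->prev = h;
}

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	INIT_LIST_HEAD(n);	/* self-pointing again: off-list */
}

static bool list_empty(const struct list_head *h)
{
	return h->next == h;
}

int main(void)
{
	struct list_head ordered_operations, inode_entry;

	INIT_LIST_HEAD(&ordered_operations);
	INIT_LIST_HEAD(&inode_entry);

	/* mirrors btrfs_add_ordered_operation(): add only once */
	if (list_empty(&inode_entry))
		list_add_tail(&inode_entry, &ordered_operations);

	printf("on list: %d\n", !list_empty(&inode_entry));	/* 1 */
	list_del_init(&inode_entry);
	printf("on list: %d\n", !list_empty(&inode_entry));	/* 0 */
	return 0;
}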
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9244cd7313d..1747dfd1865 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1572,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+	INIT_LIST_HEAD(&fs_info->ordered_operations);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
@@ -1643,6 +1644,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	insert_inode_hash(fs_info->btree_inode);
 
 	mutex_init(&fs_info->trans_mutex);
+	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
 	mutex_init(&fs_info->pinned_mutex);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 32d10a61761..9c9fb46ccd0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1161,6 +1161,20 @@ out_nolock:
 		page_cache_release(pinned[1]);
 	*ppos = pos;
 
+	/*
+	 * we want to make sure fsync finds this change,
+	 * but we haven't joined a transaction running right now.
+	 *
+	 * Later on, someone is sure to update the inode and get the
+	 * real transid recorded.
+	 *
+	 * We set last_trans now to the fs_info generation + 1;
+	 * this will either be one more than the running transaction
+	 * or the generation used for the next transaction if there isn't
+	 * one running right now.
+	 */
+	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
 	if (num_written > 0 && will_write) {
 		struct btrfs_trans_handle *trans;
 
@@ -1194,6 +1208,18 @@ out_nolock:
 
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
+	/*
+	 * ordered_data_close is set by setattr when we are about to truncate
+	 * a file from a non-zero size to a zero size. This tries to
+	 * flush down new bytes that may have been written if the
+	 * application were using truncate to replace a file in place.
+	 */
+	if (BTRFS_I(inode)->ordered_data_close) {
+		BTRFS_I(inode)->ordered_data_close = 0;
+		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+			filemap_flush(inode->i_mapping);
+	}
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
 	return 0;
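The last_trans stamp in the write path above is what lets fsync decide whether a commit is needed at all: if the inode's last modification already belongs to a committed transaction, fsync can return early. Stamping generation + 1 is safe whether or not a transaction is currently running, as the comment explains. A toy model of that check, using simplified stand-in types rather than the real btrfs structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_fs {
	uint64_t generation;		/* current transid counter */
	uint64_t last_trans_committed;	/* last transid fully on disk */
};

struct toy_inode {
	uint64_t last_trans;		/* transid of last modification */
};

/* the write path: no transaction joined, but fsync must see a change */
static void toy_file_write(struct toy_fs *fs, struct toy_inode *inode)
{
	inode->last_trans = fs->generation + 1;
}

/* fsync's fast path: nothing to do if the change already committed */
static bool toy_fsync_can_skip(struct toy_fs *fs, struct toy_inode *inode)
{
	return inode->last_trans <= fs->last_trans_committed;
}

int main(void)
{
	struct toy_fs fs = { .generation = 10, .last_trans_committed = 10 };
	struct toy_inode inode = { .last_trans = 9 };

	printf("before write, skip fsync: %d\n", toy_fsync_can_skip(&fs, &inode));
	toy_file_write(&fs, &inode);	/* last_trans = 11 > 10 */
	printf("after write, skip fsync: %d\n", toy_fsync_can_skip(&fs, &inode));
	return 0;
}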
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bffd79faffb..1cff528d5b5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err)
 		return err;
 
-	if (S_ISREG(inode->i_mode) &&
-	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
-		err = btrfs_cont_expand(inode, attr->ia_size);
-		if (err)
-			return err;
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+		if (attr->ia_size > inode->i_size) {
+			err = btrfs_cont_expand(inode, attr->ia_size);
+			if (err)
+				return err;
+		} else if (inode->i_size > 0 &&
+			   attr->ia_size == 0) {
+
+			/* we're truncating a file that used to have good
+			 * data down to zero. Make sure it gets into
+			 * the ordered flush list so that any new writes
+			 * get down to disk quickly.
+			 */
+			BTRFS_I(inode)->ordered_data_close = 1;
+		}
 	}
 
 	err = inode_setattr(inode, attr);
@@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -4419,6 +4430,8 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
+
+	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
@@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 
 	trans = btrfs_start_transaction(root, 1);
+
+	/*
+	 * setattr is responsible for setting the ordered_data_close flag,
+	 * but that is only tested during the last file release. That
+	 * could happen well after the next commit, leaving a great big
+	 * window where new writes may get lost if someone chooses to write
+	 * to this file after truncating to zero.
+	 *
+	 * The inode doesn't have any dirty data here, and so if we commit
+	 * this is a noop. If someone immediately starts writing to the inode
+	 * it is very likely we'll catch some of their writes in this
+	 * transaction, and the commit will find this file on the ordered
+	 * data list with good things to send down.
+	 *
+	 * This is a best effort solution; there is still a window where
+	 * using truncate to replace the contents of the file will
+	 * end up with a zero length file after a crash.
+	 */
+	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+		btrfs_add_ordered_operation(trans, root, inode);
+
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_i_size_write(inode, inode->i_size);
 
@@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->i_acl = BTRFS_ACL_NOT_CACHED;
 	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
 	INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->ordered_operations);
 	return &ei->vfs_inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
 {
 	struct btrfs_ordered_extent *ordered;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
@@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode)
 	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
 		posix_acl_release(BTRFS_I(inode)->i_default_acl);
 
-	spin_lock(&BTRFS_I(inode)->root->list_lock);
+	/*
+	 * Make sure we're properly removed from the ordered operation
+	 * lists.
+	 */
+	smp_mb();
+	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+		spin_lock(&root->fs_info->ordered_extent_lock);
+		list_del_init(&BTRFS_I(inode)->ordered_operations);
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+	}
+
+	spin_lock(&root->list_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
 		       " list\n", inode->i_ino);
 		dump_stack();
 	}
-	spin_unlock(&BTRFS_I(inode)->root->list_lock);
+	spin_unlock(&root->list_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4667,9 +4715,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (ret)
 		goto out_unlock;
 
+	/*
+	 * we're using rename to replace one file with another, and if
+	 * the replacement file is large, start IO on it now so
+	 * we don't add too much work to the end of the transaction
+	 */
+	if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+	    new_inode->i_size &&
+	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+		filemap_flush(old_inode->i_mapping);
+
 	trans = btrfs_start_transaction(root, 1);
 
 	/*
+	 * make sure the inode gets flushed if it is replacing
+	 * something.
+	 */
+	if (new_inode && new_inode->i_size &&
+	    old_inode && S_ISREG(old_inode->i_mode)) {
+		btrfs_add_ordered_operation(trans, root, old_inode);
+	}
+
+	/*
 	 * this is an ugly little race, but the rename is required to make
 	 * sure that if we crash, the inode is either at the old name
 	 * or the new one. pinning the log transaction lets us make sure
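The setattr and btrfs_truncate hunks exist because applications commonly replace a file in place by truncating it to zero and rewriting it; as the comments note, the flushing here only narrows the crash window rather than closing it. The durable userspace pattern remains write-to-temp, fsync, then rename, sketched below (the file name and helper are illustrative, not part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int replace_file_safely(const char *path, const char *data)
{
	char tmp[4096];
	ssize_t len = (ssize_t)strlen(data);
	int fd;

	snprintf(tmp, sizeof(tmp), "%s.tmp", path);
	fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return -1;
	/* the new data must be durable before it replaces the old name */
	if (write(fd, data, len) != len || fsync(fd) != 0) {
		close(fd);
		unlink(tmp);
		return -1;
	}
	close(fd);
	/* atomic swap: a crash leaves either the old or the new contents */
	return rename(tmp, path);
}

int main(void)
{
	return replace_file_safely("config.txt", "new contents\n") ? 1 : 0;
}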
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0..53c87b197d7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 
 	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
+
+	/*
+	 * we have no more ordered extents for this inode and
+	 * no dirty pages. We can safely remove it from the
+	 * list of ordered extents
+	 */
+	if (RB_EMPTY_ROOT(&tree->tree) &&
+	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+		list_del_init(&BTRFS_I(inode)->ordered_operations);
+	}
 	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 
 	mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 }
 
 /*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list. These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the IO. When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct inode *inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+	list_splice_init(&root->fs_info->ordered_operations, &splice);
+
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 ordered_operations);
+
+		inode = &btrfs_inode->vfs_inode;
+
+		list_del_init(&btrfs_inode->ordered_operations);
+
+		/*
+		 * the inode may be getting freed (in sys_unlink path).
+		 */
+		inode = igrab(inode);
+
+		if (!wait && inode) {
+			list_add_tail(&BTRFS_I(inode)->ordered_operations,
+				      &root->fs_info->ordered_operations);
+		}
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+
+		if (inode) {
+			if (wait)
+				btrfs_wait_ordered_range(inode, 0, (u64)-1);
+			else
+				filemap_flush(inode->i_mapping);
+			iput(inode);
+		}
+
+		cond_resched();
+		spin_lock(&root->fs_info->ordered_extent_lock);
+	}
+	if (wait && !list_empty(&root->fs_info->ordered_operations))
+		goto again;
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return 0;
+}
+
+/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 
 	return ret;
 }
+
+/*
+ * add a given inode to the list of inodes that must be fully on
+ * disk before a transaction commit finishes.
+ *
+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
+ * used to make sure renamed files are fully on disk.
+ *
+ * It is a noop if the inode is already fully on disk.
+ *
+ * If trans is not null, we'll do a friendly check for a transaction that
+ * is already flushing things and force the IO down ourselves.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *inode)
+{
+	u64 last_mod;
+
+	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
+
+	/*
+	 * if this file hasn't been changed since the last transaction
+	 * commit, we can safely return without doing anything
+	 */
+	if (last_mod < root->fs_info->last_trans_committed)
+		return 0;
+
+	/*
+	 * the transaction is already committing. Just start the IO and
+	 * don't bother with all of this list nonsense
+	 */
+	if (trans && root->fs_info->running_transaction->blocked) {
+		btrfs_wait_ordered_range(inode, 0, (u64)-1);
+		return 0;
+	}
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
+		list_add_tail(&BTRFS_I(inode)->ordered_operations,
+			      &root->fs_info->ordered_operations);
+	}
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+
+	return 0;
+}
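btrfs_run_ordered_operations() shows a classic splice-and-drain shape: take the lock, move the whole shared list onto a private head with list_splice_init(), then drop the lock around each potentially blocking flush so writers aren't stalled. A userspace sketch of that shape, with a pthread mutex standing in for the spinlock and a simplified singly linked list (all names hypothetical):

#include <pthread.h>
#include <stdio.h>

struct node {
	const char *name;
	struct node *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_list;	/* protected by lock */

static void flush_one(struct node *n)
{
	printf("flushing %s\n", n->name);	/* may block in real code */
}

static void run_ordered_operations(void)
{
	struct node *splice;

	pthread_mutex_lock(&lock);
	splice = shared_list;		/* take the whole list privately */
	shared_list = NULL;
	pthread_mutex_unlock(&lock);

	while (splice) {
		struct node *n = splice;

		splice = n->next;
		flush_one(n);		/* lock not held across the flush */
	}
}

int main(void)
{
	struct node b = { "file-b", NULL };
	struct node a = { "file-a", &b };

	shared_list = &a;
	run_ordered_operations();
	return 0;
}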
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d..3d31c8827b0 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 			   loff_t end, int sync_mode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *inode);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9c8f158dd2d..664782c6a2d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -975,6 +975,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	int should_grow = 0;
 	unsigned long now = get_seconds();
 
+	btrfs_run_ordered_operations(root, 0);
+
 	/* make a pass through all the delayed refs we have so far
 	 * any running procs may add more while we are here
 	 */
@@ -1056,6 +1058,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 
+		/*
+		 * rename doesn't use btrfs_join_transaction, so once we
+		 * set the transaction to blocked above, we aren't going
+		 * to get any new ordered operations. We can safely run
+		 * it here and know for sure that nothing new will be added
+		 * to the list
+		 */
+		btrfs_run_ordered_operations(root, 1);
+
 		smp_mb();
 		if (cur_trans->num_writers > 1 || should_grow)
 			schedule_timeout(timeout);
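The two calls into btrfs_run_ordered_operations() make the commit a two-phase flush: an early non-waiting pass starts IO while writers may still be adding entries, and a waiting pass runs once the transaction is blocked and the list can no longer grow. A compressed sketch of that ordering, with hypothetical helpers and no real transaction state:

#include <stdbool.h>
#include <stdio.h>

static void run_ordered_operations(bool wait)
{
	printf(wait ? "waiting for all ordered IO\n"
		    : "starting ordered IO asynchronously\n");
}

static void commit_transaction(void)
{
	run_ordered_operations(false);	/* phase 1: start IO early */

	printf("transaction blocked, no new joiners\n");

	run_ordered_operations(true);	/* phase 2: list is stable, wait */

	printf("writing super block\n");
}

int main(void)
{
	commit_transaction();
	return 0;
}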