aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/bio.c5
-rw-r--r--fs/btrfs/ctree.c58
-rw-r--r--fs/btrfs/ctree.h11
-rw-r--r--fs/btrfs/disk-io.c46
-rw-r--r--fs/btrfs/disk-io.h10
-rw-r--r--fs/btrfs/extent-tree.c83
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/btrfs/inode-map.c1
-rw-r--r--fs/btrfs/inode.c4
-rw-r--r--fs/btrfs/locking.c11
-rw-r--r--fs/btrfs/super.c5
-rw-r--r--fs/btrfs/transaction.c2
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c6
-rw-r--r--fs/buffer.c3
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/inode.c27
-rw-r--r--fs/ext4/mballoc.c32
-rw-r--r--fs/ext4/migrate.c8
-rw-r--r--fs/ext4/super.c11
-rw-r--r--fs/jbd2/journal.c17
-rw-r--r--fs/jbd2/transaction.c42
-rw-r--r--fs/namespace.c6
-rw-r--r--fs/notify/inotify/inotify.c2
-rw-r--r--fs/ocfs2/journal.h6
-rw-r--r--fs/seq_file.c36
-rw-r--r--fs/super.c17
-rw-r--r--fs/timerfd.c12
29 files changed, 322 insertions, 147 deletions
diff --git a/fs/bio.c b/fs/bio.c
index 062299acbcc..72ab251cdb9 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -302,9 +302,10 @@ void bio_init(struct bio *bio)
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
struct bio *bio = NULL;
+ void *p;
if (bs) {
- void *p = mempool_alloc(bs->bio_pool, gfp_mask);
+ p = mempool_alloc(bs->bio_pool, gfp_mask);
if (p)
bio = p + bs->front_pad;
@@ -329,7 +330,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
}
if (unlikely(!bvl)) {
if (bs)
- mempool_free(bio, bs->bio_pool);
+ mempool_free(p, bs->bio_pool);
else
kfree(bio);
bio = NULL;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35443cc4b9a..42491d728e9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
-inline void btrfs_init_path(struct btrfs_path *p)
-{
- memset(p, 0, sizeof(*p));
-}
-
struct btrfs_path *btrfs_alloc_path(void)
{
struct btrfs_path *path;
- path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
- if (path) {
- btrfs_init_path(path);
+ path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+ if (path)
path->reada = 1;
- }
return path;
}
@@ -69,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
/*
* reset all the locked nodes in the patch to spinning locks.
+ *
+ * held is used to keep lockdep happy. When lockdep is enabled
+ * we set held to a blocking lock before we go around and
+ * retake all the spinlocks in the path. You can safely use NULL
+ * for held.
*/
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
+ struct extent_buffer *held)
{
int i;
- for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /* lockdep really cares that we take all of these spinlocks
+ * in the right order. If any of the locks in the path are not
+ * currently blocking, it is going to complain. So, make really
+ * really sure by forcing the path to blocking before we clear
+ * the path blocking.
+ */
+ if (held)
+ btrfs_set_lock_blocking(held);
+ btrfs_set_path_blocking(p);
+#endif
+
+ for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
if (p->nodes[i] && p->locks[i])
btrfs_clear_lock_blocking(p->nodes[i]);
}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (held)
+ btrfs_clear_lock_blocking(held);
+#endif
}
/* this also releases the path */
@@ -286,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid, level, &ins);
BUG_ON(ret);
cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
- buf->len);
+ buf->len, level);
} else {
cow = btrfs_alloc_free_block(trans, root, buf->len,
parent_start,
@@ -917,9 +934,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* promote the child to a root */
child = read_node_slot(root, mid, 0);
+ BUG_ON(!child);
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
- BUG_ON(!child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
BUG_ON(ret);
@@ -1566,7 +1583,7 @@ cow_done:
if (!p->skip_locking)
p->locks[level] = 1;
- btrfs_clear_path_blocking(p);
+ btrfs_clear_path_blocking(p, NULL);
/*
* we have a lock on b and as long as we aren't changing
@@ -1605,7 +1622,7 @@ cow_done:
btrfs_set_path_blocking(p);
sret = split_node(trans, root, p, level);
- btrfs_clear_path_blocking(p);
+ btrfs_clear_path_blocking(p, NULL);
BUG_ON(sret > 0);
if (sret) {
@@ -1625,7 +1642,7 @@ cow_done:
btrfs_set_path_blocking(p);
sret = balance_level(trans, root, p, level);
- btrfs_clear_path_blocking(p);
+ btrfs_clear_path_blocking(p, NULL);
if (sret) {
ret = sret;
@@ -1688,13 +1705,13 @@ cow_done:
if (!p->skip_locking) {
int lret;
- btrfs_clear_path_blocking(p);
+ btrfs_clear_path_blocking(p, NULL);
lret = btrfs_try_spin_lock(b);
if (!lret) {
btrfs_set_path_blocking(p);
btrfs_tree_lock(b);
- btrfs_clear_path_blocking(p);
+ btrfs_clear_path_blocking(p, b);
}
}
} else {
@@ -1706,7 +1723,7 @@ cow_done:
btrfs_set_path_blocking(p);
sret = split_leaf(trans, root, key,
p, ins_len, ret == 0);
- btrfs_clear_path_blocking(p);
+ btrfs_clear_path_blocking(p, NULL);
BUG_ON(sret > 0);
if (sret) {
@@ -3926,7 +3943,6 @@ find_next_key:
btrfs_release_path(root, path);
goto again;
} else {
- btrfs_clear_path_blocking(path);
goto out;
}
}
@@ -3946,7 +3962,7 @@ find_next_key:
path->locks[level - 1] = 1;
path->nodes[level - 1] = cur;
unlock_up(path, level, 1);
- btrfs_clear_path_blocking(path);
+ btrfs_clear_path_blocking(path, NULL);
}
out:
if (ret == 0)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 531db112c8b..766b31ae318 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,7 @@ struct btrfs_ordered_sum;
#define BTRFS_ACL_NOT_CACHED ((void *)-1)
-#ifdef CONFIG_LOCKDEP
-# define BTRFS_MAX_LEVEL 7
-#else
-# define BTRFS_MAX_LEVEL 8
-#endif
+#define BTRFS_MAX_LEVEL 8
/* holds pointers to all of the tree roots */
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -1715,7 +1711,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
u64 empty_size);
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u32 blocksize);
+ u64 bytenr, u32 blocksize,
+ int level);
int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1834,9 +1831,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
-void btrfs_init_path(struct btrfs_path *p);
void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p);
void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5aebddd7119..adda739a021 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -75,6 +75,40 @@ struct async_submit_bio {
struct btrfs_work work;
};
+/* These are used to set the lockdep class on the extent buffer locks.
+ * The class is set by the readpage_end_io_hook after the buffer has
+ * passed csum validation but before the pages are unlocked.
+ *
+ * The lockdep class is also set by btrfs_init_new_buffer on freshly
+ * allocated blocks.
+ *
+ * The class is based on the level in the tree block, which allows lockdep
+ * to know that lower nodes nest inside the locks of higher nodes.
+ *
+ * We also add a check to make sure the highest level of the tree is
+ * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
+ * code needs to be updated as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# if BTRFS_MAX_LEVEL != 8
+# error
+# endif
+static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
+static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
+ /* leaf */
+ "btrfs-extent-00",
+ "btrfs-extent-01",
+ "btrfs-extent-02",
+ "btrfs-extent-03",
+ "btrfs-extent-04",
+ "btrfs-extent-05",
+ "btrfs-extent-06",
+ "btrfs-extent-07",
+ /* highest possible level */
+ "btrfs-extent-08",
+};
+#endif
+
/*
* extents on the btree inode are pretty simple, there's one extent
* that covers the entire device
@@ -347,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
return ret;
}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
+{
+ lockdep_set_class_and_name(&eb->lock,
+ &btrfs_eb_class[level],
+ btrfs_eb_name[level]);
+}
+#endif
+
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state)
{
@@ -392,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
}
found_level = btrfs_header_level(eb);
+ btrfs_set_buffer_lockdep_class(eb, found_level);
+
ret = csum_tree_block(root, eb, 1);
if (ret)
ret = -EIO;
@@ -1777,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_DEV_TREE_OBJECTID, dev_root);
dev_root->track_dirty = 1;
-
if (ret)
goto fail_extent_root;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 494a56eb298..95029db227b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -101,4 +101,14 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btree_lock_page_hook(struct page *page);
+
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+#else
+static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
+ int level)
+{
+}
+#endif
#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7527523c2d2..0a5d796c9f7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1323,8 +1323,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- finish_current_insert(trans, root->fs_info->extent_root, 1);
- del_pending_extents(trans, root->fs_info->extent_root, 1);
+ u64 start;
+ u64 end;
+ int ret;
+
+ while(1) {
+ finish_current_insert(trans, root->fs_info->extent_root, 1);
+ del_pending_extents(trans, root->fs_info->extent_root, 1);
+
+ /* is there more work to do? */
+ ret = find_first_extent_bit(&root->fs_info->pending_del,
+ 0, &start, &end, EXTENT_WRITEBACK);
+ if (!ret)
+ continue;
+ ret = find_first_extent_bit(&root->fs_info->extent_ins,
+ 0, &start, &end, EXTENT_WRITEBACK);
+ if (!ret)
+ continue;
+ break;
+ }
return 0;
}
@@ -2211,13 +2228,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
u64 end;
u64 priv;
u64 search = 0;
- u64 skipped = 0;
struct btrfs_fs_info *info = extent_root->fs_info;
struct btrfs_path *path;
struct pending_extent_op *extent_op, *tmp;
struct list_head insert_list, update_list;
int ret;
- int num_inserts = 0, max_inserts;
+ int num_inserts = 0, max_inserts, restart = 0;
path = btrfs_alloc_path();
INIT_LIST_HEAD(&insert_list);
@@ -2233,19 +2249,19 @@ again:
ret = find_first_extent_bit(&info->extent_ins, search, &start,
&end, EXTENT_WRITEBACK);
if (ret) {
- if (skipped && all && !num_inserts &&
+ if (restart && !num_inserts &&
list_empty(&update_list)) {
- skipped = 0;
+ restart = 0;
search = 0;
continue;
}
- mutex_unlock(&info->extent_ins_mutex);
break;
}
ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
if (!ret) {
- skipped = 1;
+ if (all)
+ restart = 1;
search = end + 1;
if (need_resched()) {
mutex_unlock(&info->extent_ins_mutex);
@@ -2264,7 +2280,7 @@ again:
list_add_tail(&extent_op->list, &insert_list);
search = end + 1;
if (num_inserts == max_inserts) {
- mutex_unlock(&info->extent_ins_mutex);
+ restart = 1;
break;
}
} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2280,7 +2296,6 @@ again:
* somebody marked this thing for deletion then just unlock it and be
* done, the free_extents will handle it
*/
- mutex_lock(&info->extent_ins_mutex);
list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
clear_extent_bits(&info->extent_ins, extent_op->bytenr,
extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2302,6 +2317,10 @@ again:
if (!list_empty(&update_list)) {
ret = update_backrefs(trans, extent_root, path, &update_list);
BUG_ON(ret);
+
+ /* we may have COW'ed new blocks, so let's start over */
+ if (all)
+ restart = 1;
}
/*
@@ -2309,9 +2328,9 @@ again:
* need to make sure everything is cleaned then reset everything and
* go back to the beginning
*/
- if (!num_inserts && all && skipped) {
+ if (!num_inserts && restart) {
search = 0;
- skipped = 0;
+ restart = 0;
INIT_LIST_HEAD(&update_list);
INIT_LIST_HEAD(&insert_list);
goto again;
@@ -2368,27 +2387,19 @@ again:
BUG_ON(ret);
/*
- * if we broke out of the loop in order to insert stuff because we hit
- * the maximum number of inserts at a time we can handle, then loop
- * back and pick up where we left off
+ * if restart is set for whatever reason we need to go back and start
+ * searching through the pending list again.
+ *
+ * We just inserted some extents, which could have resulted in new
+ * blocks being allocated, which would result in new blocks needing
+ * updates, so if all is set we _must_ restart to get the updated
+ * blocks.
*/
- if (num_inserts == max_inserts) {
- INIT_LIST_HEAD(&insert_list);
- INIT_LIST_HEAD(&update_list);
- num_inserts = 0;
- goto again;
- }
-
- /*
- * again, if we need to make absolutely sure there are no more pending
- * extent operations left and we know that we skipped some, go back to
- * the beginning and do it all again
- */
- if (all && skipped) {
+ if (restart || all) {
INIT_LIST_HEAD(&insert_list);
INIT_LIST_HEAD(&update_list);
search = 0;
- skipped = 0;
+ restart = 0;
num_inserts = 0;
goto again;
}
@@ -2709,6 +2720,8 @@ again:
goto again;
}
+ if (!err)
+ finish_current_insert(trans, extent_root, 0);
return err;
}
@@ -2859,7 +2872,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
if (data & BTRFS_BLOCK_GROUP_METADATA) {
last_ptr = &root->fs_info->last_alloc;
- empty_cluster = 64 * 1024;
+ if (!btrfs_test_opt(root, SSD))
+ empty_cluster = 64 * 1024;
}
if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3402,7 +3416,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u32 blocksize)
+ u64 bytenr, u32 blocksize,
+ int level)
{
struct extent_buffer *buf;
@@ -3410,6 +3425,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
if (!buf)
return ERR_PTR(-ENOMEM);
btrfs_set_header_generation(buf, trans->transid);
+ btrfs_set_buffer_lockdep_class(buf, level);
btrfs_tree_lock(buf);
clean_tree_block(trans, root, buf);
@@ -3453,7 +3469,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
}
- buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
+ buf = btrfs_init_new_buffer(trans, root, ins.objectid,
+ blocksize, level);
return buf;
}
@@ -5641,7 +5658,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
prev_block = block_start;
}
+ mutex_lock(&extent_root->fs_info->trans_mutex);
btrfs_record_root_in_trans(found_root);
+ mutex_unlock(&extent_root->fs_info->trans_mutex);
if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
/*
* try to update data extent references while
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 37d43b516b7..ebe6b29e606 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -415,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
if (node) {
- struct extent_state *found;
- found = rb_entry(node, struct extent_state, rb_node);
free_extent_state(prealloc);
return -EEXIST;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa79873eb4..cc7334d833c 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
search_key.type = 0;
search_key.offset = 0;
- btrfs_init_path(path);
start_found = 0;
ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8f0706210a4..3cee77ae03c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2531,8 +2531,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
key.offset = (u64)-1;
key.type = (u8)-1;
- btrfs_init_path(path);
-
search_again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -4263,7 +4261,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
if (PageWriteback(page) || PageDirty(page))
return 0;
- return __btrfs_releasepage(page, gfp_flags);
+ return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
}
static void btrfs_invalidatepage(struct page *page, unsigned long offset)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9ebe9385129..85506c4a3af 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,21 +25,10 @@
#include "extent_io.h"
#include "locking.h"
-/*
- * btrfs_header_level() isn't free, so don't call it when lockdep isn't
- * on
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void spin_nested(struct extent_buffer *eb)
-{
- spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
-}
-#else
static inline void spin_nested(struct extent_buffer *eb)
{
spin_lock(&eb->lock);
}
-#endif
/*
* Setting a lock to blocking will drop the spinlock and set the
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f3fd7e2cbc3..19a4daf03cc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -379,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
btrfs_start_delalloc_inodes(root);
btrfs_wait_ordered_extents(root, 0);
- btrfs_clean_old_snapshots(root);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
sb->s_dirt = 0;
@@ -511,6 +510,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
struct btrfs_root *root = btrfs_sb(sb);
int ret;
+ ret = btrfs_parse_options(root, data);
+ if (ret)
+ return -EINVAL;
+
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 919172de5c9..4112d53d4f4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
num_bytes -= btrfs_root_used(&dirty->root->root_item);
bytes_used = btrfs_root_used(&root->root_item);
if (num_bytes) {
+ mutex_lock(&root->fs_info->trans_mutex);
btrfs_record_root_in_trans(root);
+ mutex_unlock(&root->fs_info->trans_mutex);
btrfs_set_root_used(&root->root_item,
bytes_used - num_bytes);
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 20794290256..9c462fbd60f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2832,7 +2832,9 @@ again:
BUG_ON(!wc.replay_dest);
wc.replay_dest->log_root = log;
+ mutex_lock(&fs_info->trans_mutex);
btrfs_record_root_in_trans(wc.replay_dest);
+ mutex_unlock(&fs_info->trans_mutex);
ret = walk_log_tree(trans, log, &wc);
BUG_ON(ret);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bcd14ebccae..1316139bf9e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2894,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
free_extent_map(em);
}
- map = kzalloc(sizeof(*map), GFP_NOFS);
- if (!map)
- return -ENOMEM;
-
em = alloc_extent_map(GFP_NOFS);
if (!em)
return -ENOMEM;
@@ -3106,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
if (!sb)
return -ENOMEM;
btrfs_set_buffer_uptodate(sb);
+ btrfs_set_buffer_lockdep_class(sb, 0);
+
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
diff --git a/fs/buffer.c b/fs/buffer.c
index 665d446b25b..9f697419ed8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page,
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_bdi_stat(mapping->backing_dev_info,
BDI_RECLAIMABLE);
+ task_dirty_inc(current);
task_io_account_write(PAGE_CACHE_SIZE);
}
radix_tree_tag_set(&mapping->page_tree,
@@ -3108,7 +3109,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
if (test_clear_buffer_dirty(bh)) {
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(WRITE_SYNC, bh);
+ ret = submit_bh(WRITE, bh);
wait_on_buffer(bh);
if (buffer_eopnotsupp(bh)) {
clear_buffer_eopnotsupp(bh);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 9c6d815dd19..39bd4d38e88 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1938,6 +1938,8 @@ ULONG_IOCTL(SET_BITMAP_FILE)
/* Big K */
COMPATIBLE_IOCTL(PIO_FONT)
COMPATIBLE_IOCTL(GIO_FONT)
+COMPATIBLE_IOCTL(PIO_CMAP)
+COMPATIBLE_IOCTL(GIO_CMAP)
ULONG_IOCTL(KDSIGACCEPT)
COMPATIBLE_IOCTL(KDGETKEYCODE)
COMPATIBLE_IOCTL(KDSETKEYCODE)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index aafc9eba1c2..b0c87dce66a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
{
unsigned len = le16_to_cpu(dlen);
- if (len == EXT4_MAX_REC_LEN)
+ if (len == EXT4_MAX_REC_LEN || len == 0)
return 1 << 16;
return len;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03ba20be132..cbd2ca99d11 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,8 +47,10 @@
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
- return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
- new_size);
+ return jbd2_journal_begin_ordered_truncate(
+ EXT4_SB(inode->i_sb)->s_journal,
+ &EXT4_I(inode)->jinode,
+ new_size);
}
static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -2437,6 +2439,7 @@ static int ext4_da_writepages(struct address_space *mapping,
int no_nrwrite_index_update;
int pages_written = 0;
long pages_skipped;
+ int range_cyclic, cycled = 1, io_done = 0;
int needed_blocks, ret = 0, nr_to_writebump = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
@@ -2488,9 +2491,15 @@ static int ext4_da_writepages(struct address_space *mapping,
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- if (wbc->range_cyclic)
+ range_cyclic = wbc->range_cyclic;
+ if (wbc->range_cyclic) {
index = mapping->writeback_index;
- else
+ if (index)
+ cycled = 0;
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = LLONG_MAX;
+ wbc->range_cyclic = 0;
+ } else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc;
@@ -2504,6 +2513,7 @@ static int ext4_da_writepages(struct address_space *mapping,
wbc->no_nrwrite_index_update = 1;
pages_skipped = wbc->pages_skipped;
+retry:
while (!ret && wbc->nr_to_write > 0) {
/*
@@ -2546,6 +2556,7 @@ static int ext4_da_writepages(struct address_space *mapping,
pages_written += mpd.pages_written;
wbc->pages_skipped = pages_skipped;
ret = 0;
+ io_done = 1;
} else if (wbc->nr_to_write)
/*
* There is no more writeout needed
@@ -2554,6 +2565,13 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
break;
}
+ if (!io_done && !cycled) {
+ cycled = 1;
+ index = 0;
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = mapping->writeback_index - 1;
+ goto retry;
+ }
if (pages_skipped != wbc->pages_skipped)
printk(KERN_EMERG "This should not happen leaving %s "
"with nr_to_write = %ld ret = %d\n",
@@ -2561,6 +2579,7 @@ static int ext4_da_writepages(struct address_space *mapping,
/* Update index */
index += pages_written;
+ wbc->range_cyclic = range_cyclic;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
* set the writeback_index so that range_cyclic
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index deba54f6cbe..4415beeb0b6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3693,6 +3693,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_free = pa->pa_len;
atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
+ INIT_LIST_HEAD(&pa->pa_inode_list);
+ INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_linear = 0;
@@ -3755,6 +3757,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
+ INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_linear = 1;
@@ -4476,23 +4479,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
- /*
- * We want to add the pa to the right bucket.
- * Remove it from the list and while adding
- * make sure the list to which we are adding
- * doesn't grow big.
- */
- if (likely(pa->pa_free)) {
- spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
- spin_unlock(pa->pa_obj_lock);
- ext4_mb_add_n_trim(ac);
- }
}
- ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->alloc_semp)
up_read(ac->alloc_semp);
+ if (pa) {
+ /*
+ * We want to add the pa to the right bucket.
+ * Remove it from the list and while adding
+ * make sure the list to which we are adding
+ * doesn't grow big. We need to release
+ * alloc_semp before calling ext4_mb_add_n_trim()
+ */
+ if (pa->pa_linear && likely(pa->pa_free)) {
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+ ext4_mb_add_n_trim(ac);
+ }
+ ext4_mb_put_pa(ac, ac->ac_sb, pa);
+ }
if (ac->ac_bitmap_page)
page_cache_release(ac->ac_bitmap_page);
if (ac->ac_buddy_page)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 734abca25e3..fe64d9f7985 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
+ 1);
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- goto err_out;
+ return retval;
}
tmp_inode = ext4_new_inode(handle,
inode->i_sb->s_root->d_inode,
@@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode)
if (IS_ERR(tmp_inode)) {
retval = -ENOMEM;
ext4_journal_stop(handle);
- tmp_inode = NULL;
- goto err_out;
+ return retval;
}
i_size_write(tmp_inode, i_size_read(inode));
/*
@@ -618,8 +617,7 @@ err_out:
ext4_journal_stop(handle);
- if (tmp_inode)
- iput(tmp_inode);
+ iput(tmp_inode);
return retval;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5f06a5f045..a5732c58f67 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb)
static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
+ tid_t target;
trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
sb->s_dirt = 0;
if (EXT4_SB(sb)->s_journal) {
- if (wait)
- ret = ext4_force_commit(sb);
- else
- jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+ if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
+ &target)) {
+ if (wait)
+ jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
+ target);
+ }
} else {
ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index eb343008ede..58144102bf2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal)
}
/*
- * Called under j_state_lock. Returns true if a transaction was started.
+ * Called under j_state_lock. Returns true if a transaction commit was started.
*/
int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
@@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
/*
* Start a commit of the current running transaction (if any). Returns true
- * if a transaction was started, and fills its tid in at *ptid
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
*/
int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
{
@@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
if (journal->j_running_transaction) {
tid_t tid = journal->j_running_transaction->t_tid;
- ret = __jbd2_log_start_commit(journal, tid);
- if (ret && ptid)
+ __jbd2_log_start_commit(journal, tid);
+ /* There's a running transaction and we've just made sure
+ * its commit has been scheduled. */
+ if (ptid)
*ptid = tid;
- } else if (journal->j_committing_transaction && ptid) {
+ ret = 1;
+ } else if (journal->j_committing_transaction) {
/*
* If ext3_write_super() recently started a commit, then we
* have to wait for completion of that transaction
*/
- *ptid = journal->j_committing_transaction->t_tid;
+ if (ptid)
+ *ptid = journal->j_committing_transaction->t_tid;
ret = 1;
}
spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 46b4e347ed7..28ce21d8598 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2129,26 +2129,46 @@ done:
}
/*
- * This function must be called when inode is journaled in ordered mode
- * before truncation happens. It starts writeout of truncated part in
- * case it is in the committing transaction so that we stand to ordered
- * mode consistency guarantees.
+ * File truncate and transaction commit interact with each other in a
+ * non-trivial way. If a transaction writing data block A is
+ * committing, we cannot discard the data by truncate until we have
+ * written them. Otherwise if we crashed after the transaction with
+ * write has committed but before the transaction with truncate has
+ * committed, we could see stale data in block A. This function is a
+ * helper to solve this problem. It starts writeout of the truncated
+ * part in case it is in the committing transaction.
+ *
+ * Filesystem code must call this function when inode is journaled in
+ * ordered mode before truncation happens and after the inode has been
+ * placed on orphan list with the new inode size. The second condition
+ * avoids the race that someone writes new data and we start
+ * committing the transaction after this function has been called but
+ * before a transaction for truncate is started (and furthermore it
+ * allows us to optimize the case where the addition to orphan list
+ * happens in the same transaction as write --- we don't have to write
+ * any data in such case).
*/
-int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+int jbd2_journal_begin_ordered_truncate(journal_t *journal,
+ struct jbd2_inode *jinode,
loff_t new_size)
{
- journal_t *journal;
- transaction_t *commit_trans;
+ transaction_t *inode_trans, *commit_trans;
int ret = 0;
- if (!inode->i_transaction && !inode->i_next_transaction)
+ /* This is a quick check to avoid locking if not necessary */
+ if (!jinode->i_transaction)
goto out;
- journal = inode->i_transaction->t_journal;
+ /* Locks are here just to force reading of recent values, it is
+ * enough that the transaction was not committing before we started
+ * a transaction adding the inode to orphan list */
spin_lock(&journal->j_state_lock);
commit_trans = journal->j_committing_transaction;
spin_unlock(&journal->j_state_lock);
- if (inode->i_transaction == commit_trans) {
- ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+ spin_lock(&journal->j_list_lock);
+ inode_trans = jinode->i_transaction;
+ spin_unlock(&journal->j_list_lock);
+ if (inode_trans == commit_trans) {
+ ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
new_size, LLONG_MAX);
if (ret)
jbd2_journal_abort(journal, ret);
diff --git a/fs/namespace.c b/fs/namespace.c
index 228d8c4bfd1..06f8e63f6cb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt)
*/
for_each_possible_cpu(cpu) {
struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
- if (cpu_writer->mnt != mnt)
- continue;
spin_lock(&cpu_writer->lock);
+ if (cpu_writer->mnt != mnt) {
+ spin_unlock(&cpu_writer->lock);
+ continue;
+ }
atomic_add(cpu_writer->count, &mnt->__mnt_writers);
cpu_writer->count = 0;
/*
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d..331f2e88e28 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih,
int ret;
do {
- if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
+ if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
return -ENOSPC;
ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
} while (ret == -EAGAIN);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3c3532e1307..172850a9a12 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
- return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
- new_size);
+ return jbd2_journal_begin_ordered_truncate(
+ OCFS2_SB(inode->i_sb)->journal->j_journal,
+ &OCFS2_I(inode)->ip_jinode,
+ new_size);
}
#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5267098532b..a1a4cfe1921 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,8 +48,16 @@ int seq_open(struct file *file, const struct seq_operations *op)
*/
file->f_version = 0;
- /* SEQ files support lseek, but not pread/pwrite */
- file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE);
+ /*
+ * seq_files support lseek() and pread(). They do not implement
+ * write() at all, but we clear FMODE_PWRITE here for historical
+ * reasons.
+ *
+ * If a client of seq_files a) implements file.write() and b) wishes to
+ * support pwrite() then that client will need to implement its own
+ * file.open() which calls seq_open() and then sets FMODE_PWRITE.
+ */
+ file->f_mode &= ~FMODE_PWRITE;
return 0;
}
EXPORT_SYMBOL(seq_open);
@@ -131,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
int err = 0;
mutex_lock(&m->lock);
+
+ /* Don't assume *ppos is where we left it */
+ if (unlikely(*ppos != m->read_pos)) {
+ m->read_pos = *ppos;
+ while ((err = traverse(m, *ppos)) == -EAGAIN)
+ ;
+ if (err) {
+ /* With prejudice... */
+ m->read_pos = 0;
+ m->version = 0;
+ m->index = 0;
+ m->count = 0;
+ goto Done;
+ }
+ }
+
/*
* seq_file->op->..m_start/m_stop/m_next may do special actions
* or optimisations based on the file->f_version, so we want to
@@ -230,8 +254,10 @@ Fill:
Done:
if (!copied)
copied = err;
- else
+ else {
*ppos += copied;
+ m->read_pos += copied;
+ }
file->f_version = m->version;
mutex_unlock(&m->lock);
return copied;
@@ -266,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
if (offset < 0)
break;
retval = offset;
- if (offset != file->f_pos) {
+ if (offset != m->read_pos) {
while ((retval=traverse(m, offset)) == -EAGAIN)
;
if (retval) {
/* with extreme prejudice... */
file->f_pos = 0;
+ m->read_pos = 0;
m->version = 0;
m->index = 0;
m->count = 0;
} else {
+ m->read_pos = offset;
retval = file->f_pos = offset;
}
}
diff --git a/fs/super.c b/fs/super.c
index 61dce001dd5..8349ed6b141 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
* lock ordering than usbfs:
*/
lockdep_set_class(&s->s_lock, &type->s_lock_key);
- down_write(&s->s_umount);
+ /*
+ * sget() can have s_umount recursion.
+ *
+ * When it cannot find a suitable sb, it allocates a new
+ * one (this one), and tries again to find a suitable old
+ * one.
+ *
+ * In case that succeeds, it will acquire the s_umount
+ * lock of the old one. Since these are clearly distinct
+ * locks, and this object isn't exposed yet, there's no
+ * risk of deadlocks.
+ *
+ * Annotate this by putting this lock in a different
+ * subclass.
+ */
+ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
s->s_count = S_BIAS;
atomic_set(&s->s_active, 1);
mutex_init(&s->s_vfs_rename_mutex);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a123b8ff3f..b042bd7034b 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
- if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK))
- return -EINVAL;
- if (clockid != CLOCK_MONOTONIC &&
- clockid != CLOCK_REALTIME)
+ if ((flags & ~TFD_CREATE_FLAGS) ||
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME))
return -EINVAL;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
- flags & (O_CLOEXEC | O_NONBLOCK));
+ flags & TFD_SHARED_FCNTL_FLAGS);
if (ufd < 0)
kfree(ctx);
@@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
return -EFAULT;
- if (!timespec_valid(&ktmr.it_value) ||
+ if ((flags & ~TFD_SETTIME_FLAGS) ||
+ !timespec_valid(&ktmr.it_value) ||
!timespec_valid(&ktmr.it_interval))
return -EINVAL;