From fb5cbe9efd741b16e72133613747f76490bbecd3 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 28 Oct 2009 22:28:24 -0700 Subject: ocfs2: Return -EINVAL when a device is not ocfs2. In case of non-modular kernels the root filesystem is mounted by trying several filesystems. If ocfs2 was tried before the actual filesystem type, the mount would fail because ocfs2_sb_probe() returns -EAGAIN instead of -EINVAL. ocfs2 will now return -EINVAL properly. Signed-off-by: Joel Becker Reported-by: Laszlo Attila Toth --- fs/ocfs2/super.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c0e48aeebb1..960673004df 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -773,18 +773,20 @@ static int ocfs2_sb_probe(struct super_block *sb, if (tmpstat < 0) { status = tmpstat; mlog_errno(status); - goto bail; + break; } di = (struct ocfs2_dinode *) (*bh)->b_data; memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); spin_lock_init(&stats->b_lock); - status = ocfs2_verify_volume(di, *bh, blksize, stats); - if (status >= 0) - goto bail; - brelse(*bh); - *bh = NULL; - if (status != -EAGAIN) + tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats); + if (tmpstat < 0) { + brelse(*bh); + *bh = NULL; + } + if (tmpstat != -EAGAIN) { + status = tmpstat; break; + } } bail: -- cgit v1.2.3 From 87f4b1bb98696e6cf84f57df7de41f28c2a7dbeb Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 15 Oct 2009 11:10:48 +0800 Subject: ocfs2: Move ocfs2_complete_reflink to the right place. As its name ocfs2_complete_reflink indicates, it should be called after all the work for reflink is done, so it really should be called after we reflink xattr successfully. Signed-off-by: Tao Ma Signed-off-by: Joel Becker Tested-by: Tristan Ye --- fs/ocfs2/refcounttree.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 60287fc56bc..9d439b27730 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4013,10 +4013,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, goto out_unlock_refcount; } - ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve); - if (ret) - mlog_errno(ret); - out_unlock_refcount: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); @@ -4068,9 +4064,17 @@ static int __ocfs2_reflink(struct dentry *old_dentry, ret = ocfs2_reflink_xattrs(inode, old_bh, new_inode, new_bh, preserve); - if (ret) + if (ret) { mlog_errno(ret); + goto inode_unlock; + } } + + ret = ocfs2_complete_reflink(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) + mlog_errno(ret); + inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); -- cgit v1.2.3 From 2f48d593b6ceb7bb63d34124ceba77d33be298cf Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 15 Oct 2009 11:10:49 +0800 Subject: ocfs2: duplicate inline data properly during reflink. The old reflink fails to handle inodes with inline data and will oops if it encounters them. This patch copies inline data to the new inode. Extended attributes may still be refcounted. 
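A minimal userspace sketch of the behaviour the first patch in this series fixes: a mount-by-probing caller walks the registered filesystem types and treats -EINVAL as "not this filesystem, try the next one", so ocfs2's probe loop must end on that verdict instead of leaking -EAGAIN. The helper names below (probe_one_blocksize, sb_probe) are invented for illustration; this is not the kernel code.

#include <errno.h>
#include <stdio.h>

/* Pretend no block size ever matches this filesystem type. */
static int probe_one_blocksize(int blksize)
{
	return (blksize < 4096) ? -EAGAIN : -EINVAL;
}

/* Mirrors the fixed loop: remember the last verdict, stop on anything
 * other than -EAGAIN, and hand that verdict back to the caller. */
static int sb_probe(void)
{
	int blksize, status = -EAGAIN;

	for (blksize = 512; blksize <= 4096; blksize <<= 1) {
		status = probe_one_blocksize(blksize);
		if (status != -EAGAIN)
			break;
	}
	return status;
}

int main(void)
{
	/* A mount-by-probing caller moves on to the next filesystem type
	 * on -EINVAL; -EAGAIN would make the whole mount fail instead. */
	printf("probe result: %d (want %d)\n", sb_probe(), -EINVAL);
	return 0;
}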
Signed-off-by: Tao Ma Signed-off-by: Joel Becker Tested-by: Tristan Ye --- fs/ocfs2/file.c | 3 ++- fs/ocfs2/refcounttree.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 89fc8ee1f5a..de059f49058 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1712,7 +1712,8 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, struct super_block *sb = inode->i_sb; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || - !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9d439b27730..3a0df7a1b81 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3743,6 +3743,9 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, goto out; } + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto attach_xattr; + ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); size = i_size_read(inode); @@ -3769,6 +3772,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, cpos += num_clusters; } +attach_xattr: if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, &ref_tree->rf_ci, @@ -3858,6 +3862,49 @@ out: return ret; } +static int ocfs2_duplicate_inline_data(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data; + + BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + t_di->id2.i_data.id_count = s_di->id2.i_data.id_count; + memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data, + le16_to_cpu(s_di->id2.i_data.id_count)); + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL; + t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + static int ocfs2_duplicate_extent_list(struct inode *s_inode, struct inode *t_inode, struct buffer_head *t_bh, @@ -3997,6 +4044,14 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, goto out; } + if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, + t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), 1, &ref_tree, &ref_root_bh); if (ret) { -- cgit v1.2.3 From 837711f862bb71ac263837a0f0714dd8cc4ef7ea Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 16 Jan 2009 16:33:05 +0800 Subject: ocfs2: return f_fsid info in ocfs2_statfs() Currently the f_fsid of struct kstatfs returned from ocfs2_statfs() is undefined (vfs layer fills in 0 as default). 
Since in some conditions, f_fsid value might be used in a (f_fsid, ino) pair to uniquely identify a file, ocfs2 should return a unique defined f_fsid value from ocfs2_statfs(). Because uuid_str is the same on big or litlle endian machine, it's endian consistent to use osb->uuid_str to generate f_fsid value. Signed-off-by: Coly Li Cc: Sunil Mushran Cc: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 960673004df..14f47d2bfe0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1647,6 +1647,10 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = numbits; buf->f_ffree = freebits; + buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN) + & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN, + OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; brelse(bh); -- cgit v1.2.3 From 5a1eb5c4453207ad9e7f6e8ca4f8db289743c993 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 30 Oct 2009 15:03:54 +1100 Subject: powerpc: Cleanup Kconfig selection of hugetlbfs support Signed-off-by: Benjamin Herrenschmidt --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 2126078a38e..64d44efad7a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -135,7 +135,7 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC_BOOK3S_64 || SPARC64 || (S390 && 64BIT) || \ + depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ SYS_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on -- cgit v1.2.3 From f91b90993f0d286be89f06c2f547ced8cfe291c6 Mon Sep 17 00:00:00 2001 From: Martin Stava Date: Mon, 2 Nov 2009 08:39:35 -0600 Subject: 9p: fix a small bug in readdir for long directories Here is a proposed patch for bug in readdir. Listing of dirs with many files fails without this patch. Signed-off-by: Martin Stava Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 873cd31baa4..cae53d405f2 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -90,6 +90,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) if (err <= 0) break; + i = 0; n = err; while (i < n) { err = p9stat_read(statbuf + i, buflen-i, &st, -- cgit v1.2.3 From 2511cd0b3b9e9b1c3e9360cc565c3745ac3f3f3f Mon Sep 17 00:00:00 2001 From: Martin Stava Date: Mon, 2 Nov 2009 08:39:34 -0600 Subject: 9p: fix readlink I do not know if you've looked on the patch, but unfortunately it is incorrect. A suggested better version is in this email (the old version didn't work in case the user provided buffer was not long enough - it incorrectly appended null byte on a position of last char, and thus broke the contract of the readlink method). However, I'm still not sure this is 100% correct thing to do, I think readlink is supposed to return buffer without last null byte in all cases, but we do return last null byte (even the old version).. 
on the other hand it is likely unspecified what is in the remaining part of the buffer, so null character may be fine there ;): Signed-off-by: Martin Stava Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 5947628aefe..18f74ec4dce 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -994,8 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); - retval = buflen; - + retval = strnlen(buffer, buflen); done: kfree(st); return retval; @@ -1062,7 +1061,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) __putname(link); link = ERR_PTR(len); } else - link[len] = 0; + link[min(len, PATH_MAX-1)] = 0; } nd_set_link(nd, link); -- cgit v1.2.3 From 3e2796a90cf349527e50b3bc4d0b2f4019b1ce7a Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Mon, 2 Nov 2009 08:39:28 -0600 Subject: 9p: fix readdir corner cases The patch below also addresses a couple of other corner cases in readdir seen with a large (e.g. 64k) msize. I'm not sure what people think of my co-opting of fid->aux here. I'd be happy to rework if there's a better way. When the size of the user supplied buffer passed to readdir is smaller than the data returned in one go by the 9P read request, v9fs_dir_readdir() currently discards extra data so that, on the next call, a 9P read request will be issued with offset < previous offset + bytes returned, which voilates the constraint described in paragraph 3 of read(5) description. This patch preseves the leftover data in fid->aux for use in the next call. Signed-off-by: Jim Garlick Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 94 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index cae53d405f2..15cce53bf61 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -39,6 +39,24 @@ #include "v9fs_vfs.h" #include "fid.h" +/** + * struct p9_rdir - readdir accounting + * @mutex: mutex protecting readdir + * @head: start offset of current dirread buffer + * @tail: end offset of current dirread buffer + * @buf: dirread buffer + * + * private structure for keeping track of readdir + * allocated on demand + */ + +struct p9_rdir { + struct mutex mutex; + int head; + int tail; + uint8_t *buf; +}; + /** * dt_type - return file type * @mistat: mistat structure @@ -70,57 +88,79 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) { int over; struct p9_wstat st; - int err; + int err = 0; struct p9_fid *fid; int buflen; - char *statbuf; - int n, i = 0; + int reclen = 0; + struct p9_rdir *rdir; P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; - statbuf = kmalloc(buflen, GFP_KERNEL); - if (!statbuf) - return -ENOMEM; - - while (1) { - err = v9fs_file_readn(filp, statbuf, NULL, buflen, - fid->rdir_fpos); - if (err <= 0) - break; - - i = 0; - n = err; - while (i < n) { - err = p9stat_read(statbuf + i, buflen-i, &st, - fid->clnt->dotu); + + /* allocate rdir on demand */ + if (!fid->rdir) { + rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); + + if (rdir == NULL) { + err = -ENOMEM; + goto exit; + } + spin_lock(&filp->f_dentry->d_lock); + if (!fid->rdir) { + rdir->buf = 
(uint8_t *)rdir + sizeof(struct p9_rdir); + mutex_init(&rdir->mutex); + rdir->head = rdir->tail = 0; + fid->rdir = (void *) rdir; + rdir = NULL; + } + spin_unlock(&filp->f_dentry->d_lock); + kfree(rdir); + } + rdir = (struct p9_rdir *) fid->rdir; + + err = mutex_lock_interruptible(&rdir->mutex); + while (err == 0) { + if (rdir->tail == rdir->head) { + err = v9fs_file_readn(filp, rdir->buf, NULL, + buflen, filp->f_pos); + if (err <= 0) + goto unlock_and_exit; + + rdir->head = 0; + rdir->tail = err; + } + + while (rdir->head < rdir->tail) { + err = p9stat_read(rdir->buf + rdir->head, + buflen - rdir->head, &st, + fid->clnt->dotu); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; p9stat_free(&st); - goto free_and_exit; + goto unlock_and_exit; } - - i += st.size+2; - fid->rdir_fpos += st.size+2; + reclen = st.size+2; over = filldir(dirent, st.name, strlen(st.name), filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); - filp->f_pos += st.size+2; - p9stat_free(&st); if (over) { err = 0; - goto free_and_exit; + goto unlock_and_exit; } + rdir->head += reclen; + filp->f_pos += reclen; } } -free_and_exit: - kfree(statbuf); +unlock_and_exit: + mutex_unlock(&rdir->mutex); +exit: return err; } -- cgit v1.2.3 From d4da6c9ccf648f3f1cb5bf9d981a62c253d30e28 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 2 Nov 2009 10:15:27 -0800 Subject: Revert "ext4: Remove journal_checksum mount option and enable it by default" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit d0646f7b636d067d715fab52a2ba9c6f0f46b0d7, as requested by Eric Sandeen. It can basically cause an ext4 filesystem to miss recovery (and thus get mounted with errors) if the journal checksum does not match. Quoth Eric: "My hand-wavy hunch about what is happening is that we're finding a bad checksum on the last partially-written transaction, which is not surprising, but if we have a wrapped log and we're doing the initial scan for head/tail, and we abort scanning on that bad checksum, then we are essentially running an unrecovered filesystem. But that's hand-wavy and I need to go look at the code. We lived without journal checksums on by default until now, and at this point they're doing more harm than good, so we should revert the default-changing commit until we can fix it and do some good power-fail testing with the fixes in place." See http://bugzilla.kernel.org/show_bug.cgi?id=14354 for all the gory details. 
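Looking back at the 9p readdir corner-case patch a little further above, here is a minimal userspace sketch of the head/tail buffering idea behind struct p9_rdir, under the simplifying assumption that a directory record is just a NUL-terminated name (the real code parses p9_wstat records and sizes the buffer from msize minus P9_IOHDRSZ). All names below are invented.

#include <stdio.h>
#include <string.h>

struct rdir {
	char buf[64];
	int head, tail;		/* consumed / filled offsets into buf */
};

/* Stand-in for v9fs_file_readn(): delivers one batch of records. */
static int fill(struct rdir *r)
{
	static const char batch[] = "alpha\0beta\0gamma\0";

	memcpy(r->buf, batch, sizeof(batch) - 1);
	r->head = 0;
	r->tail = (int)(sizeof(batch) - 1);
	return r->tail;
}

int main(void)
{
	struct rdir r = { .head = 0, .tail = 0 };

	if (r.tail == r.head)		/* refill only when drained */
		fill(&r);

	while (r.head < r.tail) {	/* hand out one record at a time */
		const char *name = r.buf + r.head;
		int reclen = (int)strlen(name) + 1;

		printf("entry: %s\n", name);
		r.head += reclen;	/* advance only after emitting it */
	}
	return 0;
}

Keeping the unconsumed records around between calls is what lets the next readdir continue at head instead of re-reading the directory at an offset the server never handed out.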
Requested-by: Eric Sandeen Cc: Theodore Tso Cc: Alexey Fisher Cc: Maxim Levitsky Cc: Aneesh Kumar K.V Cc: Mathias Burén Signed-off-by: Linus Torvalds --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 984ca0cb38c..00d153f2f26 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -743,6 +743,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 312211ee05a..d4ca92aab51 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1300,9 +1300,11 @@ static int parse_options(char *options, struct super_block *sb, *journal_devnum = option; break; case Opt_journal_checksum: - break; /* Kept for backwards compatibility */ + set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + break; case Opt_journal_async_commit: set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); + set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); break; case Opt_noload: set_opt(sbi->s_mount_opt, NOLOAD); @@ -2759,14 +2761,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) - jbd2_journal_set_features(sbi->s_journal, 0, 0, + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - else + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ -- cgit v1.2.3 From fa5d11133b07053270e18fa9c18560e66e79217e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 2 Nov 2009 18:50:49 -0500 Subject: ext4: discard preallocation when restarting a transaction during truncate When restart a transaction during a truncate operation, we drop and reacquire i_data_sem. After reacquiring i_data_sem, we need to discard any inode-based preallocation that might have been grabbed while we released i_data_sem (for example, if pdflush is allocating blocks and racing against the truncate). Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5c5bc5dafff..d1ec698a91d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -193,7 +193,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. 
*/ - int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, int nblocks) { int ret; @@ -209,6 +209,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) up_write(&EXT4_I(inode)->i_data_sem); ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); return ret; } -- cgit v1.2.3 From b1e19e5601277845b4f17ecd7c9ba04f73ee11aa Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 3 Nov 2009 00:25:53 +0900 Subject: nilfs2: fix dirty page accounting leak causing hang at write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bruno Prémont and Dunphy, Bill noticed me that NILFS will certainly hang on ARM-based targets. I found this was caused by an underflow of dirty pages counter. A b-tree cache routine was marking page dirty without adjusting page account information. This fixes the dirty page accounting leak and resolves the hang on arm-based targets. Reported-by: Bruno Prémont Reported-by: Dunphy, Bill Signed-off-by: Ryusuke Konishi Tested-by: Bruno Prémont Cc: stable --- fs/nilfs2/btnode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 5941958f1e4..435864ce06b 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -276,8 +276,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); - if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage)) - BUG(); + nilfs_btnode_mark_dirty(obh); spin_lock_irq(&btnc->tree_lock); radix_tree_delete(&btnc->page_tree, oldkey); -- cgit v1.2.3 From aeda7f6343e6375a832e52ff5ed389c115023ca5 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 2 Nov 2009 15:08:13 +0900 Subject: nilfs2: fix irregular checkpoint creation due to data flush When nilfs flushes out dirty data to reduce memory pressure, creation of checkpoints is wrongly postponed. This bug causes irregular checkpoint creation especially in small footprint systems. To correct this issue, a timer for the checkpoint creation has to be continued if a log writer does not create a checkpoint. This will do the correction. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/segment.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 683df89dbae..6eff66a070d 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2468,17 +2468,22 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, /* Clear requests (even when the construction failed) */ spin_lock(&sci->sc_state_lock); - sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; - if (req->mode == SC_LSEG_SR) { + sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; sci->sc_seq_done = req->seq_accepted; nilfs_segctor_wakeup(sci, req->sc_err ? 
: req->sb_err); sci->sc_flush_request = 0; - } else if (req->mode == SC_FLUSH_FILE) - sci->sc_flush_request &= ~FLUSH_FILE_BIT; - else if (req->mode == SC_FLUSH_DAT) - sci->sc_flush_request &= ~FLUSH_DAT_BIT; + } else { + if (req->mode == SC_FLUSH_FILE) + sci->sc_flush_request &= ~FLUSH_FILE_BIT; + else if (req->mode == SC_FLUSH_DAT) + sci->sc_flush_request &= ~FLUSH_DAT_BIT; + /* re-enable timer if checkpoint creation was not done */ + if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_before(jiffies, sci->sc_timer->expires)) + add_timer(sci->sc_timer); + } spin_unlock(&sci->sc_state_lock); } -- cgit v1.2.3 From 05b4358ad564d7a6a51b3717afe771d36711e9c4 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 14 Sep 2009 01:20:35 +0900 Subject: nilfs2: add zero-fill for new btree node buffers Adds missing initialization of newly allocated b-tree node buffers. This avoids garbage data to be mixed in b-tree node blocks. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/btnode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 435864ce06b..84c25382f8e 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -87,6 +87,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, brelse(bh); BUG(); } + memset(bh->b_data, 0, 1 << inode->i_blkbits); bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); -- cgit v1.2.3 From 109f55651954def97fa41ee71c464d268c512ab0 Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 10 Nov 2009 10:48:08 -0500 Subject: ext4: fix ext4_ext_direct_IO()'s return value after converting uninit extents After a direct I/O request covering an uninitalized extent (i.e., created using the fallocate system call) or a hole in a file, ext4 will convert the uninitialized extent so it is marked as initialized by calling ext4_convert_unwritten_extents(). This function returns zero on success. This return value was getting returned by ext4_direct_IO(); however the file system's direct_IO function is supposed to return the number of bytes read or written on a success. By returning zero, it confused the direct I/O code into falling back to buffered I/O unnecessarily. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 1 + fs/ext4/inode.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 10539e36428..441716f33b4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3519,6 +3519,7 @@ retry: * * This function is called from the direct IO end io call back * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. 
*/ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, loff_t len) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d1ec698a91d..12d727f8fed 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3772,13 +3772,17 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0) + } else if (ret > 0) { + int err; /* * for non AIO case, since the IO is already * completed, we could do the convertion right here */ - ret = ext4_convert_unwritten_extents(inode, - offset, ret); + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + } return ret; } -- cgit v1.2.3 From 5f5249507e4b5c4fc0f9c93f33d133d8c95f47e1 Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 10 Nov 2009 10:48:04 -0500 Subject: ext4: skip conversion of uninit extents after direct IO if there isn't any At the end of direct I/O operation, ext4_ext_direct_IO() always called ext4_convert_unwritten_extents(), regardless of whether there were any unwritten extents involved in the I/O or not. This commit adds a state flag so that ext4_ext_direct_IO() only calls ext4_convert_unwritten_extents() when necessary. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 22 +++++++++++++++++----- fs/ext4/inode.c | 4 +++- 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 00d153f2f26..8825515eedd 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -322,6 +322,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ #define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ +#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 441716f33b4..e991ae2b461 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3048,12 +3048,18 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_split_unwritten_extents(handle, inode, path, iblock, max_blocks, flags); - /* flag the io_end struct that we need convert when IO done */ + /* + * Flag the inode(non aio case) or end_io struct (aio case) + * that this IO needs to convertion to written when IO is + * completed + */ if (io) io->flag = DIO_AIO_UNWRITTEN; + else + EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; goto out; } - /* DIO end_io complete, convert the filled extent to written */ + /* async DIO end_io complete, convert the filled extent to written */ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { ret = ext4_convert_unwritten_extents_dio(handle, inode, path); @@ -3295,10 +3301,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * To avoid unecessary convertion for every aio dio rewrite * to the mid of file, here we flag the IO that is really * need the convertion. - * + * For non asycn direct IO case, flag the inode state + * that we need to perform convertion when IO is done. 
*/ - if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) - io->flag = DIO_AIO_UNWRITTEN; + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + if (io) + io->flag = DIO_AIO_UNWRITTEN; + else + EXT4_I(inode)->i_state |= + EXT4_STATE_DIO_UNWRITTEN;; + } } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 12d727f8fed..a9ed2bce74d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3772,7 +3772,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0) { + } else if (ret > 0 && (EXT4_I(inode)->i_state & + EXT4_STATE_DIO_UNWRITTEN)) { int err; /* * for non AIO case, since the IO is already @@ -3782,6 +3783,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, offset, ret); if (err < 0) ret = err; + EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; } return ret; } -- cgit v1.2.3 From 4b70df181611012a3556f017b57dfcef7e1d279f Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 3 Nov 2009 14:44:54 -0500 Subject: ext4: code clean up for dio fallocate handling The ext4_debug() call in ext4_end_io_dio() should be moved after the check to make sure that io_end is non-NULL. The comment above ext4_get_block_dio_write() ("Maximum number of blocks...") is a duplicate; the original and correct comment is above the #define DIO_MAX_BLOCKS up above. Based on review comments from Curt Wohlgemuth. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a9ed2bce74d..2c8caa51add 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3446,8 +3446,6 @@ out: return ret; } -/* Maximum number of blocks we map for direct IO at once. */ - static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -3655,13 +3653,14 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) + return; + ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - return; /* if not aio dio with unwritten extents, just free io and return */ if (io_end->flag != DIO_AIO_UNWRITTEN){ -- cgit v1.2.3 From f60311d5f7670d9539b424e4ed8b5c0872fc9e83 Mon Sep 17 00:00:00 2001 From: "Anand V. Avati" Date: Thu, 22 Oct 2009 06:24:52 -0700 Subject: fuse: prevent fuse_put_request on invalid pointer fuse_direct_io() has a loop where requests are allocated in each iteration. if allocation fails, the loop is broken out and follows into an unconditional fuse_put_request() on that invalid pointer. Signed-off-by: Anand V. 
Avati Signed-off-by: Miklos Szeredi Cc: stable@kernel.org --- fs/fuse/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a3492f7d207..5887a6395ad 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1063,7 +1063,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, break; } } - fuse_put_request(fc, req); + if (!IS_ERR(req)) + fuse_put_request(fc, req); if (res > 0) *ppos = pos; -- cgit v1.2.3 From 0bd87182d3ab18a32a8e9175d3f68754c58e3432 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Nov 2009 11:40:44 +0100 Subject: fuse: fix kunmap in fuse_ioctl_copy_user Looks like another victim of the confusing kmap() vs kmap_atomic() API differences. Reported-by: Todor Gyumyushev Signed-off-by: Jens Axboe Signed-off-by: Miklos Szeredi Cc: Tejun Heo Cc: stable@kernel.org --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5887a6395ad..c18913a777a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1600,7 +1600,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, kaddr += copy; } - kunmap(map); + kunmap(page); } return 0; -- cgit v1.2.3 From 5219f346b0ea2a2a8821f1e966b190788c285b0b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 4 Nov 2009 10:24:52 +0100 Subject: fuse: invalidate target of rename Invalidate the target's attributes, which may have changed (such as nlink, change time) so that they are refreshed on the next getattr(). Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 992f6c9410b..8ada78aade5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -712,8 +712,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) + if (newent->d_inode) { + fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation -- cgit v1.2.3 From 89240ba059ca468ae7a8346edf7f95082458c2fc Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Tue, 3 Nov 2009 10:22:40 +0100 Subject: x86, fs: Fix x86 procfs stack information for threads on 64-bit This patch fixes two issues in the procfs stack information on x86-64 linux. The 32 bit loader compat_do_execve did not store stack start. (this was figured out by Alexey Dobriyan). The stack information on a x64_64 kernel always shows 0 kbyte stack usage, because of a missing implementation of the KSTK_ESP macro which always returned -1. The new implementation now returns the right value. Signed-off-by: Stefani Seibold Cc: Americo Wang Cc: Alexey Dobriyan Cc: Al Viro Cc: Andrew Morton LKML-Reference: <1257240160.4889.24.camel@wall-e> Signed-off-by: Ingo Molnar --- fs/compat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index d576b552e8e..6c19040ffee 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1532,6 +1532,8 @@ int compat_do_execve(char * filename, if (retval < 0) goto out; + current->stack_start = current->mm->start_stack; + /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; -- cgit v1.2.3 From 4c3da2209b1261af9a948b7509a38904c8eee554 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Wed, 4 Nov 2009 02:50:06 -0800 Subject: sysfs: Don't leak secdata when a sysfs_dirent is freed. While refreshing my sysfs patches I noticed a leak in the secdata implementation. We don't free the secdata when we free the sysfs dirent. This is a bug in 2.6.32-rc5 that we really should close. Signed-off-by: Eric W. Biederman Acked-by: Serge Hallyn Signed-off-by: James Morris --- fs/sysfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 5fad489ce5b..e0201837d24 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "sysfs.h" DEFINE_MUTEX(sysfs_mutex); @@ -285,6 +286,9 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) sysfs_put(sd->s_symlink.target_sd); if (sysfs_type(sd) & SYSFS_COPY_NAME) kfree(sd->s_name); + if (sd->s_iattr && sd->s_iattr->ia_secdata) + security_release_secctx(sd->s_iattr->ia_secdata, + sd->s_iattr->ia_secdata_len); kfree(sd->s_iattr); sysfs_free_ino(sd->s_ino); kmem_cache_free(sysfs_dir_cachep, sd); -- cgit v1.2.3 From ba230c3f6dc88ec008806adb27b12088486d508e Mon Sep 17 00:00:00 2001 From: Mingming Date: Fri, 6 Nov 2009 04:01:23 -0500 Subject: ext4: Fix return value of ext4_split_unwritten_extents() to fix direct I/O To prepare for a direct I/O write, we need to split the unwritten extents before submitting the I/O. When no extents needed to be split, ext4_split_unwritten_extents() was incorrectly returning 0 instead of the size of uninitialized extents. This bug caused the wrong return value sent back to VFS code when it gets called from async IO path, leading to an unnecessary fall back to buffered IO. This bug also hid the fact that the check to see whether or not a split would be necessary was incorrect; we can only skip splitting the extent if the write completely covers the uninitialized extent. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e991ae2b461..715264b4bae 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2807,6 +2807,8 @@ fix_extent_len: * into three uninitialized extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). + * + * Returns the size of uninitialized extent to be written on success. */ static int ext4_split_unwritten_extents(handle_t *handle, struct inode *inode, @@ -2824,7 +2826,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; int err = 0; - int ret = 0; ext_debug("ext4_split_unwritten_extents: inode %lu," "iblock %llu, max_blocks %u\n", inode->i_ino, @@ -2842,12 +2843,12 @@ static int ext4_split_unwritten_extents(handle_t *handle, ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); /* - * if the entire unintialized extent length less than - * the size of extent to write, there is no need to split - * uninitialized extent + * If the uninitialized extent begins at the same logical + * block where the write begins, and the write completely + * covers the extent, then we don't need to split it. 
*/ - if (allocated <= max_blocks) - return ret; + if ((iblock == ee_block) && (allocated <= max_blocks)) + return allocated; err = ext4_ext_get_access(handle, inode, path + depth); if (err) -- cgit v1.2.3 From ec06aedd44541129840ed52e6165afa3796a27bf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 6 Nov 2009 14:18:29 -0500 Subject: cifs: clean up handling when server doesn't consistently support inode numbers It's possible that a server will return a valid FileID when we query the FILE_INTERNAL_INFO for the root inode, but then zeroed out inode numbers when we do a FindFile with an infolevel of SMB_FIND_FILE_ID_FULL_DIR_INFO. In this situation turn off querying for server inode numbers, generate a warning for the user and just generate an inode number using iunique. Once we generate any inode number with iunique we can no longer use any server inode numbers or we risk collisions, so ensure that we don't do that in cifs_get_inode_info either. Cc: Stable Reported-by: Timothy Normand Miller Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 1 + fs/cifs/inode.c | 7 ++----- fs/cifs/misc.c | 14 ++++++++++++++ fs/cifs/readdir.c | 7 ++++--- 4 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 6928c24d1d4..5646727e33f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -388,4 +388,5 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon, const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); +extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 5e2492535da..cababd8a52d 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -512,13 +512,10 @@ int cifs_get_inode_info(struct inode **pinode, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc1) { + if (rc1 || !fattr.cf_uniqueid) { cFYI(1, ("GetSrvInodeNum rc %d", rc1)); fattr.cf_uniqueid = iunique(sb, ROOT_I); - /* disable serverino if call not supported */ - if (rc1 == -EINVAL) - cifs_sb->mnt_cifs_flags &= - ~CIFS_MOUNT_SERVER_INUM; + cifs_autodisable_serverino(cifs_sb); } } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 0241b25ac33..1e25efcb55c 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -715,3 +715,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen, ctoUCS_out: return i; } + +void +cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cifs_sb->mnt_cifs_flags &= CIFS_MOUNT_SERVER_INUM; + cERROR(1, ("Autodisabling the use of server inode numbers on " + "%s. This server doesn't seem to support them " + "properly. Hardlinks will not be recognized on this " + "mount. 
Consider mounting with the \"noserverino\" " + "option to silence this message.", + cifs_sb->tcon->treeName)); + } +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 1f098ca7163..f84062f9a98 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -727,11 +727,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) pfindEntry, cifs_sb); - /* FIXME: make _to_fattr functions fill this out */ - if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO) + if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { fattr.cf_uniqueid = inum; - else + } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); -- cgit v1.2.3 From f475f6775465283494346663f201ad04810d2e8a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 6 Nov 2009 14:18:49 -0500 Subject: cifs: don't use CIFSGetSrvInodeNumber in is_path_accessible Because it's lighter weight, CIFS tries to use CIFSGetSrvInodeNumber to verify the accessibility of the root inode and then falls back to doing a full QPathInfo if that fails with -EOPNOTSUPP. I have at least a report of a server that returns NT_STATUS_INTERNAL_ERROR rather than something that translates to EOPNOTSUPP. Rather than trying to be clever with that call, just have is_path_accessible do a normal QPathInfo. That call is widely supported and it shouldn't increase the overhead significantly. Cc: Stable Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b0909807991..63ea83ff687 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2220,16 +2220,8 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) { int rc; - __u64 inode_num; FILE_ALL_INFO *pfile_info; - rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != -EOPNOTSUPP) - return rc; - pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (pfile_info == NULL) return -ENOMEM; -- cgit v1.2.3 From 5399dd1fc8f5e812db931225ef5f67d89f3b1a56 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 7 Nov 2009 18:45:16 +0900 Subject: nilfs2: fix kernel oops in error case of nilfs_ioctl_move_blocks This fixes a kernel oops reported by Markus Trippelsdorf in the email titled "[NILFS users] kernel Oops while running nilfs_cleanerd". The oops was caused by a bug of error path in nilfs_ioctl_move_blocks() function, which was inlined in nilfs_ioctl_clean_segments(). nilfs_ioctl_move_blocks checks duplication of blocks which will be moved in garbage collection. But, the check should have be done within nilfs_ioctl_move_inode_block() to prevent list corruption among buffers storing the target blocks. To fix the kernel oops, this moves forward the duplication check before the list insertion. I also tested this for stable trees [2.6.30, 2.6.31]. 
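A simplified userspace analogue of the nilfs2 fix just described: detect that a buffer is already queued — its list head is non-empty — before inserting it, so the list never gets corrupted and the error path has nothing special to undo. The list helpers and fake_bh type below are cut-down stand-ins, not the kernel's.

#include <errno.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }
static int  list_empty(const struct list_head *h) { return h->next == h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

struct fake_bh {
	unsigned long blocknr;
	struct list_head b_assoc_buffers;
};

/* Mirrors the shape of nilfs_ioctl_move_inode_block() after the fix:
 * the duplication check happens here, before the insertion. */
static int queue_block(struct fake_bh *bh, struct list_head *buffers)
{
	if (!list_empty(&bh->b_assoc_buffers))
		return -EEXIST;		/* conflicting buffer, don't relink */
	list_add_tail(&bh->b_assoc_buffers, buffers);
	return 0;
}

int main(void)
{
	struct list_head buffers;
	struct fake_bh bh = { .blocknr = 42 };

	list_init(&buffers);
	list_init(&bh.b_assoc_buffers);
	printf("block %lu, first  queue: %d\n", bh.blocknr, queue_block(&bh, &buffers));
	printf("block %lu, second queue: %d\n", bh.blocknr, queue_block(&bh, &buffers));
	return 0;
}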
Reported-by: Markus Trippelsdorf Signed-off-by: Ryusuke Konishi Cc: stable --- fs/nilfs2/ioctl.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 6572ea4bc4d..89dd73ead9a 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -297,7 +297,18 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, (unsigned long long)vdesc->vd_vblocknr); return ret; } - bh->b_private = vdesc; + if (unlikely(!list_empty(&bh->b_assoc_buffers))) { + printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, " + "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + brelse(bh); + return -EEXIST; + } list_add_tail(&bh->b_assoc_buffers, buffers); return 0; } @@ -335,24 +346,10 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { ret = nilfs_gccache_wait_and_mark_dirty(bh); if (unlikely(ret < 0)) { - if (ret == -EEXIST) { - vdesc = bh->b_private; - printk(KERN_CRIT - "%s: conflicting %s buffer: " - "ino=%llu, cno=%llu, offset=%llu, " - "blocknr=%llu, vblocknr=%llu\n", - __func__, - vdesc->vd_flags ? "node" : "data", - (unsigned long long)vdesc->vd_ino, - (unsigned long long)vdesc->vd_cno, - (unsigned long long)vdesc->vd_offset, - (unsigned long long)vdesc->vd_blocknr, - (unsigned long long)vdesc->vd_vblocknr); - } + WARN_ON(ret == -EEXIST); goto failed; } list_del_init(&bh->b_assoc_buffers); - bh->b_private = NULL; brelse(bh); } return nmembs; @@ -360,7 +357,6 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, failed: list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { list_del_init(&bh->b_assoc_buffers); - bh->b_private = NULL; brelse(bh); } return ret; -- cgit v1.2.3 From c083234f1592ef3fad3d8083663c5e4a357ec77c Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 8 Nov 2009 12:09:24 +0900 Subject: nilfs2: fix missing cleanup of gc cache on error cases This fixes an -rc1 regression brought by the commit: 1cf58fa840472ec7df6bf2312885949ebb308853 ("nilfs2: shorten freeze period due to GC in write operation v3"). Although the patch moved out a function call of nilfs_ioctl_move_blocks() to nilfs_ioctl_clean_segments() from nilfs_ioctl_prepare_clean_segments(), it didn't move corresponding cleanup job needed for the error case. This will move the missing cleanup job to the destination function. 
Signed-off-by: Ryusuke Konishi Acked-by: Jiro SEKIBA --- fs/nilfs2/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 89dd73ead9a..d24057d58f1 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -467,7 +467,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, return 0; failed: - nilfs_remove_all_gcinode(nilfs); printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", msg, ret); return ret; @@ -556,6 +555,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, else ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + if (ret < 0) + nilfs_remove_all_gcinode(nilfs); clear_nilfs_gc_running(nilfs); out_free: -- cgit v1.2.3 From 1e424a348303694fabdf8b1efbfcb1a892dfa63a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 8 Nov 2009 15:45:44 -0500 Subject: ext4: partial revert to fix double brelse WARNING() This is a partial revert of commit 6487a9d (only the changes made to fs/ext4/namei.c), since it is causing the following brelse() double-free warning when running fsstress on a file system with 1k blocksize and we run into a block allocation failure while converting a single-block directory to a multi-block hash-tree indexed directory. WARNING: at fs/buffer.c:1197 __brelse+0x2e/0x33() Hardware name: VFS: brelse: Trying to free free buffer Modules linked in: Pid: 2226, comm: jbd2/sdd-8 Not tainted 2.6.32-rc6-00577-g0003f55 #101 Call Trace: [] warn_slowpath_common+0x65/0x95 [] warn_slowpath_fmt+0x29/0x2c [] __brelse+0x2e/0x33 [] jbd2_journal_refile_buffer+0x67/0x6c [] jbd2_journal_commit_transaction+0x319/0x14d8 [] ? try_to_del_timer_sync+0x58/0x60 [] ? sched_clock_cpu+0x12a/0x13e [] ? trace_hardirqs_off+0xb/0xd [] ? cpu_clock+0x3f/0x5b [] ? lock_release_holdtime+0x36/0x137 [] ? _spin_unlock_irqrestore+0x44/0x51 [] ? trace_hardirqs_on_caller+0x103/0x124 [] ? trace_hardirqs_on+0xb/0xd [] ? try_to_del_timer_sync+0x58/0x60 [] kjournald2+0x11a/0x310 [] ? autoremove_wake_function+0x0/0x38 [] ? kjournald2+0x0/0x310 [] kthread+0x66/0x6b [] ? kthread+0x0/0x6b [] kernel_thread_helper+0x7/0x10 ---[ end trace 5579351b86af61e3 ]--- Commit 6487a9d was an attempt some buffer head leaks in an ENOSPC error path, but in some cases it actually results in an excess ENOSPC, as shown above. Fixing this means cleaning up who is responsible for releasing the buffer heads from the callee to the caller of add_dirent_to_buf(). Since that's a relatively complex change, and we're late in the rcX development cycle, I'm reverting this now, and holding back a more complete fix until after 2.6.32 ships. We've lived with this buffer_head leak on ENOSPC in ext3 and ext4 for a very long time; a few more months won't kill us. 
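A userspace sketch of the ownership convention the message above points toward as the eventual fix: the caller that obtained the buffer_head reference releases it exactly once on every path, and the helper (add_dirent_to_buf() in the real code) never drops the caller's reference even when it fails with -ENOSPC. The get_bh()/brelse() stand-ins below are simplified refcount helpers, not the kernel's.

#include <assert.h>
#include <errno.h>
#include <stdio.h>

struct fake_bh { int refcount; };

static void get_bh(struct fake_bh *bh) { bh->refcount++; }

static void brelse(struct fake_bh *bh)
{
	assert(bh->refcount > 0 && "double brelse");
	bh->refcount--;
}

/* The helper may fail, but it never releases the reference it was handed;
 * that keeps exactly one brelse() per get_bh(), no matter what happens. */
static int add_dirent(struct fake_bh *bh, int no_space)
{
	(void)bh;
	return no_space ? -ENOSPC : 0;
}

int main(void)
{
	struct fake_bh bh = { 0 };
	int ret;

	get_bh(&bh);			/* caller takes the reference ...   */
	ret = add_dirent(&bh, 1);	/* ... helper may fail with ENOSPC  */
	brelse(&bh);			/* ... caller releases exactly once */
	printf("ret=%d refcount=%d\n", ret, bh.refcount);
	return 0;
}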
Signed-off-by: "Theodore Ts'o" Cc: Curt Wohlgemuth --- fs/ext4/namei.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7c8fe80bacd..6d2c1b897fc 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1518,12 +1518,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { - retval = make_indexed_dir(handle, dentry, inode, bh); - if (retval == -ENOSPC) - brelse(bh); - return retval; - } + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); brelse(bh); } bh = ext4_append(handle, dir, &block, &retval); @@ -1532,10 +1528,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); - if (retval == -ENOSPC) - brelse(bh); - return retval; + return add_dirent_to_buf(handle, dentry, inode, de, bh); } /* @@ -1664,8 +1657,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - if (err != -ENOSPC) - bh = NULL; + bh = NULL; goto cleanup; journal_error: -- cgit v1.2.3 From 96d25e532234bec1a1989e6e1baf702d43a78b0d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Nov 2009 16:15:42 +0900 Subject: NFSv4: Fix a cache validation bug which causes getcwd() to return ENOENT Changeset a65318bf3afc93ce49227e849d213799b072c5fd (NFSv4: Simplify some cache consistency post-op GETATTRs) incorrectly changed the getattr bitmap for readdir(). This causes the readdir() function to fail to return a fileid/inode number, which again exposed a bug in the NFS readdir code that causes spurious ENOENT errors to appear in applications (see http://bugzilla.kernel.org/show_bug.cgi?id=14541). The immediate band aid is to revert the incorrect bitmap change, but more long term, we should change the NFS readdir code to cope with the fact that NFSv4 servers are not required to support fileids/inode numbers. Reported-by: Daniel J Blueman Cc: stable@kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff37454fa78..741a562177f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2767,7 +2767,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .pages = &page, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask, + .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, }; struct nfs4_readdir_res res; struct rpc_message msg = { -- cgit v1.2.3 From ea0174a7137c8ca9f130ca681f3a99c872da6778 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 12 Oct 2009 21:34:27 -0500 Subject: ext3: retry failed direct IO allocations On a 256M 4k block filesystem, doing this in a loop: dd if=/dev/zero of=test oflag=direct bs=1M count=64 rm -f test eventually leads to spurious ENOSPC: dd: writing `test': No space left on device As with other block allocation callers, it looks like we need to potentially retry the allocations on the initial ENOSPC. 
A similar patch went into ext4 (commit fbbf69456619de5d251cb9f1df609069178c62d5) Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index acf1b142332..069a163393b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1735,6 +1735,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, ssize_t ret; int orphan = 0; size_t count = iov_length(iov, nr_segs); + int retries = 0; if (rw == WRITE) { loff_t final_size = offset + count; @@ -1757,9 +1758,12 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } } +retry: ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext3_get_block, NULL); + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (orphan) { int err; -- cgit v1.2.3 From fe8bc91c4c30122b357d197117705cfd4fabaf28 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Oct 2009 19:26:15 +0200 Subject: ext3: Wait for proper transaction commit on fsync We cannot rely on buffer dirty bits during fsync because pdflush can come before fsync is called and clear dirty bits without forcing a transaction commit. What we do is that we track which transaction has last changed the inode and which transaction last changed allocation and force it to disk on fsync. Signed-off-by: Jan Kara Reviewed-by: Aneesh Kumar K.V --- fs/ext3/fsync.c | 36 ++++++++++++++++-------------------- fs/ext3/inode.c | 32 +++++++++++++++++++++++++++++++- fs/ext3/super.c | 2 ++ 3 files changed, 49 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 451d166bbe9..8209f266e9a 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -46,19 +46,21 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + struct ext3_inode_info *ei = EXT3_I(inode); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; int ret = 0; + tid_t commit_tid; + + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; J_ASSERT(ext3_journal_current_handle() == NULL); /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for a proper transaction + * to commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -73,22 +75,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) goto out; } - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto flush; + if (datasync) + commit_tid = atomic_read(&ei->i_datasync_tid); + else + commit_tid = atomic_read(&ei->i_sync_tid); - /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. 
- */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - ret = sync_inode(inode, &wbc); + if (log_start_commit(journal, commit_tid)) { + log_wait_commit(journal, commit_tid); goto out; } -flush: + /* * In case we didn't commit a transaction, we have to flush * disk caches manually so that data really is on persistent diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 069a163393b..354ed3b47b3 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -699,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, int err = 0; struct ext3_block_alloc_info *block_i; ext3_fsblk_t current_block; + struct ext3_inode_info *ei = EXT3_I(inode); - block_i = EXT3_I(inode)->i_block_alloc_info; + block_i = ei->i_block_alloc_info; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block @@ -741,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); + /* ext3_mark_inode_dirty already updated i_sync_tid */ + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); /* had we spliced it onto indirect block? */ if (where->bh) { @@ -2754,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) struct ext3_inode_info *ei; struct buffer_head *bh; struct inode *inode; + journal_t *journal = EXT3_SB(sb)->s_journal; + transaction_t *transaction; long ret; int block; @@ -2831,6 +2836,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + atomic_set(&ei->i_sync_tid, tid); + atomic_set(&ei->i_datasync_tid, tid); + } + if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { /* @@ -3015,6 +3044,7 @@ again: err = rc; ei->i_state &= ~EXT3_STATE_NEW; + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); out_brelse: brelse (bh); ext3_std_error(inode->i_sb, err); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7a520a862f4..427496c4767 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -466,6 +466,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return NULL; ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + atomic_set(&ei->i_datasync_tid, 0); + atomic_set(&ei->i_sync_tid, 0); return &ei->vfs_inode; } -- cgit v1.2.3 From 7b02bec07efe1d6c7d48c786e0c1a38d28fe7245 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 10 Nov 2009 17:13:22 +0800 Subject: JBD/JBD2: free j_wbuf if journal init fails. If journal init fails, we need to free j_wbuf. 
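A small userspace sketch of the leak class being fixed here, with invented names: on a journal-init failure path, the unwind label has to free everything allocated so far, and the j_wbuf array was the allocation being missed.

#include <stdlib.h>

struct fake_journal {
	void **j_wbuf;			/* write buffer array */
};

static struct fake_journal *journal_init(size_t nbufs, int fail_later)
{
	struct fake_journal *journal = calloc(1, sizeof(*journal));

	if (!journal)
		return NULL;

	journal->j_wbuf = calloc(nbufs, sizeof(void *));
	if (!journal->j_wbuf)
		goto out_err;

	if (fail_later)			/* e.g. superblock read failed */
		goto out_err;
	return journal;

out_err:
	free(journal->j_wbuf);		/* the counterpart of the added kfree() */
	free(journal);
	return NULL;
}

int main(void)
{
	struct fake_journal *j = journal_init(64, 1);	/* exercises out_err */

	if (j) {
		free(j->j_wbuf);
		free(j);
	}
	return 0;
}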
Cc: Andrew Morton Cc: Jan Kara Signed-off-by: Tao Ma Signed-off-by: Jan Kara --- fs/jbd/journal.c | 2 ++ fs/jbd2/journal.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index bd3c073b485..49d5cd6053c 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -756,6 +756,7 @@ journal_t * journal_init_dev(struct block_device *bdev, return journal; out_err: + kfree(journal->j_wbuf); kfree(journal); return NULL; } @@ -820,6 +821,7 @@ journal_t * journal_init_inode (struct inode *inode) return journal; out_err: + kfree(journal->j_wbuf); kfree(journal); return NULL; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b0ab5219bec..fed85388ee8 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -913,6 +913,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, return journal; out_err: + kfree(journal->j_wbuf); jbd2_stats_proc_exit(journal); kfree(journal); return NULL; @@ -986,6 +987,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) return journal; out_err: + kfree(journal->j_wbuf); jbd2_stats_proc_exit(journal); kfree(journal); return NULL; -- cgit v1.2.3 From 6346c93988caa3048bf4d81f9ba3608a7a195aa2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:47 -0500 Subject: Btrfs: fix data allocation hint start Sometimes our start allocation hint when we cow a file can be either EXTENT_HOLE or some other such place holder, which is not optimal. So if we find that our em->block_start is one of these special values, check to see where the first block of the inode is stored, and use that as a hint. If that block is also a special value, just fallback on a hint of 0 and let the allocator figure out a good place to put the data. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78139efe41f..d8393ddc72a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -743,8 +743,22 @@ static noinline int cow_file_range(struct inode *inode, em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, start, num_bytes); if (em) { - alloc_hint = em->block_start; - free_extent_map(em); + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } } read_unlock(&BTRFS_I(inode)->extent_tree.lock); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); -- cgit v1.2.3 From 249ac1e55c642c670f47aacdc57629bbbf10a8db Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: cleanup transaction starting and fix journal_info usage We use journal_info to tell if we're in a nested transaction to make sure we don't commit the transaction within a nested transaction. We use another method to see if there are any outstanding ioctl trans handles, so if we're starting one do not set current->journal_info, since it will screw with other filesystems. This patch also cleans up the starting stuff so there aren't any magic numbers. 
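The convention this relies on can be modelled outside the kernel. The sketch below is illustrative only: journal_info here is a plain thread-local pointer standing in for current->journal_info, and demo_start() is an invented stand-in for start_transaction(), not btrfs code. It shows the two rules the patch enforces: a task already carrying a handle is treated as nested, and a userspace-held (ioctl) transaction never claims journal_info, because the task returns to userspace still holding it and may later enter another filesystem that also interprets that field.

#include <stdio.h>

static __thread void *journal_info;	/* stands in for current->journal_info */

struct demo_handle { int blocks; };

static struct demo_handle *demo_start(struct demo_handle *h, int userspace_held)
{
	if (journal_info)
		printf("nested start: must not commit the outer transaction\n");

	if (!journal_info && !userspace_held)
		journal_info = h;	/* only a normal, kernel-internal start claims it */
	return h;
}

int main(void)
{
	struct demo_handle a = { 1 }, b = { 1 };

	demo_start(&a, 1);	/* ioctl-style start: journal_info stays NULL */
	demo_start(&b, 0);	/* normal start: claims journal_info */
	printf("journal_info is %s\n", journal_info ? "set" : "unset");
	return 0;
}
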
Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bca82a4ca8e..c207e8c32c9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -163,8 +163,14 @@ static void wait_current_trans(struct btrfs_root *root) } } +enum btrfs_trans_type { + TRANS_START, + TRANS_JOIN, + TRANS_USERSPACE, +}; + static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - int num_blocks, int wait) + int num_blocks, int type) { struct btrfs_trans_handle *h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); @@ -172,7 +178,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, mutex_lock(&root->fs_info->trans_mutex); if (!root->fs_info->log_root_recovering && - ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) + ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || + type == TRANS_USERSPACE)) wait_current_trans(root); ret = join_transaction(root); BUG_ON(ret); @@ -186,7 +193,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->alloc_exclude_start = 0; h->delayed_ref_updates = 0; - if (!current->journal_info) + if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; root->fs_info->running_transaction->use_count++; @@ -198,18 +205,18 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_blocks) { - return start_transaction(root, num_blocks, 1); + return start_transaction(root, num_blocks, TRANS_START); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, int num_blocks) { - return start_transaction(root, num_blocks, 0); + return start_transaction(root, num_blocks, TRANS_JOIN); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, int num_blocks) { - return start_transaction(r, num_blocks, 2); + return start_transaction(r, num_blocks, TRANS_USERSPACE); } /* wait for a transaction commit to be fully complete */ -- cgit v1.2.3 From 01dea1efc23b511d3b58bb94da07ddb6d6db9895 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: fix how we set max_size for free space clusters This patch fixes a problem where max_size can be set to 0 even though we filled the cluster properly. We set max_size to 0 if we restart the cluster window, but if the new start entry is big enough to be our new cluster then we could return with a max_size set to 0, which will mean the next time we try to allocate from this cluster it will fail. So set max_extent to the entry's size. Tested this on my box and now we actually allocate from the cluster after we fill it. 
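To make the failure mode concrete, here is a toy model of the size check; the numbers are invented and the real comparison lives in the cluster allocation path, so treat this purely as an illustration of why a zero max_size poisons an otherwise full cluster:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t entry_bytes = 4 * 1024 * 1024;	/* free-space entry that restarts the window */
	uint64_t request     = 4096;		/* a later 4 KiB allocation from the cluster */

	uint64_t buggy_max = 0;			/* old code: max_extent = 0 on restart */
	uint64_t fixed_max = entry_bytes;	/* new code: max_extent = entry->bytes */

	printf("buggy: 4 KiB request %s despite 4 MiB free\n",
	       request > buggy_max ? "refused" : "served");
	printf("fixed: 4 KiB request %s\n",
	       request > fixed_max ? "refused" : "served");
	return 0;
}
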
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5c2caad7621..cb2849f0325 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1296,7 +1296,7 @@ again: window_start = entry->offset; window_free = entry->bytes; last = entry; - max_extent = 0; + max_extent = entry->bytes; } else { last = next; window_free += next->bytes; -- cgit v1.2.3 From 5df6a9f606bf2ee25ab8031bff124ed883b823be Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: fix some metadata enospc issues We weren't reserving metadata space for rename, rmdir and unlink, which could cause problems. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d8393ddc72a..bb7fd807280 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2488,7 +2488,19 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) root = BTRFS_I(dir)->root; + /* + * 5 items for unlink inode + * 1 for orphan + */ + ret = btrfs_reserve_metadata_space(root, 6); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_unreserve_metadata_space(root, 6); + return PTR_ERR(trans); + } btrfs_set_trans_block_group(trans, dir); @@ -2503,6 +2515,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 6); btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2583,7 +2596,16 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -ENOTEMPTY; + ret = btrfs_reserve_metadata_space(root, 5); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_unreserve_metadata_space(root, 5); + return PTR_ERR(trans); + } + btrfs_set_trans_block_group(trans, dir); if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { @@ -2606,6 +2628,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) out: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 5); btrfs_btree_balance_dirty(root, nr); if (ret && !err) @@ -5297,11 +5320,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, return -ENOTEMPTY; /* - * 2 items for dir items - * 1 item for orphan entry - * 1 item for ref + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. 
*/ - ret = btrfs_reserve_metadata_space(root, 4); + ret = btrfs_reserve_metadata_space(root, 11); if (ret) return ret; @@ -5417,7 +5443,7 @@ out_fail: if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); - btrfs_unreserve_metadata_space(root, 4); + btrfs_unreserve_metadata_space(root, 11); return ret; } -- cgit v1.2.3 From df66916e71231e9f2377cac9c5c1e2d190f9a427 Mon Sep 17 00:00:00 2001 From: Li Dongyang Date: Fri, 6 Nov 2009 14:33:01 +0000 Subject: Btrfs: skip btrfs_release_path in btrfs_update_root and btrfs_del_root We don't need to call btrfs_release_path because btrfs_free_path will do that for us. Signed-off-by: Li Dongyang Signed-off-by: Chris Mason --- fs/btrfs/root-tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 9351428f30e..67fa2d29d66 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -159,7 +159,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root write_extent_buffer(l, item, ptr, sizeof(*item)); btrfs_mark_buffer_dirty(path->nodes[0]); out: - btrfs_release_path(root, path); btrfs_free_path(path); return ret; } @@ -332,7 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, BUG_ON(refs != 0); ret = btrfs_del_item(trans, root, path); out: - btrfs_release_path(root, path); btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 4eb3991c5def39bcf553c14ebe2618fcb47b627f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 10 Nov 2009 09:01:43 +0000 Subject: Btrfs: avoid null deref in unpin_extent_cache() I re-orderred the checks to avoid dereferencing "em" if it was null. Found by smatch static checker. Signed-off-by: Dan Carpenter Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2c726b7b9fa..ccbdcb54ec5 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -208,7 +208,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); - WARN_ON(em->start != start || !em); + WARN_ON(!em || em->start != start); if (!em) goto out; -- cgit v1.2.3 From ccf0e72537a9f68611ca575121afd08e2b4d0fb0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: find ideal block group for caching This patch changes a few things. Hopefully the comments are helpfull, but I'll try and be as verbose here. Problem: My fedora box was taking 1 minute and 21 seconds to boot with btrfs as root. Part of this problem was we pick the first block group we can find and start caching it, even if it may not have enough free space. The other problem is we only search for cached block groups the first time around, which we won't find any cached block groups because this is a newly mounted fs, so we end up caching several block groups during bootup, which with alot of fragmentation takes around 30-45 seconds to complete, which bogs down the system. So Solution: 1) Don't cache block groups willy-nilly at first. Instead try and figure out which block group has the most free, and therefore will take the least amount of time to cache. 2) Don't be so picky about cached block groups. 
The other problem is once we've filled up a cluster, if the block group isn't finished caching the next time we try and do the allocation we'll completely ignore the cluster and start searching from the beginning of the space, which makes us cache more block groups, which slows us down even more. So instead of skipping block groups that are not finished caching when we have a hint, only skip the block group if it hasn't started caching yet. There is one other tweak in here. Before if we allocated a chunk and still couldn't find new space, we'd end up switching the space info to force another chunk allocation. This could make us end up with way too many chunks, so keep track of this particular case. With this patch and my previous cluster fixes my fedora box now boots in 43 seconds, and according to the bootchart is not held up by our block group caching at all. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 109 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c56f91639dc..2a4cdceeb57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4101,7 +4101,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) } enum btrfs_loop_type { - LOOP_CACHED_ONLY = 0, + LOOP_FIND_IDEAL = 0, LOOP_CACHING_NOWAIT = 1, LOOP_CACHING_WAIT = 2, LOOP_ALLOC_CHUNK = 3, @@ -4130,12 +4130,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group = NULL; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; + int done_chunk_alloc = 0; struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; + u64 ideal_cache_percent = 0; + u64 ideal_cache_offset = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -4171,14 +4174,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { +ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. + * + * However if we are re-searching with an ideal block group + * picked out then we don't care that the block group is cached. 
*/ if (block_group && block_group_bits(block_group, data) && - block_group_cache_done(block_group)) { + (block_group->cached != BTRFS_CACHE_NO || + search_start == ideal_cache_offset)) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -4190,13 +4198,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, */ btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); - } else + } else { goto have_block_group; + } } else if (block_group) { btrfs_put_block_group(block_group); } } - search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups, list) { @@ -4208,28 +4216,45 @@ search: have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + u64 free_percent; + + free_percent = btrfs_block_group_used(&block_group->item); + free_percent *= 100; + free_percent = div64_u64(free_percent, + block_group->key.offset); + free_percent = 100 - free_percent; + if (free_percent > ideal_cache_percent && + likely(!block_group->ro)) { + ideal_cache_offset = block_group->key.objectid; + ideal_cache_percent = free_percent; + } + /* - * we want to start caching kthreads, but not too many - * right off the bat so we don't overwhelm the system, - * so only start them if there are less than 2 and we're - * in the initial allocation phase. + * We only want to start kthread caching if we are at + * the point where we will wait for caching to make + * progress, or if our ideal search is over and we've + * found somebody to start caching. */ if (loop > LOOP_CACHING_NOWAIT || - atomic_read(&space_info->caching_threads) < 2) { + (loop > LOOP_FIND_IDEAL && + atomic_read(&space_info->caching_threads) < 2)) { ret = cache_block_group(block_group); BUG_ON(ret); } - } - - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) { found_uncached_bg = true; - /* if we only want cached bgs, loop */ - if (loop == LOOP_CACHED_ONLY) + /* + * If loop is set for cached only, try the next block + * group. + */ + if (loop == LOOP_FIND_IDEAL) goto loop; } + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) + found_uncached_bg = true; + if (unlikely(block_group->ro)) goto loop; @@ -4409,9 +4434,11 @@ loop: } up_read(&space_info->groups_sem); - /* LOOP_CACHED_ONLY, only search fully cached block groups - * LOOP_CACHING_NOWAIT, search partially cached block groups, but - * dont wait foR them to finish caching + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for + * for them to make caching progress. Also + * determine the best possible bg to cache + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try @@ -4420,12 +4447,47 @@ loop: if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && (found_uncached_bg || empty_size || empty_cluster || allowed_chunk_alloc)) { - if (found_uncached_bg) { + if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; - if (loop < LOOP_CACHING_WAIT) { - loop++; + loop++; + if (!ideal_cache_percent && + atomic_read(&space_info->caching_threads)) goto search; - } + + /* + * 1 of the following 2 things have happened so far + * + * 1) We found an ideal block group for caching that + * is mostly full and will cache quickly, so we might + * as well wait for it. 
+ * + * 2) We searched for cached only and we didn't find + * anything, and we didn't start any caching kthreads + * either, so chances are we will loop through and + * start a couple caching kthreads, and then come back + * around and just wait for them. This will be slower + * because we will have 2 caching kthreads reading at + * the same time when we could have just started one + * and waited for it to get far enough to give us an + * allocation, so go ahead and go to the wait caching + * loop. + */ + loop = LOOP_CACHING_WAIT; + search_start = ideal_cache_offset; + ideal_cache_percent = 0; + goto ideal_cache; + } else if (loop == LOOP_FIND_IDEAL) { + /* + * Didn't find a uncached bg, wait on anything we find + * next. + */ + loop = LOOP_CACHING_WAIT; + goto search; + } + + if (loop < LOOP_CACHING_WAIT) { + loop++; + goto search; } if (loop == LOOP_ALLOC_CHUNK) { @@ -4437,7 +4499,8 @@ loop: ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, data, 1); allowed_chunk_alloc = 0; - } else { + done_chunk_alloc = 1; + } else if (!done_chunk_alloc) { space_info->force_alloc = 1; } -- cgit v1.2.3 From f5a84ee3cdd88d96b7bcede10af58598ad8d52a7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Nov 2009 21:23:48 -0500 Subject: Btrfs: fallback on uncompressed io if compressed io fails Currently compressed IO does not deal with not having its entire extent able to be allocated. So if we have enough free space to allocate for the extent, but its not contiguous, it will fail spectacularly. This patch fixes this by falling back on uncompressed IO which lets us spread the delalloc extent across multiple extents. I tested this by making us randomly think the reservation had failed to make it fallback on the uncompressed io way and it seemed to work fine. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bb7fd807280..d3d7d46a6af 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -538,7 +538,7 @@ static noinline int submit_compressed_extents(struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree; - int ret; + int ret = 0; if (list_empty(&async_cow->extents)) return 0; @@ -552,6 +552,7 @@ static noinline int submit_compressed_extents(struct inode *inode, io_tree = &BTRFS_I(inode)->io_tree; +retry: /* did the compression code fall back to uncompressed IO? */ if (!async_extent->pages) { int page_started = 0; @@ -562,11 +563,11 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->ram_size - 1, GFP_NOFS); /* allocate blocks */ - cow_file_range(inode, async_cow->locked_page, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0); + ret = cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); /* * if page_started, cow_file_range inserted an @@ -574,7 +575,7 @@ static noinline int submit_compressed_extents(struct inode *inode, * and IO for us. Otherwise, we need to submit * all those pages down to the drive. 
*/ - if (!page_started) + if (!page_started && !ret) extent_write_locked_range(io_tree, inode, async_extent->start, async_extent->start + @@ -602,7 +603,21 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->compressed_size, 0, alloc_hint, (u64)-1, &ins, 1); - BUG_ON(ret); + if (ret) { + int i; + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + page_cache_release(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + goto retry; + } + em = alloc_extent_map(GFP_NOFS); em->start = async_extent->start; em->len = async_extent->ram_size; -- cgit v1.2.3 From 33b258086441dd07e00133c79fcd8cbc6a76d737 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 11 Nov 2009 10:16:57 -0500 Subject: Btrfs: allow more metadata chunk preallocation On an FS where all of the space has not been allocated into chunks yet, the enospc can return enospc just because the existing metadata chunks are full. We get around this by allowing more metadata chunks to be allocated up to a certain limit, and finding the right limit is a little fuzzy. The problem is the reservations for delalloc would preallocate way too much of the FS as metadata. We need to start saying no and just force some IO to happen. But we also need to let a reasonable amount of the FS become metadata. This bumps the hard limit up, later releases will have a better system. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2a4cdceeb57..8d1fd6dc22a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2976,10 +2976,10 @@ static int maybe_allocate_chunk(struct btrfs_root *root, free_space = btrfs_super_total_bytes(disk_super); /* - * we allow the metadata to grow to a max of either 5gb or 5% of the + * we allow the metadata to grow to a max of either 10gb or 5% of the * space in the volume. */ - min_metadata = min((u64)5 * 1024 * 1024 * 1024, + min_metadata = min((u64)10 * 1024 * 1024 * 1024, div64_u64(free_space * 5, 100)); if (info->total_bytes >= min_metadata) { spin_unlock(&info->lock); -- cgit v1.2.3 From a6dbd429d8dd3382bbd9594b8d2ec74843a260d9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 Nov 2009 15:53:34 -0500 Subject: Btrfs: fix panic when trying to destroy a newly allocated There is a problem where iget5_locked will look for an inode, not find it, and then subsequently try to allocate it. Another CPU will have raced in and allocated the inode instead, so when iget5_locked gets the inode spin lock again and does a search, it finds the new inode. So it goes ahead and calls destroy_inode on the inode it just allocated. The problem is we don't set BTRFS_I(inode)->root until the new inode is completely initialized. This patch makes us set root to NULL when alloc'ing a new inode, so when we get to btrfs_destroy_inode and we see that root is NULL we can just free up the memory and continue on. 
This fixes the panic http://www.kerneloops.org/submitresult.php?number=812690 Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3d7d46a6af..ee92801fc5d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5180,6 +5180,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->logged_trans = 0; ei->outstanding_extents = 0; ei->reserved_extents = 0; + ei->root = NULL; spin_lock_init(&ei->accounting_lock); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); @@ -5195,6 +5196,14 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + /* + * This can happen where we create an inode, but somebody else also + * created the same inode and we need to destroy the one we already + * created. + */ + if (!root) + goto free; + /* * Make sure we're properly removed from the ordered operation * lists. @@ -5230,6 +5239,7 @@ void btrfs_destroy_inode(struct inode *inode) } inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); +free: kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -- cgit v1.2.3 From ff5e4b51a397568c6a2901903c35a4bfaaf752a4 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Thu, 12 Nov 2009 09:53:50 +0100 Subject: fs/jbd: Export log_start_commit to fix ext3 build. This fixes: ERROR: "log_start_commit" [fs/ext3/ext3.ko] undefined! Signed-off-by: Stefan Schmidt --- fs/jbd/journal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 49d5cd6053c..4160afad6d0 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -73,6 +73,7 @@ EXPORT_SYMBOL(journal_errno); EXPORT_SYMBOL(journal_ack_err); EXPORT_SYMBOL(journal_clear_err); EXPORT_SYMBOL(log_wait_commit); +EXPORT_SYMBOL(log_start_commit); EXPORT_SYMBOL(journal_start_commit); EXPORT_SYMBOL(journal_force_commit_nested); EXPORT_SYMBOL(journal_wipe); -- cgit v1.2.3 From 29f12ca32122db98481150be09d35bd72b68045e Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 11 Nov 2009 14:26:32 -0800 Subject: pidns: fix a leak in /proc dentries and inodes with pid namespaces. Daniel Lezcano reported a leak in 'struct pid' and 'struct pid_namespace' that is discussed in: http://lkml.org/lkml/2009/10/2/159. To summarize the thread, when container-init is terminated, it sets the PF_EXITING flag, zaps other processes in the container and waits to reap them. As a part of reaping, the container-init should flush any /proc dentries associated with the processes. But because the container-init is itself exiting and the following PF_EXITING check, the dentries are not flushed, resulting in leak in /proc inodes and dentries. This fix reverts the commit 7766755a2f249e7e0 ("Fix /proc dcache deadlock in do_exit") which introduced the check for PF_EXITING. At the time of the commit, shrink_dcache_parent() flushed dentries from other filesystems also and could have caused a deadlock which the commit fixed. But as pointed out by Eric Biederman, after commit 0feae5c47aabdde59, shrink_dcache_parent() no longer affects other filesystems. So reverting the commit is now safe. As pointed out by Jan Kara, the leak is not as critical since the unclaimed space will be reclaimed under memory pressure or by: echo 3 > /proc/sys/vm/drop_caches But since this check is no longer required, its best to remove it. 
Signed-off-by: Sukadev Bhattiprolu Reported-by: Daniel Lezcano Acked-by: Eric W. Biederman Acked-by: Jan Kara Cc: Andrea Arcangeli Cc: Serge Hallyn Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 837469a9659..af643b5aefe 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2597,8 +2597,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { - if (!(current->flags & PF_EXITING)) - shrink_dcache_parent(dentry); + shrink_dcache_parent(dentry); d_drop(dentry); dput(dentry); } -- cgit v1.2.3 From 7779d7bed950a7fb1af4f540c2f82a6b81b65901 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Nov 2009 14:26:34 -0800 Subject: fs: add missing compat_ptr handling for FS_IOC_RESVSP ioctl For FS_IOC_RESVSP and FS_IOC_RESVSP64 compat_sys_ioctl() uses its arg argument as a pointer to userspace. However it is missing a a call to compat_ptr() which will do a proper pointer conversion. This was introduced with 3e63cbb1 "fs: Add new pre-allocation ioctls to vfs for compatibility with legacy xfs ioctls". Signed-off-by: Heiko Carstens Cc: Ankit Jain Acked-by: Christoph Hellwig Cc: Al Viro Acked-by: Arnd Bergmann Acked-by: David S. Miller Cc: [2.6.31.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f91fd51b32e..d84e7058c29 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1800,7 +1800,7 @@ struct space_resv_32 { /* just account for different alignment */ static int compat_ioctl_preallocate(struct file *file, unsigned long arg) { - struct space_resv_32 __user *p32 = (void __user *)arg; + struct space_resv_32 __user *p32 = compat_ptr(arg); struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || @@ -2802,7 +2802,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, #else case FS_IOC_RESVSP: case FS_IOC_RESVSP64: - error = ioctl_preallocate(filp, (void __user *)arg); + error = ioctl_preallocate(filp, compat_ptr(arg)); goto out_fput; #endif -- cgit v1.2.3 From fc63cf237078c86214abcb2ee9926d8ad289da9b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 11 Nov 2009 14:26:48 -0800 Subject: exec: setup_arg_pages() fails to return errors In setup_arg_pages we work hard to assign a value to ret, but on exit we always return 0. Also remove a now duplicated exit path and branch to out_unlock instead. Signed-off-by: Anton Blanchard Acked-by: Serge Hallyn Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index d49be6bc179..ba112bd4a33 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -624,10 +624,8 @@ int setup_arg_pages(struct linux_binprm *bprm, /* Move stack pages down in memory. 
*/ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); - if (ret) { - up_write(&mm->mmap_sem); - return ret; - } + if (ret) + goto out_unlock; } #ifdef CONFIG_STACK_GROWSUP @@ -641,7 +639,7 @@ int setup_arg_pages(struct linux_binprm *bprm, out_unlock: up_write(&mm->mmap_sem); - return 0; + return ret; } EXPORT_SYMBOL(setup_arg_pages); -- cgit v1.2.3 From e04b5ef8b49db87d01a9b3a47fe41a918a0c0ff5 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 11 Nov 2009 14:26:55 -0800 Subject: __generic_block_fiemap(): fix for files bigger than 4GB Because of an integer overflow on start_blk, various kind of wrong results would be returned by the generic_block_fiemap() handler, such as no extents when there is a 4GB+ hole at the beginning of the file, or wrong fe_logical when an extent starts after the first 4GB. Signed-off-by: Mike Hommey Cc: Alexander Viro Cc: Steven Whitehouse Cc: Theodore Ts'o Cc: Eric Sandeen Cc: Josef Bacik Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 7b17a14396f..6c751106c2e 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -254,7 +254,7 @@ int __generic_block_fiemap(struct inode *inode, u64 len, get_block_t *get_block) { struct buffer_head tmp; - unsigned int start_blk; + unsigned long long start_blk; long long length = 0, map_len = 0; u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; -- cgit v1.2.3 From c1ea985c710f41e97f1c72c29bbf367375370f0b Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 12 Nov 2009 00:13:32 +0900 Subject: nilfs2: fix lock order reversal in chcp operation Will fix the following lock order reversal lockdep detected: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.32-rc6 #7 ------------------------------------------------------- chcp/30157 is trying to acquire lock: (&nilfs->ns_mount_mutex){+.+.+.}, at: [] nilfs_cpfile_change_cpmode+0x46/0x752 [nilfs2] but task is already holding lock: (&nilfs->ns_segctor_sem){++++.+}, at: [] nilfs_transaction_begin+0xba/0x110 [nilfs2] which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (&nilfs->ns_segctor_sem){++++.+}: [] __lock_acquire+0x109c/0x139d [] lock_acquire+0x89/0xa0 [] down_read+0x31/0x45 [] nilfs_attach_checkpoint+0x8f/0x16b [nilfs2] [] nilfs_get_sb+0x3e7/0x653 [nilfs2] [] vfs_kern_mount+0x8b/0x124 [] do_kern_mount+0x37/0xc3 [] do_mount+0x64d/0x69d [] sys_mount+0x66/0x95 [] sysenter_do_call+0x12/0x32 -> #1 (&type->s_umount_key#31/1){+.+.+.}: [] __lock_acquire+0x109c/0x139d [] lock_acquire+0x89/0xa0 [] down_write_nested+0x34/0x52 [] sget+0x22e/0x389 [] nilfs_get_sb+0x187/0x653 [nilfs2] [] vfs_kern_mount+0x8b/0x124 [] do_kern_mount+0x37/0xc3 [] do_mount+0x64d/0x69d [] sys_mount+0x66/0x95 [] sysenter_do_call+0x12/0x32 -> #0 (&nilfs->ns_mount_mutex){+.+.+.}: [] __lock_acquire+0xe27/0x139d [] lock_acquire+0x89/0xa0 [] mutex_lock_nested+0x41/0x23e [] nilfs_cpfile_change_cpmode+0x46/0x752 [nilfs2] [] nilfs_ioctl+0x11a/0x7da [nilfs2] [] vfs_ioctl+0x27/0x6e [] do_vfs_ioctl+0x491/0x4db [] sys_ioctl+0x45/0x5f [] sysenter_do_call+0x12/0x32 Signed-off-by: Ryusuke Konishi --- fs/nilfs2/cpfile.c | 2 -- fs/nilfs2/ioctl.c | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 1c6cfb59128..3f5d5d06f53 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -871,7 +871,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) * exclusive with a new mount job. Though it doesn't cover * umount, it's enough for the purpose. */ - mutex_lock(&nilfs->ns_mount_mutex); if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { /* Current implementation does not have to protect plain read-only mounts since they are exclusive @@ -880,7 +879,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) ret = -EBUSY; } else ret = nilfs_cpfile_clear_snapshot(cpfile, cno); - mutex_unlock(&nilfs->ns_mount_mutex); return ret; case NILFS_SNAPSHOT: return nilfs_cpfile_set_snapshot(cpfile, cno); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index d24057d58f1..f6af76042d8 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -99,7 +99,8 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { - struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct inode *cpfile = nilfs->ns_cpfile; struct nilfs_transaction_info ti; struct nilfs_cpmode cpmode; int ret; @@ -109,14 +110,17 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, if (copy_from_user(&cpmode, argp, sizeof(cpmode))) return -EFAULT; + mutex_lock(&nilfs->ns_mount_mutex); nilfs_transaction_begin(inode->i_sb, &ti, 0); ret = nilfs_cpfile_change_cpmode( cpfile, cpmode.cm_cno, cpmode.cm_mode); if (unlikely(ret < 0)) { nilfs_transaction_abort(inode->i_sb); + mutex_unlock(&nilfs->ns_mount_mutex); return ret; } nilfs_transaction_commit(inode->i_sb); /* never fails */ + mutex_unlock(&nilfs->ns_mount_mutex); return ret; } -- cgit v1.2.3 From 7aee47b0bb9f93baecdbea205e878fe0f155f7da Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Fri, 6 Nov 2009 14:50:22 -0800 Subject: ocfs2: Trivial cleanup of jbd compatibility layer removal Mainline commit 53ef99cad9878f02f27bb30bc304fc42af8bdd6e removed the JBD compatibility layer from OCFS2. This patch removes the last remaining remnants of that. 
Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2.h | 7 +------ fs/ocfs2/uptodate.c | 5 ----- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index eae40460242..d963d863870 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -35,12 +35,7 @@ #include #include #include -#ifndef CONFIG_OCFS2_COMPAT_JBD -# include -#else -# include -# include "ocfs2_jbd_compat.h" -#endif +#include /* For union ocfs2_dlm_lksb */ #include "stackglue.h" diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index b6284f235d2..c61369342a2 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -53,11 +53,6 @@ #include #include #include -#ifndef CONFIG_OCFS2_COMPAT_JBD -# include -#else -# include -#endif #define MLOG_MASK_PREFIX ML_UPTODATE -- cgit v1.2.3 From 479c2553af9a176a0613894b9f1ec73425fd56a3 Mon Sep 17 00:00:00 2001 From: Petr Vandrovec Date: Sat, 14 Nov 2009 10:47:07 +0100 Subject: Fix memory corruption caused by nfsd readdir+ Commit 8177e6d6dfb9cd03d9bdeb647c32161f8f58f686 ("nfsd: clean up readdirplus encoding") introduced single character typo in nfs3 readdir+ implementation. Unfortunately that typo has quite bad side effects: random memory corruption, followed (on my box) with immediate spontaneous box reboot. Using 'p1' instead of 'p' fixes my Linux box rebooting whenever VMware ESXi box tries to list contents of my home directory. Signed-off-by: Petr Vandrovec Cc: "J. Bruce Fields" Cc: Neil Brown Signed-off-by: Linus Torvalds --- fs/nfsd/nfs3xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index edf926e1062..d0a2ce1b432 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -958,7 +958,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p1 = encode_entry_baggage(cd, p1, name, namlen, ino); if (plus) - p = encode_entryplus_baggage(cd, p1, name, namlen); + p1 = encode_entryplus_baggage(cd, p1, name, namlen); /* determine entry word length and lengths to go in pages */ num_entry_words = p1 - tmp; -- cgit v1.2.3 From 18dafac1a4c6c88867a50f9a82492976f20383d6 Mon Sep 17 00:00:00 2001 From: Jiro SEKIBA Date: Sun, 15 Nov 2009 13:49:45 +0900 Subject: nilfs2: deleted inconsistent comment in nilfs_load_inode_block() The comment says, "Caller of this function MUST lock s_inode_lock", however just above the comment, it locks s_inode_lock in the function. Signed-off-by: Jiro SEKIBA Signed-off-by: Ryusuke Konishi --- fs/nilfs2/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 5040220c373..2a0a5a3ac13 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -664,7 +664,6 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, int err; spin_lock(&sbi->s_inode_lock); - /* Caller of this function MUST lock s_inode_lock */ if (ii->i_bh == NULL) { spin_unlock(&sbi->s_inode_lock); err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, -- cgit v1.2.3 From f534dc994397560343be4a3223b9bbaa8e739e1f Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Mon, 16 Nov 2009 12:03:16 +0530 Subject: cifs: clear server inode number flag while autodisabling Fix the commit ec06aedd44 that intended to turn off querying for server inode numbers when server doesn't consistently support inode numbers. Presumably the commit didn't actually clear the CIFS_MOUNT_SERVER_INUM flag, perhaps a typo. 
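The one-character difference matters because masking with the flag itself keeps the bit set (and throws away every other mount flag), while masking with its complement clears only that bit. A standalone illustration follows; the flag value is made up for the demo and is not the real cifs_sb definition:

#include <stdio.h>

#define DEMO_SERVER_INUM 0x08		/* hypothetical bit, for illustration only */

int main(void)
{
	unsigned int flags = 0x0f;			/* several mount flags set, including the demo bit */
	unsigned int buggy = flags & DEMO_SERVER_INUM;	/* keeps only that bit - it is still set */
	unsigned int fixed = flags & ~DEMO_SERVER_INUM;	/* clears just that bit */

	printf("before: %#x\n", flags);	/* 0xf */
	printf("buggy : %#x\n", buggy);	/* 0x8 - serverino still enabled, other flags lost */
	printf("fixed : %#x\n", fixed);	/* 0x7 - serverino cleared, other flags preserved */
	return 0;
}
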
Signed-off-by: Suresh Jayaraman Acked-by: Jeff Layton Cc: Stable Signed-off-by: Steve French --- fs/cifs/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1e25efcb55c..d27d4ec6579 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -720,7 +720,7 @@ void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - cifs_sb->mnt_cifs_flags &= CIFS_MOUNT_SERVER_INUM; + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; cERROR(1, ("Autodisabling the use of server inode numbers on " "%s. This server doesn't seem to support them " "properly. Hardlinks will not be recognized on this " -- cgit v1.2.3 From 8ec6dba2581754e375be66f7bedd708d856d8b30 Mon Sep 17 00:00:00 2001 From: Jan Rekorajski Date: Mon, 16 Nov 2009 11:57:02 +0000 Subject: XFS bug in log recover with quota (bugzilla id 855) Hi, I was hit by a bug in linux 2.6.31 when XFS is not able to recover the log after a crash if fs was mounted with quotas. Gory details in XFS bugzilla: http://oss.sgi.com/bugzilla/show_bug.cgi?id=855. It looks like wrong struct is used in buffer length check, and the following patch should fix the problem. xfs_dqblk_t has a size of 104+32 bytes, while xfs_disk_dquot_t is 104 bytes long, and this is exactly what I see in system logs - "XFS: dquot too small (104) in xlog_recover_do_dquot_trans." Signed-off-by: Jan Rekorajski Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_log_recover.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1099395d7d6..fb17f8226b0 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1980,7 +1980,7 @@ xlog_recover_do_reg_buffer( "XFS: NULL dquot in %s.", __func__); goto next; } - if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) { + if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { cmn_err(CE_ALERT, "XFS: dquot too small (%d) in %s.", item->ri_buf[i].i_len, __func__); @@ -2635,7 +2635,7 @@ xlog_recover_do_dquot_trans( "XFS: NULL dquot in %s.", __func__); return XFS_ERROR(EIO); } - if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) { + if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { cmn_err(CE_ALERT, "XFS: dquot too small (%d) in %s.", item->ri_buf[1].i_len, __func__); -- cgit v1.2.3 From 6c06f072c2d797ddbb2270363de97c53ebbe0385 Mon Sep 17 00:00:00 2001 From: "Nathaniel W. Turner" Date: Mon, 16 Nov 2009 19:51:48 +0000 Subject: xfs: copy li_lsn before dropping AIL lock Access to log items on the AIL is generally protected by m_ail_lock; this is particularly needed when we're getting or setting the 64-bit li_lsn on a 32-bit platform. This patch fixes a couple places where we were accessing the log item after dropping the AIL lock on 32-bit machines. This can result in a partially-zeroed log->l_tail_lsn if xfs_trans_ail_delete is racing with xfs_trans_ail_update, and in at least some cases, this can leave the l_tail_lsn with a zero cycle number, which means xlog_space_left will think the log is full (unless CONFIG_XFS_DEBUG is set, in which case we'll trip an ASSERT), leading to processes stuck forever in xlog_grant_log_space. Thanks to Adrian VanderSpek for first spotting the race potential and to Dave Chinner for debug assistance. Signed-off-by: Nathaniel W. 
Turner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_trans_ail.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index f31271c30de..2ffc570679b 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -467,6 +467,7 @@ xfs_trans_ail_update( { xfs_log_item_t *dlip = NULL; xfs_log_item_t *mlip; /* ptr to minimum lip */ + xfs_lsn_t tail_lsn; mlip = xfs_ail_min(ailp); @@ -483,8 +484,16 @@ xfs_trans_ail_update( if (mlip == dlip) { mlip = xfs_ail_min(ailp); + /* + * It is not safe to access mlip after the AIL lock is + * dropped, so we must get a copy of li_lsn before we do + * so. This is especially important on 32-bit platforms + * where accessing and updating 64-bit values like li_lsn + * is not atomic. + */ + tail_lsn = mlip->li_lsn; spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); } else { spin_unlock(&ailp->xa_lock); } @@ -514,6 +523,7 @@ xfs_trans_ail_delete( { xfs_log_item_t *dlip; xfs_log_item_t *mlip; + xfs_lsn_t tail_lsn; if (lip->li_flags & XFS_LI_IN_AIL) { mlip = xfs_ail_min(ailp); @@ -527,9 +537,16 @@ xfs_trans_ail_delete( if (mlip == dlip) { mlip = xfs_ail_min(ailp); + /* + * It is not safe to access mlip after the AIL lock + * is dropped, so we must get a copy of li_lsn + * before we do so. This is especially important + * on 32-bit platforms where accessing and updating + * 64-bit values like li_lsn is not atomic. + */ + tail_lsn = mlip ? mlip->li_lsn : 0; spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, - (mlip ? mlip->li_lsn : 0)); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); } else { spin_unlock(&ailp->xa_lock); } -- cgit v1.2.3 From 9ebd4eba761b624a6a6c9189335adeddcb1fa0e0 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Tue, 17 Nov 2009 14:06:23 -0800 Subject: procfs: fix /proc//stat stack pointer for kernel threads Fix a small issue for the stack pointer in /proc//stat. In case of a kernel thread the value of the printed stack pointer should be 0. Signed-off-by: Stefani Seibold Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 07f77a7945c..822c2d50651 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -571,7 +571,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, - (permitted) ? task->stack_start : 0, + (permitted && mm) ? task->stack_start : 0, esp, eip, /* The signal information here is obsolete. -- cgit v1.2.3 From 978b4053aefd422713f289f2a315ce2acba62018 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2009 14:06:24 -0800 Subject: fcntl: rename F_OWNER_GID to F_OWNER_PGRP This is for consistency with various ioctl() operations that include the suffix "PGRP" in their names, and also for consistency with PRIO_PGRP, used with setpriority() and getpriority(). Also, using PGRP instead of GID avoids confusion with the common abbreviation of "group ID". I'm fine with anything that makes it more consistent, and if PGRP is what is the predominant abbreviation then I see no need to further confuse matters by adding a third one. 
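For callers, the renamed constant is used through the extended owner API. A minimal usage sketch, assuming 2.6.32-era kernel and libc headers that already expose F_SETOWN_EX, struct f_owner_ex and the new F_OWNER_PGRP spelling:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	struct f_owner_ex owner;
	int fd = open("/dev/null", O_RDONLY);	/* any descriptor works for the demo */

	if (fd < 0)
		return 1;

	owner.type = F_OWNER_PGRP;	/* previously spelled F_OWNER_GID */
	owner.pid  = getpgrp();		/* deliver SIGIO/SIGURG to this process group */

	if (fcntl(fd, F_SETOWN_EX, &owner) == -1)
		perror("F_SETOWN_EX");

	close(fd);
	return 0;
}
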
Signed-off-by: Peter Zijlstra Acked-by: Michael Kerrisk Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index fc089f2f7f5..2cf93ec40a6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -284,7 +284,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg) type = PIDTYPE_PID; break; - case F_OWNER_GID: + case F_OWNER_PGRP: type = PIDTYPE_PGID; break; @@ -321,7 +321,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg) break; case PIDTYPE_PGID: - owner.type = F_OWNER_GID; + owner.type = F_OWNER_PGRP; break; default: -- cgit v1.2.3 From 3d7a641e544e428191667e8b1f83f96fa46dbd65 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:10:23 +0000 Subject: SLOW_WORK: Wait for outstanding work items belonging to a module to clear Wait for outstanding slow work items belonging to a module to clear when unregistering that module as a user of the facility. This prevents the put_ref code of a work item from being taken away before it returns. Signed-off-by: David Howells --- fs/fscache/main.c | 6 +++--- fs/fscache/object.c | 1 + fs/fscache/operation.c | 1 + fs/gfs2/main.c | 4 ++-- fs/gfs2/recovery.c | 1 + 5 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 4de41b59749..add6bdb53f0 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -48,7 +48,7 @@ static int __init fscache_init(void) { int ret; - ret = slow_work_register_user(); + ret = slow_work_register_user(THIS_MODULE); if (ret < 0) goto error_slow_work; @@ -80,7 +80,7 @@ error_kobj: error_cookie_jar: fscache_proc_cleanup(); error_proc: - slow_work_unregister_user(); + slow_work_unregister_user(THIS_MODULE); error_slow_work: return ret; } @@ -97,7 +97,7 @@ static void __exit fscache_exit(void) kobject_put(fscache_root); kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); - slow_work_unregister_user(); + slow_work_unregister_user(THIS_MODULE); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 392a41b1b79..d236eb1d6f3 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -45,6 +45,7 @@ static void fscache_enqueue_dependents(struct fscache_object *); static void fscache_dequeue_object(struct fscache_object *); const struct slow_work_ops fscache_object_slow_work_ops = { + .owner = THIS_MODULE, .get_ref = fscache_object_slow_work_get_ref, .put_ref = fscache_object_slow_work_put_ref, .execute = fscache_object_slow_work_execute, diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index e7f8d53b8b6..f1a2857b2ff 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -453,6 +453,7 @@ static void fscache_op_execute(struct slow_work *work) } const struct slow_work_ops fscache_op_slow_work_ops = { + .owner = THIS_MODULE, .get_ref = fscache_op_get_ref, .put_ref = fscache_op_put_ref, .execute = fscache_op_execute, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index eacd78a5d08..5b31f7741a8 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -114,7 +114,7 @@ static int __init init_gfs2_fs(void) if (error) goto fail_unregister; - error = slow_work_register_user(); + error = slow_work_register_user(THIS_MODULE); if (error) goto fail_slow; @@ -163,7 +163,7 @@ static void __exit exit_gfs2_fs(void) gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); - 
slow_work_unregister_user(); + slow_work_unregister_user(THIS_MODULE); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 59d2695509d..b2bb779f09e 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -593,6 +593,7 @@ fail: } struct slow_work_ops gfs2_recover_ops = { + .owner = THIS_MODULE, .get_ref = gfs2_recover_get_ref, .put_ref = gfs2_recover_put_ref, .execute = gfs2_recover_work, -- cgit v1.2.3 From 440f0affe247e9990c8f8778f1861da4fd7d5e50 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:01 +0000 Subject: FS-Cache: Annotate slow-work runqueue proc lines for FS-Cache work items Annotate slow-work runqueue proc lines for FS-Cache work items. Objects include the object ID and the state. Operations include the object ID, the operation ID and the operation type and state. Signed-off-by: David Howells --- fs/fscache/object.c | 41 ++++++++++++++++++++++++++++++++++++++++- fs/fscache/operation.c | 29 +++++++++++++++++++++++++++++ fs/fscache/page.c | 27 ++++++++++++++++++++++----- 3 files changed, 91 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index d236eb1d6f3..615b63dd9ec 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -14,9 +14,10 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include +#include #include "internal.h" -const char *fscache_object_states[] = { +const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { [FSCACHE_OBJECT_INIT] = "OBJECT_INIT", [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP", [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", @@ -33,9 +34,28 @@ const char *fscache_object_states[] = { }; EXPORT_SYMBOL(fscache_object_states); +static const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { + [FSCACHE_OBJECT_INIT] = "INIT", + [FSCACHE_OBJECT_LOOKING_UP] = "LOOK", + [FSCACHE_OBJECT_CREATING] = "CRTN", + [FSCACHE_OBJECT_AVAILABLE] = "AVBL", + [FSCACHE_OBJECT_ACTIVE] = "ACTV", + [FSCACHE_OBJECT_UPDATING] = "UPDT", + [FSCACHE_OBJECT_DYING] = "DYNG", + [FSCACHE_OBJECT_LC_DYING] = "LCDY", + [FSCACHE_OBJECT_ABORT_INIT] = "ABTI", + [FSCACHE_OBJECT_RELEASING] = "RELS", + [FSCACHE_OBJECT_RECYCLING] = "RCYC", + [FSCACHE_OBJECT_WITHDRAWING] = "WTHD", + [FSCACHE_OBJECT_DEAD] = "DEAD", +}; + static void fscache_object_slow_work_put_ref(struct slow_work *); static int fscache_object_slow_work_get_ref(struct slow_work *); static void fscache_object_slow_work_execute(struct slow_work *); +#ifdef CONFIG_SLOW_WORK_PROC +static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); +#endif static void fscache_initialise_object(struct fscache_object *); static void fscache_lookup_object(struct fscache_object *); static void fscache_object_available(struct fscache_object *); @@ -49,6 +69,9 @@ const struct slow_work_ops fscache_object_slow_work_ops = { .get_ref = fscache_object_slow_work_get_ref, .put_ref = fscache_object_slow_work_put_ref, .execute = fscache_object_slow_work_execute, +#ifdef CONFIG_SLOW_WORK_PROC + .desc = fscache_object_slow_work_desc, +#endif }; EXPORT_SYMBOL(fscache_object_slow_work_ops); @@ -326,6 +349,22 @@ static void fscache_object_slow_work_execute(struct slow_work *work) fscache_enqueue_object(object); } +/* + * describe an object for slow-work debugging + */ +#ifdef CONFIG_SLOW_WORK_PROC +static void fscache_object_slow_work_desc(struct slow_work *work, + struct seq_file *m) +{ + struct fscache_object *object = + container_of(work, struct 
fscache_object, work); + + seq_printf(m, "FSC: OBJ%x: %s", + object->debug_id, + fscache_object_states_short[object->state]); +} +#endif + /* * initialise an object * - check the specified object's parent to see if we can make use of it diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index f1a2857b2ff..91bbe6f0377 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -13,6 +13,7 @@ #define FSCACHE_DEBUG_LEVEL OPERATION #include +#include #include "internal.h" atomic_t fscache_op_debug_id; @@ -31,6 +32,8 @@ void fscache_enqueue_operation(struct fscache_operation *op) _enter("{OBJ%x OP%x,%u}", op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + fscache_set_op_state(op, "EnQ"); + ASSERT(op->processor != NULL); ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); ASSERTCMP(atomic_read(&op->usage), >, 0); @@ -67,6 +70,8 @@ EXPORT_SYMBOL(fscache_enqueue_operation); static void fscache_run_op(struct fscache_object *object, struct fscache_operation *op) { + fscache_set_op_state(op, "Run"); + object->n_in_progress++; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) wake_up_bit(&op->flags, FSCACHE_OP_WAITING); @@ -87,6 +92,8 @@ int fscache_submit_exclusive_op(struct fscache_object *object, _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); + fscache_set_op_state(op, "SubmitX"); + spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); @@ -190,6 +197,8 @@ int fscache_submit_op(struct fscache_object *object, ASSERTCMP(atomic_read(&op->usage), >, 0); + fscache_set_op_state(op, "Submit"); + spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); @@ -298,6 +307,8 @@ void fscache_put_operation(struct fscache_operation *op) if (!atomic_dec_and_test(&op->usage)) return; + fscache_set_op_state(op, "Put"); + _debug("PUT OP"); if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) BUG(); @@ -452,9 +463,27 @@ static void fscache_op_execute(struct slow_work *work) _leave(""); } +/* + * describe an operation for slow-work debugging + */ +#ifdef CONFIG_SLOW_WORK_PROC +static void fscache_op_desc(struct slow_work *work, struct seq_file *m) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, slow_work); + + seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx", + op->object->debug_id, op->debug_id, + op->name, op->state, op->flags); +} +#endif + const struct slow_work_ops fscache_op_slow_work_ops = { .owner = THIS_MODULE, .get_ref = fscache_op_get_ref, .put_ref = fscache_op_put_ref, .execute = fscache_op_execute, +#ifdef CONFIG_SLOW_WORK_PROC + .desc = fscache_op_desc, +#endif }; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 2568e0eb644..e8bbc395cef 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -63,14 +63,19 @@ static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *p static void fscache_attr_changed_op(struct fscache_operation *op) { struct fscache_object *object = op->object; + int ret; _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); fscache_stat(&fscache_n_attr_changed_calls); - if (fscache_object_is_active(object) && - object->cache->ops->attr_changed(object) < 0) - fscache_abort_object(object); + if (fscache_object_is_active(object)) { + fscache_set_op_state(op, "CallFS"); + ret = object->cache->ops->attr_changed(object); + fscache_set_op_state(op, "Done"); + if (ret < 0) + fscache_abort_object(object); + } _leave(""); } @@ -99,6 +104,7 
@@ int __fscache_attr_changed(struct fscache_cookie *cookie) fscache_operation_init(op, NULL); fscache_operation_init_slow(op, fscache_attr_changed_op); op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); + fscache_set_op_name(op, "Attr"); spin_lock(&cookie->lock); @@ -184,6 +190,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval( op->start_time = jiffies; INIT_WORK(&op->op.fast_work, fscache_retrieval_work); INIT_LIST_HEAD(&op->to_do); + fscache_set_op_name(&op->op, "Retr"); return op; } @@ -257,6 +264,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, _leave(" = -ENOMEM"); return -ENOMEM; } + fscache_set_op_name(&op->op, "RetrRA1"); spin_lock(&cookie->lock); @@ -369,6 +377,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, op = fscache_alloc_retrieval(mapping, end_io_func, context); if (!op) return -ENOMEM; + fscache_set_op_name(&op->op, "RetrRAN"); spin_lock(&cookie->lock); @@ -461,6 +470,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, op = fscache_alloc_retrieval(page->mapping, NULL, NULL); if (!op) return -ENOMEM; + fscache_set_op_name(&op->op, "RetrAL1"); spin_lock(&cookie->lock); @@ -529,6 +539,8 @@ static void fscache_write_op(struct fscache_operation *_op) _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); + fscache_set_op_state(&op->op, "GetPage"); + spin_lock(&cookie->lock); spin_lock(&object->lock); @@ -559,13 +571,17 @@ static void fscache_write_op(struct fscache_operation *_op) spin_unlock(&cookie->lock); if (page) { + fscache_set_op_state(&op->op, "Store"); ret = object->cache->ops->write_page(op, page); + fscache_set_op_state(&op->op, "EndWrite"); fscache_end_page_write(cookie, page); page_cache_release(page); - if (ret < 0) + if (ret < 0) { + fscache_set_op_state(&op->op, "Abort"); fscache_abort_object(object); - else + } else { fscache_enqueue_operation(&op->op); + } } _leave(""); @@ -634,6 +650,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_operation_init(&op->op, fscache_release_write_op); fscache_operation_init_slow(&op->op, fscache_write_op); op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); + fscache_set_op_name(&op->op, "Write1"); ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); if (ret < 0) -- cgit v1.2.3 From 4fbf4291aa15926cd4fdca0ffe9122e89d0459db Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:04 +0000 Subject: FS-Cache: Allow the current state of all objects to be dumped Allow the current state of all fscache objects to be dumped by doing: cat /proc/fs/fscache/objects By default, all objects and all fields will be shown. This can be restricted by adding a suitable key to one of the caller's keyrings (such as the session keyring): keyctl add user fscache:objlist "" @s The are: K Show hexdump of object key (don't show if not given) A Show hexdump of object aux data (don't show if not given) And paired restrictions: C Show objects that have a cookie c Show objects that don't have a cookie B Show objects that are busy b Show objects that aren't busy W Show objects that have pending writes w Show objects that don't have pending writes R Show objects that have outstanding reads r Show objects that don't have outstanding reads S Show objects that have slow work queued s Show objects that don't have slow work queued If neither side of a restriction pair is given, then both are implied. 
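As a rough illustration (not part of the patch text itself), the pairing rule reduces to a defaulting check inside fscache_objlist_config() in the new object-list.c below; sketched here for one pair:

	/* If neither letter of a pair was supplied, imply both, i.e. do not
	 * filter on that attribute at all. */
	if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE)))
		config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE;
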
For example: keyctl add user fscache:objlist KB @s shows objects that are busy, and lists their object keys, but does not dump their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is not implied. Signed-off-by: David Howells --- fs/cachefiles/interface.c | 1 + fs/cachefiles/rdwr.c | 6 +- fs/fscache/Kconfig | 7 + fs/fscache/Makefile | 1 + fs/fscache/cache.c | 1 + fs/fscache/cookie.c | 2 + fs/fscache/internal.h | 13 ++ fs/fscache/object-list.c | 432 ++++++++++++++++++++++++++++++++++++++++++++++ fs/fscache/object.c | 2 +- fs/fscache/operation.c | 3 + fs/fscache/page.c | 6 + fs/fscache/proc.c | 13 ++ 12 files changed, 484 insertions(+), 3 deletions(-) create mode 100644 fs/fscache/object-list.c (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 431accd475a..dd7f852746c 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -331,6 +331,7 @@ static void cachefiles_put_object(struct fscache_object *_object) } cache = object->fscache.cache; + fscache_object_destroy(&object->fscache); kmem_cache_free(cachefiles_object_jar, object); fscache_object_destroyed(cache); } diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index a69787e7dd9..3304646dae8 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -333,7 +333,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - op->op.flags = FSCACHE_OP_FAST; + op->op.flags &= FSCACHE_OP_KEEP_FLAGS; + op->op.flags |= FSCACHE_OP_FAST; op->op.processor = cachefiles_read_copier; pagevec_init(&pagevec, 0); @@ -639,7 +640,8 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, pagevec_init(&pagevec, 0); - op->op.flags = FSCACHE_OP_FAST; + op->op.flags &= FSCACHE_OP_KEEP_FLAGS; + op->op.flags |= FSCACHE_OP_FAST; op->op.processor = cachefiles_read_copier; INIT_LIST_HEAD(&backpages); diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 9bbb8ce7bea..864dac20a24 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -54,3 +54,10 @@ config FSCACHE_DEBUG enabled by setting bits in /sys/modules/fscache/parameter/debug. See Documentation/filesystems/caching/fscache.txt for more information. 
+ +config FSCACHE_OBJECT_LIST + bool "Maintain global object list for debugging purposes" + depends on FSCACHE && PROC_FS + help + Maintain a global list of active fscache objects that can be + retrieved through /proc/fs/fscache/objects for debugging purposes diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 91571b95aac..6d561531cb3 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -15,5 +15,6 @@ fscache-y := \ fscache-$(CONFIG_PROC_FS) += proc.o fscache-$(CONFIG_FSCACHE_STATS) += stats.o fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o +fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index e21985bbb1f..724384ef96d 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -263,6 +263,7 @@ int fscache_add_cache(struct fscache_cache *cache, spin_lock(&cache->object_list_lock); list_add_tail(&ifsdef->cache_link, &cache->object_list); spin_unlock(&cache->object_list_lock); + fscache_objlist_add(ifsdef); /* add the cache's netfs definition index object to the top level index * cookie as a known backing object */ diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 72fd18f6c71..9b518732823 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -349,6 +349,8 @@ static int fscache_attach_object(struct fscache_cookie *cookie, object->cookie = cookie; atomic_inc(&cookie->usage); hlist_add_head(&object->cookie_link, &cookie->backing_objects); + + fscache_objlist_add(object); ret = 0; cant_attach_object: diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 1c341304621..fe02973a951 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -88,10 +88,23 @@ extern int fscache_wait_bit_interruptible(void *); /* * object.c */ +extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5]; + extern void fscache_withdrawing_object(struct fscache_cache *, struct fscache_object *); extern void fscache_enqueue_object(struct fscache_object *); +/* + * object-list.c + */ +#ifdef CONFIG_FSCACHE_OBJECT_LIST +extern const struct file_operations fscache_objlist_fops; + +extern void fscache_objlist_add(struct fscache_object *); +#else +#define fscache_objlist_add(object) do {} while(0) +#endif + /* * operation.c */ diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c new file mode 100644 index 00000000000..e590242fa41 --- /dev/null +++ b/fs/fscache/object-list.c @@ -0,0 +1,432 @@ +/* Global fscache object list maintainer and viewer + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include +#include +#include "internal.h" + +static struct rb_root fscache_object_list; +static DEFINE_RWLOCK(fscache_object_list_lock); + +struct fscache_objlist_data { + unsigned long config; /* display configuration */ +#define FSCACHE_OBJLIST_CONFIG_KEY 0x00000001 /* show object keys */ +#define FSCACHE_OBJLIST_CONFIG_AUX 0x00000002 /* show object auxdata */ +#define FSCACHE_OBJLIST_CONFIG_COOKIE 0x00000004 /* show objects with cookies */ +#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008 /* show objects without cookies */ +#define FSCACHE_OBJLIST_CONFIG_BUSY 0x00000010 /* show busy objects */ +#define FSCACHE_OBJLIST_CONFIG_IDLE 0x00000020 /* show idle objects */ +#define FSCACHE_OBJLIST_CONFIG_PENDWR 0x00000040 /* show objects with pending writes */ +#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080 /* show objects without pending writes */ +#define FSCACHE_OBJLIST_CONFIG_READS 0x00000100 /* show objects with active reads */ +#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ +#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ +#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ +#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */ +#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */ + + u8 buf[512]; /* key and aux data buffer */ +}; + +/* + * Add an object to the object list + * - we use the address of the fscache_object structure as the key into the + * tree + */ +void fscache_objlist_add(struct fscache_object *obj) +{ + struct fscache_object *xobj; + struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; + + write_lock(&fscache_object_list_lock); + + while (*p) { + parent = *p; + xobj = rb_entry(parent, struct fscache_object, objlist_link); + + if (obj < xobj) + p = &(*p)->rb_left; + else if (obj > xobj) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&obj->objlist_link, parent, p); + rb_insert_color(&obj->objlist_link, &fscache_object_list); + + write_unlock(&fscache_object_list_lock); +} + +/** + * fscache_object_destroy - Note that a cache object is about to be destroyed + * @object: The object to be destroyed + * + * Note the imminent destruction and deallocation of a cache object record. 
+ */ +void fscache_object_destroy(struct fscache_object *obj) +{ + write_lock(&fscache_object_list_lock); + + BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); + rb_erase(&obj->objlist_link, &fscache_object_list); + + write_unlock(&fscache_object_list_lock); +} +EXPORT_SYMBOL(fscache_object_destroy); + +/* + * find the object in the tree on or after the specified index + */ +static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) +{ + struct fscache_object *pobj, *obj, *minobj = NULL; + struct rb_node *p; + unsigned long pos; + + if (*_pos >= (unsigned long) ERR_PTR(-ENOENT)) + return NULL; + pos = *_pos; + + /* banners (can't represent line 0 by pos 0 as that would involve + * returning a NULL pointer) */ + if (pos == 0) + return (struct fscache_object *) ++(*_pos); + if (pos < 3) + return (struct fscache_object *)pos; + + pobj = (struct fscache_object *)pos; + p = fscache_object_list.rb_node; + while (p) { + obj = rb_entry(p, struct fscache_object, objlist_link); + if (pobj < obj) { + if (!minobj || minobj > obj) + minobj = obj; + p = p->rb_left; + } else if (pobj > obj) { + p = p->rb_right; + } else { + minobj = obj; + break; + } + obj = NULL; + } + + if (!minobj) + *_pos = (unsigned long) ERR_PTR(-ENOENT); + else if (minobj != obj) + *_pos = (unsigned long) minobj; + return minobj; +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos) + __acquires(&fscache_object_list_lock) +{ + read_lock(&fscache_object_list_lock); + return fscache_objlist_lookup(_pos); +} + +/* + * move to the next line + */ +static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos) +{ + (*_pos)++; + return fscache_objlist_lookup(_pos); +} + +/* + * clean up after reading + */ +static void fscache_objlist_stop(struct seq_file *m, void *v) + __releases(&fscache_object_list_lock) +{ + read_unlock(&fscache_object_list_lock); +} + +/* + * display an object + */ +static int fscache_objlist_show(struct seq_file *m, void *v) +{ + struct fscache_objlist_data *data = m->private; + struct fscache_object *obj = v; + unsigned long config = data->config; + uint16_t keylen, auxlen; + char _type[3], *type; + bool no_cookie; + u8 *buf = data->buf, *p; + + if ((unsigned long) v == 1) { + seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" + " EM EV F S" + " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); + if (config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, " "); + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + seq_puts(m, "OBJECT_KEY"); + if ((config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) == + (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, ", "); + if (config & FSCACHE_OBJLIST_CONFIG_AUX) + seq_puts(m, "AUX_DATA"); + seq_puts(m, "\n"); + return 0; + } + + if ((unsigned long) v == 2) { + seq_puts(m, "======== ======== ==== ===== === === === == =====" + " == == = =" + " | ================ == == ================"); + if (config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, " ================"); + seq_puts(m, "\n"); + return 0; + } + + /* filter out any unwanted objects */ +#define FILTER(criterion, _yes, _no) \ + do { \ + unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes; \ + unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no; \ + if (criterion) { \ + if (!(config & yes)) \ + return 0; \ + } else { \ + if (!(config & no)) \ + return 0; \ + } \ + } while(0) + + if (~config) { + FILTER(obj->cookie, + COOKIE, 
NOCOOKIE); + FILTER(obj->state != FSCACHE_OBJECT_ACTIVE || + obj->n_ops != 0 || + obj->n_obj_ops != 0 || + obj->flags || + !list_empty(&obj->dependents), + BUSY, IDLE); + FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags), + PENDWR, NOPENDWR); + FILTER(atomic_read(&obj->n_reads), + READS, NOREADS); + FILTER(obj->events & obj->event_mask, + EVENTS, NOEVENTS); + FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW), + WORK, NOWORK); + } + + seq_printf(m, + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ", + obj->debug_id, + obj->parent ? obj->parent->debug_id : -1, + fscache_object_states_short[obj->state], + obj->n_children, + obj->n_ops, + obj->n_obj_ops, + obj->n_in_progress, + obj->n_exclusive, + atomic_read(&obj->n_reads), + obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, + obj->events, + obj->flags, + obj->work.flags); + + no_cookie = true; + keylen = auxlen = 0; + if (obj->cookie) { + spin_lock(&obj->lock); + if (obj->cookie) { + switch (obj->cookie->def->type) { + case 0: + type = "IX"; + break; + case 1: + type = "DT"; + break; + default: + sprintf(_type, "%02u", + obj->cookie->def->type); + type = _type; + break; + } + + seq_printf(m, "%-16s %s %2lx %16p", + obj->cookie->def->name, + type, + obj->cookie->flags, + obj->cookie->netfs_data); + + if (obj->cookie->def->get_key && + config & FSCACHE_OBJLIST_CONFIG_KEY) + keylen = obj->cookie->def->get_key( + obj->cookie->netfs_data, + buf, 400); + + if (obj->cookie->def->get_aux && + config & FSCACHE_OBJLIST_CONFIG_AUX) + auxlen = obj->cookie->def->get_aux( + obj->cookie->netfs_data, + buf + keylen, 512 - keylen); + + no_cookie = false; + } + spin_unlock(&obj->lock); + + if (!no_cookie && (keylen > 0 || auxlen > 0)) { + seq_printf(m, " "); + for (p = buf; keylen > 0; keylen--) + seq_printf(m, "%02x", *p++); + if (auxlen > 0) { + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + seq_printf(m, ", "); + for (; auxlen > 0; auxlen--) + seq_printf(m, "%02x", *p++); + } + } + } + + if (no_cookie) + seq_printf(m, "\n"); + else + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations fscache_objlist_ops = { + .start = fscache_objlist_start, + .stop = fscache_objlist_stop, + .next = fscache_objlist_next, + .show = fscache_objlist_show, +}; + +/* + * get the configuration for filtering the list + */ +static void fscache_objlist_config(struct fscache_objlist_data *data) +{ +#ifdef CONFIG_KEYS + struct user_key_payload *confkey; + unsigned long config; + struct key *key; + const char *buf; + int len; + + key = request_key(&key_type_user, "fscache:objlist", NULL); + if (IS_ERR(key)) + goto no_config; + + config = 0; + rcu_read_lock(); + + confkey = key->payload.data; + buf = confkey->data; + + for (len = confkey->datalen - 1; len >= 0; len--) { + switch (buf[len]) { + case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY; break; + case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX; break; + case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE; break; + case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE; break; + case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY; break; + case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE; break; + case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR; break; + case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR; break; + case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS; break; + case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS; break; + case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK; break; + case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK; break; + } + } + + rcu_read_unlock(); + key_put(key); + + if 
(!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE))) + config |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE; + if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE))) + config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE; + if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR))) + config |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR; + if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS))) + config |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS; + if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS))) + config |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS; + if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK))) + config |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK; + + data->config = config; + return; + +no_config: +#endif + data->config = ULONG_MAX; +} + +/* + * open "/proc/fs/fscache/objects" to provide a list of active objects + * - can be configured by a user-defined key added to the caller's keyrings + */ +static int fscache_objlist_open(struct inode *inode, struct file *file) +{ + struct fscache_objlist_data *data; + struct seq_file *m; + int ret; + + ret = seq_open(file, &fscache_objlist_ops); + if (ret < 0) + return ret; + + m = file->private_data; + + /* buffer for key extraction */ + data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL); + if (!data) { + seq_release(inode, file); + return -ENOMEM; + } + + /* get the configuration key */ + fscache_objlist_config(data); + + m->private = data; + return 0; +} + +/* + * clean up on close + */ +static int fscache_objlist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + kfree(m->private); + m->private = NULL; + return seq_release(inode, file); +} + +const struct file_operations fscache_objlist_fops = { + .owner = THIS_MODULE, + .open = fscache_objlist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = fscache_objlist_release, +}; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 615b63dd9ec..ad1644f073b 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -34,7 +34,7 @@ const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { }; EXPORT_SYMBOL(fscache_object_states); -static const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { +const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { [FSCACHE_OBJECT_INIT] = "INIT", [FSCACHE_OBJECT_LOOKING_UP] = "LOOK", [FSCACHE_OBJECT_CREATING] = "CRTN", diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 91bbe6f0377..09e43b6e822 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -322,6 +322,9 @@ void fscache_put_operation(struct fscache_operation *op) object = op->object; + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + /* now... 
we may get called with the object spinlock held, so we * complete the cleanup here only if we can immediately acquire the * lock, and defer it otherwise */ diff --git a/fs/fscache/page.c b/fs/fscache/page.c index e8bbc395cef..c5973e38ce3 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -275,6 +275,9 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); + atomic_inc(&object->n_reads); + set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + if (fscache_submit_op(object, &op->op) < 0) goto nobufs_unlock; spin_unlock(&cookie->lock); @@ -386,6 +389,9 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); + atomic_inc(&object->n_reads); + set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + if (fscache_submit_op(object, &op->op) < 0) goto nobufs_unlock; spin_unlock(&cookie->lock); diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index beeab44bc31..1d9e4951a59 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -37,10 +37,20 @@ int __init fscache_proc_init(void) goto error_histogram; #endif +#ifdef CONFIG_FSCACHE_OBJECT_LIST + if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, + &fscache_objlist_fops)) + goto error_objects; +#endif + _leave(" = 0"); return 0; +#ifdef CONFIG_FSCACHE_OBJECT_LIST +error_objects: +#endif #ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); error_histogram: #endif #ifdef CONFIG_FSCACHE_STATS @@ -58,6 +68,9 @@ error_dir: */ void fscache_proc_cleanup(void) { +#ifdef CONFIG_FSCACHE_OBJECT_LIST + remove_proc_entry("fs/fscache/objects", NULL); +#endif #ifdef CONFIG_FSCACHE_HISTOGRAM remove_proc_entry("fs/fscache/histogram", NULL); #endif -- cgit v1.2.3 From 52bd75fdb135d6133d878ae60c6e7e3f4ebc1cfc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:08 +0000 Subject: FS-Cache: Add counters for entry/exit to/from cache operation functions Count entries to and exits from cache operation table functions. Maintain these as a single counter that's added to or removed from as appropriate. 
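As a rough sketch (not part of the patch), the hunks below bracket every call through the cache operations table with an increment and a matching decrement, so each counter reads as the number of such calls currently in flight. The wrapper function name here is made up for illustration; the counters and helpers are the ones the patch adds:

	atomic_t fscache_n_cop_sync_cache;	/* entries minus exits for ->sync_cache() */

	static inline void fscache_stat(atomic_t *stat)  { atomic_inc(stat); }
	static inline void fscache_stat_d(atomic_t *stat) { atomic_dec(stat); }

	/* illustrative wrapper only */
	static void example_sync_cache(struct fscache_cache *cache)
	{
		fscache_stat(&fscache_n_cop_sync_cache);	/* entering the backend op */
		cache->ops->sync_cache(cache);
		fscache_stat_d(&fscache_n_cop_sync_cache);	/* back from the backend */
	}
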
Signed-off-by: David Howells --- fs/fscache/cache.c | 4 ++++ fs/fscache/cookie.c | 9 ++++++++- fs/fscache/internal.h | 22 ++++++++++++++++++++++ fs/fscache/object.c | 26 ++++++++++++++++++++++++-- fs/fscache/page.c | 29 +++++++++++++++++++++++------ fs/fscache/stats.c | 37 +++++++++++++++++++++++++++++++++++++ 6 files changed, 118 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index 724384ef96d..6a3c48abd67 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -381,11 +381,15 @@ void fscache_withdraw_cache(struct fscache_cache *cache) /* make sure all pages pinned by operations on behalf of the netfs are * written to disk */ + fscache_stat(&fscache_n_cop_sync_cache); cache->ops->sync_cache(cache); + fscache_stat_d(&fscache_n_cop_sync_cache); /* dissociate all the netfs pages backed by this cache from the block * mappings in the cache */ + fscache_stat(&fscache_n_cop_dissociate_pages); cache->ops->dissociate_pages(cache); + fscache_stat_d(&fscache_n_cop_dissociate_pages); /* we now have to destroy all the active objects pertaining to this * cache - which we do by passing them off to thread pool to be diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 9b518732823..432482edc73 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -249,7 +249,9 @@ static int fscache_alloc_object(struct fscache_cache *cache, /* ask the cache to allocate an object (we may end up with duplicate * objects at this stage, but we sort that out later) */ + fscache_stat(&fscache_n_cop_alloc_object); object = cache->ops->alloc_object(cache, cookie); + fscache_stat_d(&fscache_n_cop_alloc_object); if (IS_ERR(object)) { fscache_stat(&fscache_n_object_no_alloc); ret = PTR_ERR(object); @@ -270,8 +272,11 @@ static int fscache_alloc_object(struct fscache_cache *cache, /* only attach if we managed to allocate all we needed, otherwise * discard the object we just allocated and instead use the one * attached to the cookie */ - if (fscache_attach_object(cookie, object) < 0) + if (fscache_attach_object(cookie, object) < 0) { + fscache_stat(&fscache_n_cop_put_object); cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); + } _leave(" = 0"); return 0; @@ -287,7 +292,9 @@ object_already_extant: return 0; error_put: + fscache_stat(&fscache_n_cop_put_object); cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); error: _leave(" = %d", ret); return ret; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index fe02973a951..b85cc890681 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -208,11 +208,33 @@ extern atomic_t fscache_n_checkaux_okay; extern atomic_t fscache_n_checkaux_update; extern atomic_t fscache_n_checkaux_obsolete; +extern atomic_t fscache_n_cop_alloc_object; +extern atomic_t fscache_n_cop_lookup_object; +extern atomic_t fscache_n_cop_lookup_complete; +extern atomic_t fscache_n_cop_grab_object; +extern atomic_t fscache_n_cop_update_object; +extern atomic_t fscache_n_cop_drop_object; +extern atomic_t fscache_n_cop_put_object; +extern atomic_t fscache_n_cop_sync_cache; +extern atomic_t fscache_n_cop_attr_changed; +extern atomic_t fscache_n_cop_read_or_alloc_page; +extern atomic_t fscache_n_cop_read_or_alloc_pages; +extern atomic_t fscache_n_cop_allocate_page; +extern atomic_t fscache_n_cop_allocate_pages; +extern atomic_t fscache_n_cop_write_page; +extern atomic_t fscache_n_cop_uncache_page; +extern atomic_t fscache_n_cop_dissociate_pages; + static inline void fscache_stat(atomic_t 
*stat) { atomic_inc(stat); } +static inline void fscache_stat_d(atomic_t *stat) +{ + atomic_dec(stat); +} + extern const struct file_operations fscache_stats_fops; #else diff --git a/fs/fscache/object.c b/fs/fscache/object.c index ad1644f073b..0d65c0c92b4 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -144,13 +144,17 @@ static void fscache_object_state_machine(struct fscache_object *object) case FSCACHE_OBJECT_UPDATING: clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); fscache_stat(&fscache_n_updates_run); + fscache_stat(&fscache_n_cop_update_object); object->cache->ops->update_object(object); + fscache_stat_d(&fscache_n_cop_update_object); goto active_transit; /* handle an object dying during lookup or creation */ case FSCACHE_OBJECT_LC_DYING: object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); + fscache_stat(&fscache_n_cop_lookup_complete); object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); spin_lock(&object->lock); object->state = FSCACHE_OBJECT_DYING; @@ -416,7 +420,9 @@ static void fscache_initialise_object(struct fscache_object *object) * binding on to us, so we need to make sure we don't * add ourself to the list multiple times */ if (list_empty(&object->dep_link)) { + fscache_stat(&fscache_n_cop_grab_object); object->cache->ops->grab_object(object); + fscache_stat_d(&fscache_n_cop_grab_object); list_add(&object->dep_link, &parent->dependents); @@ -478,7 +484,9 @@ static void fscache_lookup_object(struct fscache_object *object) object->cache->tag->name); fscache_stat(&fscache_n_object_lookups); + fscache_stat(&fscache_n_cop_lookup_object); object->cache->ops->lookup_object(object); + fscache_stat_d(&fscache_n_cop_lookup_object); if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); @@ -602,7 +610,9 @@ static void fscache_object_available(struct fscache_object *object) } spin_unlock(&object->lock); + fscache_stat(&fscache_n_cop_lookup_complete); object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); fscache_enqueue_dependents(object); fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); @@ -625,7 +635,9 @@ static void fscache_drop_object(struct fscache_object *object) list_del_init(&object->cache_link); spin_unlock(&cache->object_list_lock); + fscache_stat(&fscache_n_cop_drop_object); cache->ops->drop_object(object); + fscache_stat_d(&fscache_n_cop_drop_object); if (parent) { _debug("release parent OBJ%x {%d}", @@ -640,7 +652,9 @@ static void fscache_drop_object(struct fscache_object *object) } /* this just shifts the object release to the slow work processor */ + fscache_stat(&fscache_n_cop_put_object); object->cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); _leave(""); } @@ -730,8 +744,12 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work) { struct fscache_object *object = container_of(work, struct fscache_object, work); + int ret; - return object->cache->ops->grab_object(object) ? 0 : -EAGAIN; + fscache_stat(&fscache_n_cop_grab_object); + ret = object->cache->ops->grab_object(object) ? 
0 : -EAGAIN; + fscache_stat_d(&fscache_n_cop_grab_object); + return ret; } /* @@ -742,7 +760,9 @@ static void fscache_object_slow_work_put_ref(struct slow_work *work) struct fscache_object *object = container_of(work, struct fscache_object, work); - return object->cache->ops->put_object(object); + fscache_stat(&fscache_n_cop_put_object); + object->cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); } /* @@ -779,7 +799,9 @@ static void fscache_enqueue_dependents(struct fscache_object *object) /* sort onto appropriate lists */ fscache_enqueue_object(dep); + fscache_stat(&fscache_n_cop_put_object); dep->cache->ops->put_object(dep); + fscache_stat_d(&fscache_n_cop_put_object); if (!list_empty(&object->dependents)) cond_resched_lock(&object->lock); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index c5973e38ce3..250dfd34c07 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -71,7 +71,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op) if (fscache_object_is_active(object)) { fscache_set_op_state(op, "CallFS"); + fscache_stat(&fscache_n_cop_attr_changed); ret = object->cache->ops->attr_changed(object); + fscache_stat_d(&fscache_n_cop_attr_changed); fscache_set_op_state(op, "Done"); if (ret < 0) fscache_abort_object(object); @@ -300,11 +302,15 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, /* ask the cache to honour the operation */ if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + fscache_stat(&fscache_n_cop_allocate_page); ret = object->cache->ops->allocate_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_allocate_page); if (ret == 0) ret = -ENODATA; } else { + fscache_stat(&fscache_n_cop_read_or_alloc_page); ret = object->cache->ops->read_or_alloc_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_read_or_alloc_page); } if (ret == -ENOMEM) @@ -358,7 +364,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, void *context, gfp_t gfp) { - fscache_pages_retrieval_func_t func; struct fscache_retrieval *op; struct fscache_object *object; int ret; @@ -413,11 +418,17 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, } /* ask the cache to honour the operation */ - if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) - func = object->cache->ops->allocate_pages; - else - func = object->cache->ops->read_or_alloc_pages; - ret = func(op, pages, nr_pages, gfp); + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + fscache_stat(&fscache_n_cop_allocate_pages); + ret = object->cache->ops->allocate_pages( + op, pages, nr_pages, gfp); + fscache_stat_d(&fscache_n_cop_allocate_pages); + } else { + fscache_stat(&fscache_n_cop_read_or_alloc_pages); + ret = object->cache->ops->read_or_alloc_pages( + op, pages, nr_pages, gfp); + fscache_stat_d(&fscache_n_cop_read_or_alloc_pages); + } if (ret == -ENOMEM) fscache_stat(&fscache_n_retrievals_nomem); @@ -500,7 +511,9 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, } /* ask the cache to honour the operation */ + fscache_stat(&fscache_n_cop_allocate_page); ret = object->cache->ops->allocate_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_allocate_page); if (ret < 0) fscache_stat(&fscache_n_allocs_nobufs); @@ -578,7 +591,9 @@ static void fscache_write_op(struct fscache_operation *_op) if (page) { fscache_set_op_state(&op->op, "Store"); + fscache_stat(&fscache_n_cop_write_page); ret = object->cache->ops->write_page(op, page); + fscache_stat_d(&fscache_n_cop_write_page); 
fscache_set_op_state(&op->op, "EndWrite"); fscache_end_page_write(cookie, page); page_cache_release(page); @@ -786,7 +801,9 @@ void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) if (TestClearPageFsCache(page) && object->cache->ops->uncache_page) { /* the cache backend releases the cookie lock */ + fscache_stat(&fscache_n_cop_uncache_page); object->cache->ops->uncache_page(object, page); + fscache_stat_d(&fscache_n_cop_uncache_page); goto done; } diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 65deb99e756..20233fb44bf 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -93,6 +93,23 @@ atomic_t fscache_n_checkaux_okay; atomic_t fscache_n_checkaux_update; atomic_t fscache_n_checkaux_obsolete; +atomic_t fscache_n_cop_alloc_object; +atomic_t fscache_n_cop_lookup_object; +atomic_t fscache_n_cop_lookup_complete; +atomic_t fscache_n_cop_grab_object; +atomic_t fscache_n_cop_update_object; +atomic_t fscache_n_cop_drop_object; +atomic_t fscache_n_cop_put_object; +atomic_t fscache_n_cop_sync_cache; +atomic_t fscache_n_cop_attr_changed; +atomic_t fscache_n_cop_read_or_alloc_page; +atomic_t fscache_n_cop_read_or_alloc_pages; +atomic_t fscache_n_cop_allocate_page; +atomic_t fscache_n_cop_allocate_pages; +atomic_t fscache_n_cop_write_page; +atomic_t fscache_n_cop_uncache_page; +atomic_t fscache_n_cop_dissociate_pages; + /* * display the general statistics */ @@ -192,6 +209,26 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), atomic_read(&fscache_n_op_gc)); + + seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n", + atomic_read(&fscache_n_cop_alloc_object), + atomic_read(&fscache_n_cop_lookup_object), + atomic_read(&fscache_n_cop_lookup_complete), + atomic_read(&fscache_n_cop_grab_object)); + seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n", + atomic_read(&fscache_n_cop_update_object), + atomic_read(&fscache_n_cop_drop_object), + atomic_read(&fscache_n_cop_put_object), + atomic_read(&fscache_n_cop_attr_changed), + atomic_read(&fscache_n_cop_sync_cache)); + seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n", + atomic_read(&fscache_n_cop_read_or_alloc_page), + atomic_read(&fscache_n_cop_read_or_alloc_pages), + atomic_read(&fscache_n_cop_allocate_page), + atomic_read(&fscache_n_cop_allocate_pages), + atomic_read(&fscache_n_cop_write_page), + atomic_read(&fscache_n_cop_uncache_page), + atomic_read(&fscache_n_cop_dissociate_pages)); return 0; } -- cgit v1.2.3 From 7e311a207d596b9273d811149d6e3e14f05ac4c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:11 +0000 Subject: FS-Cache: Clear netfs pointers in cookie after detaching object, not before Clear the pointers from the fscache_cookie struct to netfs private data after clearing the pointer to the cookie from the fscache_object struct and releasing the object lock, rather than before. This allows the netfs private data pointers to be relied on simply by holding the object lock, rather than having to hold the cookie lock. This is makes things simpler as the cookie lock has to be taken before the object lock, but sometimes the object pointer is all that the code has. 
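Illustratively (a hypothetical helper, not part of this patch), the effect is that cache-side code may now follow the cookie pointer with only the object lock held:

	static void *example_get_netfs_data(struct fscache_object *object)
	{
		void *netfs_data = NULL;

		spin_lock(&object->lock);
		if (object->cookie)			/* still attached? */
			netfs_data = object->cookie->netfs_data;
		spin_unlock(&object->lock);
		return netfs_data;
	}
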
Signed-off-by: David Howells --- fs/fscache/cookie.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 432482edc73..b1870a6c2cd 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -437,12 +437,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; - /* detach pointers back to the netfs */ spin_lock(&cookie->lock); - cookie->netfs_data = NULL; - cookie->def = NULL; - /* break links with all the active objects */ while (!hlist_empty(&cookie->backing_objects)) { object = hlist_entry(cookie->backing_objects.first, @@ -465,6 +461,10 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) BUG(); } + /* detach pointers back to the netfs */ + cookie->netfs_data = NULL; + cookie->def = NULL; + spin_unlock(&cookie->lock); if (cookie->parent) { -- cgit v1.2.3 From b34df792b4e9e311db47fad27949095d0629c197 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:14 +0000 Subject: FS-Cache: Use radix tree preload correctly in tracking of pages to be stored __fscache_write_page() attempts to load the radix tree preallocation pool for the CPU it is on before calling radix_tree_insert(), as the insertion must be done inside a pair of spinlocks. Use of the preallocation pool, however, is contingent on the radix tree being initialised without __GFP_WAIT specified. __fscache_acquire_cookie() was passing GFP_NOFS to INIT_RADIX_TREE() - but that includes __GFP_WAIT. The solution is to AND out __GFP_WAIT. Additionally, the banner comment to radix_tree_preload() is altered to make note of this prerequisite. Possibly there should be a WARN_ON() too. Without this fix, I have seen the following recursive deadlock caused by radix_tree_insert() attempting to allocate memory inside the spinlocked region, which resulted in FS-Cache being called back into to release memory - which required the spinlock already held. ============================================= [ INFO: possible recursive locking detected ] 2.6.32-rc6-cachefs #24 --------------------------------------------- nfsiod/7916 is trying to acquire lock: (&cookie->lock){+.+.-.}, at: [] __fscache_uncache_page+0xdb/0x160 [fscache] but task is already holding lock: (&cookie->lock){+.+.-.}, at: [] __fscache_write_page+0x15c/0x3f3 [fscache] other info that might help us debug this: 5 locks held by nfsiod/7916: #0: (nfsiod){+.+.+.}, at: [] worker_thread+0x19a/0x2e2 #1: (&task->u.tk_work#2){+.+.+.}, at: [] worker_thread+0x19a/0x2e2 #2: (&cookie->lock){+.+.-.}, at: [] __fscache_write_page+0x15c/0x3f3 [fscache] #3: (&object->lock#2){+.+.-.}, at: [] __fscache_write_page+0x197/0x3f3 [fscache] #4: (&cookie->stores_lock){+.+...}, at: [] __fscache_write_page+0x19f/0x3f3 [fscache] stack backtrace: Pid: 7916, comm: nfsiod Not tainted 2.6.32-rc6-cachefs #24 Call Trace: [] __lock_acquire+0x1649/0x16e3 [] ? __lock_acquire+0x7b7/0x16e3 [] ? dump_trace+0x248/0x257 [] lock_acquire+0x57/0x6d [] ? __fscache_uncache_page+0xdb/0x160 [fscache] [] _spin_lock+0x2c/0x3b [] ? __fscache_uncache_page+0xdb/0x160 [fscache] [] __fscache_uncache_page+0xdb/0x160 [fscache] [] ? __fscache_check_page_write+0x0/0x71 [fscache] [] nfs_fscache_release_page+0x86/0xc4 [nfs] [] nfs_release_page+0x3c/0x41 [nfs] [] try_to_release_page+0x32/0x3b [] shrink_page_list+0x316/0x4ac [] ? mark_held_locks+0x52/0x70 [] ? _spin_unlock_irq+0x2b/0x31 [] shrink_inactive_list+0x392/0x67c [] ? 
mark_held_locks+0x52/0x70 [] shrink_list+0x8d/0x8f [] shrink_zone+0x278/0x33c [] ? ktime_get_ts+0xad/0xba [] try_to_free_pages+0x22e/0x392 [] ? isolate_pages_global+0x0/0x212 [] __alloc_pages_nodemask+0x3dc/0x5cf [] cache_alloc_refill+0x34d/0x6c1 [] ? radix_tree_node_alloc+0x52/0x5c [] kmem_cache_alloc+0xb2/0x118 [] radix_tree_node_alloc+0x52/0x5c [] radix_tree_insert+0x57/0x19c [] __fscache_write_page+0x1e3/0x3f3 [fscache] [] __nfs_readpage_to_fscache+0x58/0x11e [nfs] [] nfs_readpage_release+0x34/0x9b [nfs] [] nfs_readpage_release_full+0x32/0x4b [nfs] [] rpc_release_calldata+0x12/0x14 [sunrpc] [] rpc_free_task+0x59/0x61 [sunrpc] [] rpc_async_release+0x10/0x12 [sunrpc] [] worker_thread+0x1ef/0x2e2 [] ? worker_thread+0x19a/0x2e2 [] ? thread_return+0x3e/0x101 [] ? rpc_async_release+0x0/0x12 [sunrpc] [] ? autoremove_wake_function+0x0/0x34 [] ? trace_hardirqs_on+0xd/0xf [] ? worker_thread+0x0/0x2e2 [] kthread+0x7a/0x82 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? add_wait_queue+0x15/0x44 [] ? kthread+0x0/0x82 [] ? child_rip+0x0/0x20 Signed-off-by: David Howells --- fs/fscache/cookie.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index b1870a6c2cd..e6854f5222f 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -102,7 +102,9 @@ struct fscache_cookie *__fscache_acquire_cookie( cookie->netfs_data = netfs_data; cookie->flags = 0; - INIT_RADIX_TREE(&cookie->stores, GFP_NOFS); + /* radix tree insertion won't use the preallocation pool unless it's + * told it may not wait */ + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT); switch (cookie->def->type) { case FSCACHE_COOKIE_TYPE_INDEX: -- cgit v1.2.3 From 5753c441889253e4323eee85f791a1d64cf08196 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:19 +0000 Subject: FS-Cache: Permit cache retrieval ops to be interrupted in the initial wait phase Permit the operations to retrieve data from the cache or to allocate space in the cache for future writes to be interrupted whilst they're waiting for permission for the operation to proceed. Typically this wait occurs whilst the cache object is being looked up on disk in the background. If an interruption occurs, and the operation has not yet been given the go-ahead to run, the operation is dequeued and cancelled, and control returns to the read operation of the netfs routine with none of the requested pages having been read or in any way marked as known by the cache. This means that the initial wait is done interruptibly rather than uninterruptibly. In addition, extra stats values are made available to show the number of ops cancelled and the number of cache space allocations interrupted. 
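Condensed from the hunks below (the surrounding function and return paths are simplified for illustration), each wait site now follows this shape:

	static int example_wait_for_retrieval(struct fscache_retrieval *op)
	{
		if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
			if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
					fscache_wait_bit_interruptible,
					TASK_INTERRUPTIBLE) < 0) {
				/* a signal arrived while the op was still pending */
				if (fscache_cancel_op(&op->op) == 0)
					return -ERESTARTSYS;
				/* another party already dequeued it, so it will run
				 * shortly; wait for it uninterruptibly */
				wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
					    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
			}
		}
		return 0;
	}
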
Signed-off-by: David Howells --- fs/fscache/internal.h | 3 ++ fs/fscache/operation.c | 82 +++++++++++++++++++++++++++++++++----------------- fs/fscache/page.c | 55 ++++++++++++++++++++++++++++----- fs/fscache/stats.c | 12 +++++--- 4 files changed, 113 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index b85cc890681..50324ad2b19 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -112,6 +112,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); +extern int fscache_cancel_op(struct fscache_operation *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); extern void fscache_operation_gc(struct work_struct *); @@ -140,6 +141,7 @@ extern atomic_t fscache_n_op_enqueue; extern atomic_t fscache_n_op_deferred_release; extern atomic_t fscache_n_op_release; extern atomic_t fscache_n_op_gc; +extern atomic_t fscache_n_op_cancelled; extern atomic_t fscache_n_attr_changed; extern atomic_t fscache_n_attr_changed_ok; @@ -151,6 +153,7 @@ extern atomic_t fscache_n_allocs; extern atomic_t fscache_n_allocs_ok; extern atomic_t fscache_n_allocs_wait; extern atomic_t fscache_n_allocs_nobufs; +extern atomic_t fscache_n_allocs_intr; extern atomic_t fscache_n_alloc_ops; extern atomic_t fscache_n_alloc_op_waits; diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 09e43b6e822..296492efb81 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -34,32 +34,31 @@ void fscache_enqueue_operation(struct fscache_operation *op) fscache_set_op_state(op, "EnQ"); + ASSERT(list_empty(&op->pend_link)); ASSERT(op->processor != NULL); ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); ASSERTCMP(atomic_read(&op->usage), >, 0); - if (list_empty(&op->pend_link)) { - switch (op->flags & FSCACHE_OP_TYPE) { - case FSCACHE_OP_FAST: - _debug("queue fast"); - atomic_inc(&op->usage); - if (!schedule_work(&op->fast_work)) - fscache_put_operation(op); - break; - case FSCACHE_OP_SLOW: - _debug("queue slow"); - slow_work_enqueue(&op->slow_work); - break; - case FSCACHE_OP_MYTHREAD: - _debug("queue for caller's attention"); - break; - default: - printk(KERN_ERR "FS-Cache: Unexpected op type %lx", - op->flags); - BUG(); - break; - } - fscache_stat(&fscache_n_op_enqueue); + fscache_stat(&fscache_n_op_enqueue); + switch (op->flags & FSCACHE_OP_TYPE) { + case FSCACHE_OP_FAST: + _debug("queue fast"); + atomic_inc(&op->usage); + if (!schedule_work(&op->fast_work)) + fscache_put_operation(op); + break; + case FSCACHE_OP_SLOW: + _debug("queue slow"); + slow_work_enqueue(&op->slow_work); + break; + case FSCACHE_OP_MYTHREAD: + _debug("queue for caller's attention"); + break; + default: + printk(KERN_ERR "FS-Cache: Unexpected op type %lx", + op->flags); + BUG(); + break; } } EXPORT_SYMBOL(fscache_enqueue_operation); @@ -97,6 +96,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); + ASSERT(list_empty(&op->pend_link)); ret = -ENOBUFS; if (fscache_object_is_active(object)) { @@ -202,6 +202,7 @@ int fscache_submit_op(struct fscache_object *object, spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); + ASSERT(list_empty(&op->pend_link)); 
ostate = object->state; smp_rmb(); @@ -273,12 +274,7 @@ void fscache_start_operations(struct fscache_object *object) stop = true; } list_del_init(&op->pend_link); - object->n_in_progress++; - - if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) - wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - if (op->processor) - fscache_enqueue_operation(op); + fscache_run_op(object, op); /* the pending queue was holding a ref on the object */ fscache_put_operation(op); @@ -290,6 +286,36 @@ void fscache_start_operations(struct fscache_object *object) object->n_in_progress, object->debug_id); } +/* + * cancel an operation that's pending on an object + */ +int fscache_cancel_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + int ret; + + _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); + + spin_lock(&object->lock); + + ret = -EBUSY; + if (!list_empty(&op->pend_link)) { + fscache_stat(&fscache_n_op_cancelled); + list_del_init(&op->pend_link); + object->n_ops--; + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + fscache_put_operation(op); + ret = 0; + } + + spin_unlock(&object->lock); + _leave(" = %d", ret); + return ret; +} + /* * release an operation * - queues pending ops if this is the last in-progress op diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 250dfd34c07..e6f2e61133a 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -295,8 +295,20 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { _debug(">>> WT"); fscache_stat(&fscache_n_retrieval_op_waits); - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) < 0) { + ret = fscache_cancel_op(&op->op); + if (ret == 0) { + ret = -ERESTARTSYS; + goto error; + } + + /* it's been removed from the pending queue by another + * party, so we should get to run shortly */ + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } _debug("<<< GO"); } @@ -313,6 +325,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, fscache_stat_d(&fscache_n_cop_read_or_alloc_page); } +error: if (ret == -ENOMEM) fscache_stat(&fscache_n_retrievals_nomem); else if (ret == -ERESTARTSYS) @@ -412,8 +425,20 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { _debug(">>> WT"); fscache_stat(&fscache_n_retrieval_op_waits); - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) < 0) { + ret = fscache_cancel_op(&op->op); + if (ret == 0) { + ret = -ERESTARTSYS; + goto error; + } + + /* it's been removed from the pending queue by another + * party, so we should get to run shortly */ + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } _debug("<<< GO"); } @@ -430,6 +455,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, fscache_stat_d(&fscache_n_cop_read_or_alloc_pages); } +error: if (ret == -ENOMEM) fscache_stat(&fscache_n_retrievals_nomem); else if (ret == -ERESTARTSYS) @@ -505,8 +531,20 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, if 
(test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { _debug(">>> WT"); fscache_stat(&fscache_n_alloc_op_waits); - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) < 0) { + ret = fscache_cancel_op(&op->op); + if (ret == 0) { + ret = -ERESTARTSYS; + goto error; + } + + /* it's been removed from the pending queue by another + * party, so we should get to run shortly */ + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } _debug("<<< GO"); } @@ -515,7 +553,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, ret = object->cache->ops->allocate_page(op, page, gfp); fscache_stat_d(&fscache_n_cop_allocate_page); - if (ret < 0) +error: + if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_allocs_intr); + else if (ret < 0) fscache_stat(&fscache_n_allocs_nobufs); else fscache_stat(&fscache_n_allocs_ok); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 20233fb44bf..4c07439d130 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -25,6 +25,7 @@ atomic_t fscache_n_op_requeue; atomic_t fscache_n_op_deferred_release; atomic_t fscache_n_op_release; atomic_t fscache_n_op_gc; +atomic_t fscache_n_op_cancelled; atomic_t fscache_n_attr_changed; atomic_t fscache_n_attr_changed_ok; @@ -36,6 +37,7 @@ atomic_t fscache_n_allocs; atomic_t fscache_n_allocs_ok; atomic_t fscache_n_allocs_wait; atomic_t fscache_n_allocs_nobufs; +atomic_t fscache_n_allocs_intr; atomic_t fscache_n_alloc_ops; atomic_t fscache_n_alloc_op_waits; @@ -169,11 +171,12 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_attr_changed_nomem), atomic_read(&fscache_n_attr_changed_calls)); - seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n", + seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n", atomic_read(&fscache_n_allocs), atomic_read(&fscache_n_allocs_ok), atomic_read(&fscache_n_allocs_wait), - atomic_read(&fscache_n_allocs_nobufs)); + atomic_read(&fscache_n_allocs_nobufs), + atomic_read(&fscache_n_allocs_intr)); seq_printf(m, "Allocs : ops=%u owt=%u\n", atomic_read(&fscache_n_alloc_ops), atomic_read(&fscache_n_alloc_op_waits)); @@ -201,10 +204,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_store_ops), atomic_read(&fscache_n_store_calls)); - seq_printf(m, "Ops : pend=%u run=%u enq=%u\n", + seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u\n", atomic_read(&fscache_n_op_pend), atomic_read(&fscache_n_op_run), - atomic_read(&fscache_n_op_enqueue)); + atomic_read(&fscache_n_op_enqueue), + atomic_read(&fscache_n_op_cancelled)); seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), -- cgit v1.2.3 From 6897e3df8fc37bd4a58bbcdef8306da7fc175584 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:22 +0000 Subject: FS-Cache: The object-available state can't rely on the cookie to be available The object-available state in the object processing state machine (as processed by fscache_object_available()) can't rely on the cookie to be available because the FSCACHE_COOKIE_CREATING bit may have been cleared by fscache_obtained_object() prior to the object being put into the FSCACHE_OBJECT_AVAILABLE state. Clearing the FSCACHE_COOKIE_CREATING bit on a cookie permits __fscache_relinquish_cookie() to proceed and detach the cookie from the object. 
To deal with this, we don't dereference object->cookie in fscache_object_available() if the object has already been detached. In addition, a couple of assertions are added into fscache_drop_object() to make sure the object is unbound from the cookie before it gets there. Signed-off-by: David Howells --- fs/fscache/object.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 0d65c0c92b4..1a1afa82f79 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -158,7 +158,8 @@ static void fscache_object_state_machine(struct fscache_object *object) spin_lock(&object->lock); object->state = FSCACHE_OBJECT_DYING; - if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + if (object->cookie && + test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); @@ -594,7 +595,8 @@ static void fscache_object_available(struct fscache_object *object) spin_lock(&object->lock); - if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) + if (object->cookie && + test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); fscache_done_parent_op(object); @@ -631,6 +633,9 @@ static void fscache_drop_object(struct fscache_object *object) _enter("{OBJ%x,%d}", object->debug_id, object->n_children); + ASSERTCMP(object->cookie, ==, NULL); + ASSERT(hlist_unhashed(&object->cookie_link)); + spin_lock(&cache->object_list_lock); list_del_init(&object->cache_link); spin_unlock(&cache->object_list_lock); -- cgit v1.2.3 From 1bccf513ac49d44604ba1cddcc29f5886e70f1b6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:25 +0000 Subject: FS-Cache: Fix lock misorder in fscache_write_op() FS-Cache has two structs internally for keeping track of the internal state of a cached file: the fscache_cookie struct, which represents the netfs's state, and fscache_object struct, which represents the cache's state. Each has a pointer that points to the other (when both are in existence), and each has a spinlock for pointer maintenance. Since netfs operations approach these structures from the cookie side, they get the cookie lock first, then the object lock. Cache operations, on the other hand, approach from the object side, and get the object lock first. It is not then permitted for a cache operation to get the cookie lock whilst it is holding the object lock lest deadlock occur; instead, it must do one of two things: (1) increment the cookie usage counter, drop the object lock and then get both locks in order, or (2) simply hold the object lock as certain parts of the cookie may not be altered whilst the object lock is held. It is also not permitted to follow either pointer without holding the lock at the end you start with. To break the pointers between the cookie and the object, both locks must be held. fscache_write_op(), however, violates the locking rules: It attempts to get the cookie lock without (a) checking that the cookie pointer is a valid pointer, and (b) holding the object lock to protect the cookie pointer whilst it follows it. This is so that it can access the pending page store tree without interference from __fscache_write_page(). This is fixed by splitting the cookie lock, such that the page store tracking tree is protected by its own lock, and checking that the cookie pointer is non-NULL before we attempt to follow it whilst holding the object lock. 
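A rough sketch of the resulting nesting on the netfs (cookie-first) side, as __fscache_write_page() ends up using it in the hunks below (body abbreviated for illustration):

	spin_lock(&cookie->lock);
	spin_lock(&object->lock);
	spin_lock(&cookie->stores_lock);	/* innermost: guards the stores tree */
	/* ... add or remove pages in cookie->stores ... */
	spin_unlock(&cookie->stores_lock);
	spin_unlock(&object->lock);
	spin_unlock(&cookie->lock);
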
The new lock is subordinate to both the cookie lock and the object lock, and so should be taken after those. Signed-off-by: David Howells --- fs/fscache/cookie.c | 1 + fs/fscache/internal.h | 4 ++++ fs/fscache/page.c | 52 ++++++++++++++++++++++++++++++++++----------------- fs/fscache/stats.c | 10 ++++++++-- 4 files changed, 48 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index e6854f5222f..f979659c1b3 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -36,6 +36,7 @@ void fscache_cookie_init_once(void *_cookie) memset(cookie, 0, sizeof(*cookie)); spin_lock_init(&cookie->lock); + spin_lock_init(&cookie->stores_lock); INIT_HLIST_HEAD(&cookie->backing_objects); } diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 50324ad2b19..ba1853fa1ff 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -17,6 +17,7 @@ * - cache->object_list_lock * - object->lock * - object->parent->lock + * - cookie->stores_lock * - fscache_thread_lock * */ @@ -174,6 +175,9 @@ extern atomic_t fscache_n_stores_nobufs; extern atomic_t fscache_n_stores_oom; extern atomic_t fscache_n_store_ops; extern atomic_t fscache_n_store_calls; +extern atomic_t fscache_n_store_pages; +extern atomic_t fscache_n_store_radix_deletes; +extern atomic_t fscache_n_store_pages_over_limit; extern atomic_t fscache_n_marks; extern atomic_t fscache_n_uncaches; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index e6f2e61133a..3ea8897bc21 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -45,16 +45,26 @@ EXPORT_SYMBOL(__fscache_wait_on_page_write); /* * note that a page has finished being written to the cache */ -static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page) +static void fscache_end_page_write(struct fscache_object *object, + struct page *page) { - struct page *xpage; + struct fscache_cookie *cookie; + struct page *xpage = NULL; - spin_lock(&cookie->lock); - xpage = radix_tree_delete(&cookie->stores, page->index); - spin_unlock(&cookie->lock); - ASSERT(xpage != NULL); - - wake_up_bit(&cookie->flags, 0); + spin_lock(&object->lock); + cookie = object->cookie; + if (cookie) { + /* delete the page from the tree if it is now no longer + * pending */ + spin_lock(&cookie->stores_lock); + fscache_stat(&fscache_n_store_radix_deletes); + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); + wake_up_bit(&cookie->flags, 0); + } + spin_unlock(&object->lock); + if (xpage) + page_cache_release(xpage); } /* @@ -591,7 +601,7 @@ static void fscache_write_op(struct fscache_operation *_op) struct fscache_storage *op = container_of(_op, struct fscache_storage, op); struct fscache_object *object = op->op.object; - struct fscache_cookie *cookie = object->cookie; + struct fscache_cookie *cookie; struct page *page; unsigned n; void *results[1]; @@ -601,16 +611,17 @@ static void fscache_write_op(struct fscache_operation *_op) fscache_set_op_state(&op->op, "GetPage"); - spin_lock(&cookie->lock); spin_lock(&object->lock); + cookie = object->cookie; - if (!fscache_object_is_active(object)) { + if (!fscache_object_is_active(object) || !cookie) { spin_unlock(&object->lock); - spin_unlock(&cookie->lock); _leave(""); return; } + spin_lock(&cookie->stores_lock); + fscache_stat(&fscache_n_store_calls); /* find a page to store */ @@ -621,23 +632,25 @@ static void fscache_write_op(struct fscache_operation *_op) goto superseded; page = results[0]; _debug("gang %d [%lx]", n, page->index); - if 
(page->index > op->store_limit) + if (page->index > op->store_limit) { + fscache_stat(&fscache_n_store_pages_over_limit); goto superseded; + } radix_tree_tag_clear(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG); + spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); - spin_unlock(&cookie->lock); if (page) { fscache_set_op_state(&op->op, "Store"); + fscache_stat(&fscache_n_store_pages); fscache_stat(&fscache_n_cop_write_page); ret = object->cache->ops->write_page(op, page); fscache_stat_d(&fscache_n_cop_write_page); fscache_set_op_state(&op->op, "EndWrite"); - fscache_end_page_write(cookie, page); - page_cache_release(page); + fscache_end_page_write(object, page); if (ret < 0) { fscache_set_op_state(&op->op, "Abort"); fscache_abort_object(object); @@ -653,9 +666,9 @@ superseded: /* this writer is going away and there aren't any more things to * write */ _debug("cease"); + spin_unlock(&cookie->stores_lock); clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); spin_unlock(&object->lock); - spin_unlock(&cookie->lock); _leave(""); } @@ -731,6 +744,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, /* add the page to the pending-storage radix tree on the backing * object */ spin_lock(&object->lock); + spin_lock(&cookie->stores_lock); _debug("store limit %llx", (unsigned long long) object->store_limit); @@ -751,6 +765,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) goto already_pending; + spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); @@ -772,6 +787,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, already_queued: fscache_stat(&fscache_n_stores_again); already_pending: + spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); spin_unlock(&cookie->lock); radix_tree_preload_end(); @@ -781,7 +797,9 @@ already_pending: return 0; submit_failed: + spin_lock(&cookie->stores_lock); radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); page_cache_release(page); ret = -ENOBUFS; goto nobufs; diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 4c07439d130..1d53ea68409 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -58,6 +58,9 @@ atomic_t fscache_n_stores_nobufs; atomic_t fscache_n_stores_oom; atomic_t fscache_n_store_ops; atomic_t fscache_n_store_calls; +atomic_t fscache_n_store_pages; +atomic_t fscache_n_store_radix_deletes; +atomic_t fscache_n_store_pages_over_limit; atomic_t fscache_n_marks; atomic_t fscache_n_uncaches; @@ -200,9 +203,12 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_stores_again), atomic_read(&fscache_n_stores_nobufs), atomic_read(&fscache_n_stores_oom)); - seq_printf(m, "Stores : ops=%u run=%u\n", + seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n", atomic_read(&fscache_n_store_ops), - atomic_read(&fscache_n_store_calls)); + atomic_read(&fscache_n_store_calls), + atomic_read(&fscache_n_store_pages), + atomic_read(&fscache_n_store_radix_deletes), + atomic_read(&fscache_n_store_pages_over_limit)); seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u\n", atomic_read(&fscache_n_op_pend), -- cgit v1.2.3 From 285e728b0ac55b53a673114096168d6f74930167 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:29 +0000 Subject: FS-Cache: Don't delete pending pages from the page-store tracking tree Don't delete pending pages from the page-store tracking tree, but rather send them 
for another write as they've presumably been updated. Signed-off-by: David Howells --- fs/fscache/page.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 3ea8897bc21..022a5da8e13 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -57,8 +57,11 @@ static void fscache_end_page_write(struct fscache_object *object, /* delete the page from the tree if it is now no longer * pending */ spin_lock(&cookie->stores_lock); - fscache_stat(&fscache_n_store_radix_deletes); - xpage = radix_tree_delete(&cookie->stores, page->index); + if (!radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG)) { + fscache_stat(&fscache_n_store_radix_deletes); + xpage = radix_tree_delete(&cookie->stores, page->index); + } spin_unlock(&cookie->stores_lock); wake_up_bit(&cookie->flags, 0); } -- cgit v1.2.3 From e3d4d28b1c8cc7c26536a50b43d86ccd39878550 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:32 +0000 Subject: FS-Cache: Handle read request vs lookup, creation or other cache failure FS-Cache doesn't correctly handle the netfs requesting a read from the cache on an object that failed or was withdrawn by the cache. A trace similar to the following might be seen: CacheFiles: Lookup failed error -105 [exe ] unexpected submission OP165afe [OBJ6cac OBJECT_LC_DYING] [exe ] objstate=OBJECT_LC_DYING [OBJECT_LC_DYING] [exe ] objflags=0 [exe ] objevent=9 [fffffffffffffffb] [exe ] ops=0 inp=0 exc=0 Pid: 6970, comm: exe Not tainted 2.6.32-rc6-cachefs #50 Call Trace: [] fscache_submit_op+0x3ff/0x45a [fscache] [] __fscache_read_or_alloc_pages+0x187/0x3c4 [fscache] [] ? nfs_readpage_from_fscache_complete+0x0/0x66 [nfs] [] __nfs_readpages_from_fscache+0x7e/0x176 [nfs] [] ? __alloc_pages_nodemask+0x11c/0x5cf [] nfs_readpages+0x114/0x1d7 [nfs] [] __do_page_cache_readahead+0x15f/0x1ec [] ? __do_page_cache_readahead+0x73/0x1ec [] ra_submit+0x1c/0x20 [] ondemand_readahead+0x227/0x23a [] page_cache_sync_readahead+0x17/0x19 [] generic_file_aio_read+0x236/0x5a0 [] nfs_file_read+0xe4/0xf3 [nfs] [] do_sync_read+0xe3/0x120 [] ? _spin_unlock_irq+0x2b/0x31 [] ? autoremove_wake_function+0x0/0x34 [] ? selinux_file_permission+0x5d/0x10f [] ? thread_return+0x3e/0x101 [] ? security_file_permission+0x11/0x13 [] vfs_read+0xaa/0x16f [] ? trace_hardirqs_on_caller+0x10c/0x130 [] sys_read+0x45/0x6c [] system_call_fastpath+0x16/0x1b The object state might also be OBJECT_DYING or OBJECT_WITHDRAWING. This should be handled by simply rejecting the new operation with ENOBUFS. There's no need to log an error for it. Events of this type now appear in the stats file under Ops:rej. 
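As an illustration of what the netfs sees (this sketch is not part of the patch, and netfs_readpage(), netfs_i_cookie(), netfs_readpage_complete() and read_from_server() are invented names), a netfs read path treats the quiet -ENOBUFS rejection exactly like "no data in the cache" and simply falls back to the server:

static int netfs_readpage(struct file *file, struct page *page)
{
	struct fscache_cookie *cookie = netfs_i_cookie(page->mapping->host);
	int ret;

	/* ask the cache first; 0 means the read was submitted to the cache */
	ret = fscache_read_or_alloc_page(cookie, page,
					 netfs_readpage_complete, NULL,
					 GFP_KERNEL);
	switch (ret) {
	case 0:
		return 0;
	case -ENOBUFS:	/* object failed/withdrawn - op rejected quietly */
	case -ENODATA:	/* cache holds no copy of this page */
		return read_from_server(file, page);
	default:
		return ret;
	}
}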
Signed-off-by: David Howells --- fs/fscache/internal.h | 1 + fs/fscache/operation.c | 5 +++++ fs/fscache/stats.c | 6 ++++-- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index ba1853fa1ff..a0769872b19 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -143,6 +143,7 @@ extern atomic_t fscache_n_op_deferred_release; extern atomic_t fscache_n_op_release; extern atomic_t fscache_n_op_gc; extern atomic_t fscache_n_op_cancelled; +extern atomic_t fscache_n_op_rejected; extern atomic_t fscache_n_attr_changed; extern atomic_t fscache_n_attr_changed_ok; diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 296492efb81..313e79a1426 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -232,6 +232,11 @@ int fscache_submit_op(struct fscache_object *object, list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; + } else if (object->state == FSCACHE_OBJECT_DYING || + object->state == FSCACHE_OBJECT_LC_DYING || + object->state == FSCACHE_OBJECT_WITHDRAWING) { + fscache_stat(&fscache_n_op_rejected); + ret = -ENOBUFS; } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 1d53ea68409..045ba396dbf 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -26,6 +26,7 @@ atomic_t fscache_n_op_deferred_release; atomic_t fscache_n_op_release; atomic_t fscache_n_op_gc; atomic_t fscache_n_op_cancelled; +atomic_t fscache_n_op_rejected; atomic_t fscache_n_attr_changed; atomic_t fscache_n_attr_changed_ok; @@ -210,11 +211,12 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_store_radix_deletes), atomic_read(&fscache_n_store_pages_over_limit)); - seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u\n", + seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", atomic_read(&fscache_n_op_pend), atomic_read(&fscache_n_op_run), atomic_read(&fscache_n_op_enqueue), - atomic_read(&fscache_n_op_cancelled)); + atomic_read(&fscache_n_op_cancelled), + atomic_read(&fscache_n_op_rejected)); seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), -- cgit v1.2.3 From 201a15428bd54f83eccec8b7c64a04b8f9431204 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:35 +0000 Subject: FS-Cache: Handle pages pending storage that get evicted under OOM conditions Handle netfs pages that the vmscan algorithm wants to evict from the pagecache under OOM conditions, but that are waiting for write to the cache. Under these conditions, vmscan calls the releasepage() function of the netfs, asking if a page can be discarded. The problem is typified by the following trace of a stuck process: kslowd005 D 0000000000000000 0 4253 2 0x00000080 ffff88001b14f370 0000000000000046 ffff880020d0d000 0000000000000007 0000000000000006 0000000000000001 ffff88001b14ffd8 ffff880020d0d2a8 000000000000ddf0 00000000000118c0 00000000000118c0 ffff880020d0d2a8 Call Trace: [] __fscache_wait_on_page_write+0x8b/0xa7 [fscache] [] ? autoremove_wake_function+0x0/0x34 [] ? __fscache_check_page_write+0x63/0x70 [fscache] [] nfs_fscache_release_page+0x4e/0xc4 [nfs] [] nfs_release_page+0x3c/0x41 [nfs] [] try_to_release_page+0x32/0x3b [] shrink_page_list+0x316/0x4ac [] shrink_inactive_list+0x392/0x67c [] ? 
__mutex_unlock_slowpath+0x100/0x10b [] ? trace_hardirqs_on_caller+0x10c/0x130 [] ? mutex_unlock+0x9/0xb [] shrink_list+0x8d/0x8f [] shrink_zone+0x278/0x33c [] ? ktime_get_ts+0xad/0xba [] try_to_free_pages+0x22e/0x392 [] ? isolate_pages_global+0x0/0x212 [] __alloc_pages_nodemask+0x3dc/0x5cf [] grab_cache_page_write_begin+0x65/0xaa [] ext3_write_begin+0x78/0x1eb [] generic_file_buffered_write+0x109/0x28c [] ? current_fs_time+0x22/0x29 [] __generic_file_aio_write+0x350/0x385 [] ? generic_file_aio_write+0x4a/0xae [] generic_file_aio_write+0x60/0xae [] do_sync_write+0xe3/0x120 [] ? autoremove_wake_function+0x0/0x34 [] ? __dentry_open+0x1a5/0x2b8 [] ? dentry_open+0x82/0x89 [] cachefiles_write_page+0x298/0x335 [cachefiles] [] fscache_write_op+0x178/0x2c2 [fscache] [] fscache_op_execute+0x7a/0xd1 [fscache] [] slow_work_execute+0x18f/0x2d1 [] slow_work_thread+0x1c5/0x308 [] ? autoremove_wake_function+0x0/0x34 [] ? slow_work_thread+0x0/0x308 [] kthread+0x7a/0x82 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? tg_shares_up+0x171/0x227 [] ? kthread+0x0/0x82 [] ? child_rip+0x0/0x20 In the above backtrace, the following is happening: (1) A page storage operation is being executed by a slow-work thread (fscache_write_op()). (2) FS-Cache farms the operation out to the cache to perform (cachefiles_write_page()). (3) CacheFiles is then calling Ext3 to perform the actual write, using Ext3's standard write (do_sync_write()) under KERNEL_DS directly from the netfs page. (4) However, for Ext3 to perform the write, it must allocate some memory, in particular, it must allocate at least one page cache page into which it can copy the data from the netfs page. (5) Under OOM conditions, the memory allocator can't immediately come up with a page, so it uses vmscan to find something to discard (try_to_free_pages()). (6) vmscan finds a clean netfs page it might be able to discard (possibly the one it's trying to write out). (7) The netfs is called to throw the page away (nfs_release_page()) - but it's called with __GFP_WAIT, so the netfs decides to wait for the store to complete (__fscache_wait_on_page_write()). (8) This blocks a slow-work processing thread - possibly against itself. The system ends up stuck because it can't write out any netfs pages to the cache without allocating more memory. To avoid this, we make FS-Cache cancel some writes that aren't in the middle of actually being performed. This means that some data won't make it into the cache this time. To support this, a new FS-Cache function is added fscache_maybe_release_page() that replaces what the netfs releasepage() functions used to do with respect to the cache. The decisions fscache_maybe_release_page() makes are counted and displayed through /proc/fs/fscache/stats on a line labelled "VmScan". There are four counters provided: "nos=N" - pages that weren't pending storage; "gon=N" - pages that were pending storage when we first looked, but weren't by the time we got the object lock; "bsy=N" - pages that we ignored as they were actively being written when we looked; and "can=N" - pages that we cancelled the storage of. What I'd really like to do is alter the behaviour of the cancellation heuristics, depending on how necessary it is to expel pages. If there are plenty of other pages that aren't waiting to be written to the cache that could be ejected first, then it would be nice to hold up on immediate cancellation of cache writes - but I don't see a way of doing that. 
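The conversions of 9p, AFS and NFS below all collapse to the same shape; as a rough sketch (my_netfs_release_page() and my_netfs_cookie() are invented stand-ins for however a particular netfs locates its cookie), a releasepage implementation now reads:

static int my_netfs_release_page(struct page *page, gfp_t gfp)
{
	struct fscache_cookie *cookie = my_netfs_cookie(page->mapping->host);

	if (PageFsCache(page)) {
		/* a false return means the page is actively being written to
		 * the cache and __GFP_WAIT wasn't given, so vmscan must keep
		 * the page for now */
		if (!fscache_maybe_release_page(cookie, page, gfp))
			return 0;
	}
	return 1;
}

Note that the explicit fscache_uncache_page()/ClearPageFsCache() calls disappear from the callers because fscache_maybe_release_page() uncaches the page itself once it decides the page can go.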
Signed-off-by: David Howells --- fs/9p/cache.c | 14 +-------- fs/afs/file.c | 15 ++-------- fs/fscache/internal.h | 5 ++++ fs/fscache/page.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-- fs/fscache/stats.c | 11 +++++++ fs/nfs/fscache.c | 10 ++----- 6 files changed, 100 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 51c94e26a34..bcc5357a906 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -343,18 +343,7 @@ int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) BUG_ON(!vcookie->fscache); - if (PageFsCache(page)) { - if (fscache_check_page_write(vcookie->fscache, page)) { - if (!(gfp & __GFP_WAIT)) - return 0; - fscache_wait_on_page_write(vcookie->fscache, page); - } - - fscache_uncache_page(vcookie->fscache, page); - ClearPageFsCache(page); - } - - return 1; + return fscache_maybe_release_page(vnode->cache, page, gfp); } void __v9fs_fscache_invalidate_page(struct page *page) @@ -368,7 +357,6 @@ void __v9fs_fscache_invalidate_page(struct page *page) fscache_wait_on_page_write(vcookie->fscache, page); BUG_ON(!PageLocked(page)); fscache_uncache_page(vcookie->fscache, page); - ClearPageFsCache(page); } } diff --git a/fs/afs/file.c b/fs/afs/file.c index 681c2a7b013..39b301662f2 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -315,7 +315,6 @@ static void afs_invalidatepage(struct page *page, unsigned long offset) struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); fscache_wait_on_page_write(vnode->cache, page); fscache_uncache_page(vnode->cache, page); - ClearPageFsCache(page); } #endif @@ -349,17 +348,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) /* deny if page is being written to the cache and the caller hasn't * elected to wait */ #ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page)) { - if (fscache_check_page_write(vnode->cache, page)) { - if (!(gfp_flags & __GFP_WAIT)) { - _leave(" = F [cache busy]"); - return 0; - } - fscache_wait_on_page_write(vnode->cache, page); - } - - fscache_uncache_page(vnode->cache, page); - ClearPageFsCache(page); + if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) { + _leave(" = F [cache busy]"); + return 0; } #endif diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index a0769872b19..e5046519b15 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -180,6 +180,11 @@ extern atomic_t fscache_n_store_pages; extern atomic_t fscache_n_store_radix_deletes; extern atomic_t fscache_n_store_pages_over_limit; +extern atomic_t fscache_n_store_vmscan_not_storing; +extern atomic_t fscache_n_store_vmscan_gone; +extern atomic_t fscache_n_store_vmscan_busy; +extern atomic_t fscache_n_store_vmscan_cancelled; + extern atomic_t fscache_n_marks; extern atomic_t fscache_n_uncaches; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 022a5da8e13..fc76798bd96 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -42,6 +42,75 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa } EXPORT_SYMBOL(__fscache_wait_on_page_write); +/* + * decide whether a page can be released, possibly by cancelling a store to it + * - we're allowed to sleep if __GFP_WAIT is flagged + */ +bool __fscache_maybe_release_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct page *xpage; + void *val; + + _enter("%p,%p,%x", cookie, page, gfp); + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + if (!val) { + rcu_read_unlock(); + fscache_stat(&fscache_n_store_vmscan_not_storing); + 
__fscache_uncache_page(cookie, page); + return true; + } + + /* see if the page is actually undergoing storage - if so we can't get + * rid of it till the cache has finished with it */ + if (radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG)) { + rcu_read_unlock(); + goto page_busy; + } + + /* the page is pending storage, so we attempt to cancel the store and + * discard the store request so that the page can be reclaimed */ + spin_lock(&cookie->stores_lock); + rcu_read_unlock(); + + if (radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG)) { + /* the page started to undergo storage whilst we were looking, + * so now we can only wait or return */ + spin_unlock(&cookie->stores_lock); + goto page_busy; + } + + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); + + if (xpage) { + fscache_stat(&fscache_n_store_vmscan_cancelled); + fscache_stat(&fscache_n_store_radix_deletes); + ASSERTCMP(xpage, ==, page); + } else { + fscache_stat(&fscache_n_store_vmscan_gone); + } + + wake_up_bit(&cookie->flags, 0); + if (xpage) + page_cache_release(xpage); + __fscache_uncache_page(cookie, page); + return true; + +page_busy: + /* we might want to wait here, but that could deadlock the allocator as + * the slow-work threads writing to the cache may all end up sleeping + * on memory allocation */ + fscache_stat(&fscache_n_store_vmscan_busy); + return false; +} +EXPORT_SYMBOL(__fscache_maybe_release_page); + /* * note that a page has finished being written to the cache */ @@ -57,6 +126,8 @@ static void fscache_end_page_write(struct fscache_object *object, /* delete the page from the tree if it is now no longer * pending */ spin_lock(&cookie->stores_lock); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); if (!radix_tree_tag_get(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG)) { fscache_stat(&fscache_n_store_radix_deletes); @@ -640,8 +711,12 @@ static void fscache_write_op(struct fscache_operation *_op) goto superseded; } - radix_tree_tag_clear(&cookie->stores, page->index, - FSCACHE_COOKIE_PENDING_TAG); + if (page) { + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + } spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 045ba396dbf..cda69994e06 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -63,6 +63,11 @@ atomic_t fscache_n_store_pages; atomic_t fscache_n_store_radix_deletes; atomic_t fscache_n_store_pages_over_limit; +atomic_t fscache_n_store_vmscan_not_storing; +atomic_t fscache_n_store_vmscan_gone; +atomic_t fscache_n_store_vmscan_busy; +atomic_t fscache_n_store_vmscan_cancelled; + atomic_t fscache_n_marks; atomic_t fscache_n_uncaches; @@ -211,6 +216,12 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_store_radix_deletes), atomic_read(&fscache_n_store_pages_over_limit)); + seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n", + atomic_read(&fscache_n_store_vmscan_not_storing), + atomic_read(&fscache_n_store_vmscan_gone), + atomic_read(&fscache_n_store_vmscan_busy), + atomic_read(&fscache_n_store_vmscan_cancelled)); + seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", atomic_read(&fscache_n_op_pend), atomic_read(&fscache_n_op_run), diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 70fad69eb95..fa588006588 100644 --- 
a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -359,17 +359,13 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp) BUG_ON(!cookie); - if (fscache_check_page_write(cookie, page)) { - if (!(gfp & __GFP_WAIT)) - return 0; - fscache_wait_on_page_write(cookie, page); - } - if (PageFsCache(page)) { dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", cookie, page, nfsi); - fscache_uncache_page(cookie, page); + if (!fscache_maybe_release_page(cookie, page, gfp)) + return 0; + nfs_add_fscache_stats(page->mapping->host, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); } -- cgit v1.2.3 From 2175bb06dc6cf2af9c098a1770561f9e63edae4e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:38 +0000 Subject: FS-Cache: Add a retirement stat counter Add a stat counter to count retirement events rather than ordinary release events (the retire argument to fscache_relinquish_cookie()). Signed-off-by: David Howells --- fs/fscache/cookie.c | 2 ++ fs/fscache/internal.h | 1 + fs/fscache/stats.c | 6 ++++-- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index f979659c1b3..990535071a8 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -415,6 +415,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) unsigned long event; fscache_stat(&fscache_n_relinquishes); + if (retire) + fscache_stat(&fscache_n_relinquishes_retire); if (!cookie) { fscache_stat(&fscache_n_relinquishes_null); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index e5046519b15..2bf463d2608 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -202,6 +202,7 @@ extern atomic_t fscache_n_updates_run; extern atomic_t fscache_n_relinquishes; extern atomic_t fscache_n_relinquishes_null; extern atomic_t fscache_n_relinquishes_waitcrt; +extern atomic_t fscache_n_relinquishes_retire; extern atomic_t fscache_n_cookie_index; extern atomic_t fscache_n_cookie_data; diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index cda69994e06..9e15289eb5c 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -85,6 +85,7 @@ atomic_t fscache_n_updates_run; atomic_t fscache_n_relinquishes; atomic_t fscache_n_relinquishes_null; atomic_t fscache_n_relinquishes_waitcrt; +atomic_t fscache_n_relinquishes_retire; atomic_t fscache_n_cookie_index; atomic_t fscache_n_cookie_data; @@ -168,10 +169,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_updates_null), atomic_read(&fscache_n_updates_run)); - seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n", + seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n", atomic_read(&fscache_n_relinquishes), atomic_read(&fscache_n_relinquishes_null), - atomic_read(&fscache_n_relinquishes_waitcrt)); + atomic_read(&fscache_n_relinquishes_waitcrt), + atomic_read(&fscache_n_relinquishes_retire)); seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", atomic_read(&fscache_n_attr_changed), -- cgit v1.2.3 From d461d26dde901b0523c46b0317e7fccf574a3933 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:41 +0000 Subject: FS-Cache: Make sure FSCACHE_COOKIE_LOOKING_UP cleared on lookup failure We must make sure that FSCACHE_COOKIE_LOOKING_UP is cleared on lookup failure (if an object reaches the LC_DYING state), and we should clear it before clearing FSCACHE_COOKIE_CREATING. 
If this doesn't happen then fscache_wait_for_deferred_lookup() may hold allocation and retrieval operations indefinitely until they're interrupted by signals - which in turn pins the dying object until they go away. Signed-off-by: David Howells --- fs/fscache/object.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 1a1afa82f79..74bc562a2cb 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -105,6 +105,7 @@ static inline void fscache_done_parent_op(struct fscache_object *object) static void fscache_object_state_machine(struct fscache_object *object) { enum fscache_object_state new_state; + struct fscache_cookie *cookie; ASSERT(object != NULL); @@ -158,11 +159,17 @@ static void fscache_object_state_machine(struct fscache_object *object) spin_lock(&object->lock); object->state = FSCACHE_OBJECT_DYING; - if (object->cookie && - test_and_clear_bit(FSCACHE_COOKIE_CREATING, - &object->cookie->flags)) - wake_up_bit(&object->cookie->flags, - FSCACHE_COOKIE_CREATING); + cookie = object->cookie; + if (cookie) { + if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, + &cookie->flags)) + wake_up_bit(&cookie->flags, + FSCACHE_COOKIE_LOOKING_UP); + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + &cookie->flags)) + wake_up_bit(&cookie->flags, + FSCACHE_COOKIE_CREATING); + } spin_unlock(&object->lock); fscache_done_parent_op(object); -- cgit v1.2.3 From 60d543ca724be155c2b6166e36a00c80b21bd810 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:45 +0000 Subject: FS-Cache: Start processing an object's operations on that object's death Start processing an object's operations when that object moves into the DYING state as the object cannot be destroyed until all its outstanding operations have completed. Furthermore, make sure that read and allocation operations handle being woken up on a dead object. Such events are recorded in the Allocs.abt and Retrvls.abt statistics as viewable through /proc/fs/fscache/stats. The code for waiting for object activation for the read and allocation operations is also extracted into its own function as it is much the same in all cases, differing only in the stats incremented. 
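Condensed for illustration (the real helper added below, fscache_wait_for_retrieval_activation(), also folds in the per-caller statistics), the wait-then-check-for-death pattern now shared by the read and allocation paths is:

static int wait_for_activation(struct fscache_object *object,
			       struct fscache_operation *op)
{
	if (test_bit(FSCACHE_OP_WAITING, &op->flags) &&
	    wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
			fscache_wait_bit_interruptible,
			TASK_INTERRUPTIBLE) < 0) {
		/* interrupted: try to cancel the op; if that fails it has
		 * already been taken off the pending queue and will run
		 * shortly, so wait for it uninterruptibly */
		if (fscache_cancel_op(op) == 0)
			return -ERESTARTSYS;
		wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
	}

	/* the object may have died whilst we were asleep */
	if (fscache_object_is_dead(object))
		return -ENOBUFS;	/* counted as Allocs.abt or Retrvls.abt */
	return 0;
}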
Signed-off-by: David Howells --- fs/fscache/internal.h | 5 +++ fs/fscache/object.c | 1 + fs/fscache/page.c | 112 +++++++++++++++++++++++++------------------------- fs/fscache/stats.c | 12 ++++-- 4 files changed, 69 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 2bf463d2608..5b49a373689 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -156,6 +156,7 @@ extern atomic_t fscache_n_allocs_ok; extern atomic_t fscache_n_allocs_wait; extern atomic_t fscache_n_allocs_nobufs; extern atomic_t fscache_n_allocs_intr; +extern atomic_t fscache_n_allocs_object_dead; extern atomic_t fscache_n_alloc_ops; extern atomic_t fscache_n_alloc_op_waits; @@ -166,6 +167,7 @@ extern atomic_t fscache_n_retrievals_nodata; extern atomic_t fscache_n_retrievals_nobufs; extern atomic_t fscache_n_retrievals_intr; extern atomic_t fscache_n_retrievals_nomem; +extern atomic_t fscache_n_retrievals_object_dead; extern atomic_t fscache_n_retrieval_ops; extern atomic_t fscache_n_retrieval_op_waits; @@ -249,9 +251,12 @@ static inline void fscache_stat_d(atomic_t *stat) atomic_dec(stat); } +#define __fscache_stat(stat) (stat) + extern const struct file_operations fscache_stats_fops; #else +#define __fscache_stat(stat) (NULL) #define fscache_stat(stat) do {} while (0) #endif diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 74bc562a2cb..c85c9f58216 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -201,6 +201,7 @@ static void fscache_object_state_machine(struct fscache_object *object) } spin_unlock(&object->lock); fscache_enqueue_dependents(object); + fscache_start_operations(object); goto terminal_transit; /* handle an abort during initialisation */ diff --git a/fs/fscache/page.c b/fs/fscache/page.c index fc76798bd96..c598ea4c4e7 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -313,6 +313,43 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) return 0; } +/* + * wait for an object to become active (or dead) + */ +static int fscache_wait_for_retrieval_activation(struct fscache_object *object, + struct fscache_retrieval *op, + atomic_t *stat_op_waits, + atomic_t *stat_object_dead) +{ + int ret; + + if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags)) + goto check_if_dead; + + _debug(">>> WT"); + fscache_stat(stat_op_waits); + if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) < 0) { + ret = fscache_cancel_op(&op->op); + if (ret == 0) + return -ERESTARTSYS; + + /* it's been removed from the pending queue by another party, + * so we should get to run shortly */ + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } + _debug("<<< GO"); + +check_if_dead: + if (unlikely(fscache_object_is_dead(object))) { + fscache_stat(stat_object_dead); + return -ENOBUFS; + } + return 0; +} + /* * read a page from the cache or allocate a block in which to store it * - we return: @@ -376,25 +413,12 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ - if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { - _debug(">>> WT"); - fscache_stat(&fscache_n_retrieval_op_waits); - if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit_interruptible, - TASK_INTERRUPTIBLE) < 0) { - ret = fscache_cancel_op(&op->op); - if (ret == 0) { - ret = -ERESTARTSYS; - goto error; - } - - /* it's been 
removed from the pending queue by another - * party, so we should get to run shortly */ - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); - } - _debug("<<< GO"); - } + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto error; /* ask the cache to honour the operation */ if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { @@ -506,25 +530,12 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ - if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { - _debug(">>> WT"); - fscache_stat(&fscache_n_retrieval_op_waits); - if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit_interruptible, - TASK_INTERRUPTIBLE) < 0) { - ret = fscache_cancel_op(&op->op); - if (ret == 0) { - ret = -ERESTARTSYS; - goto error; - } - - /* it's been removed from the pending queue by another - * party, so we should get to run shortly */ - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); - } - _debug("<<< GO"); - } + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto error; /* ask the cache to honour the operation */ if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { @@ -612,25 +623,12 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_alloc_ops); - if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { - _debug(">>> WT"); - fscache_stat(&fscache_n_alloc_op_waits); - if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit_interruptible, - TASK_INTERRUPTIBLE) < 0) { - ret = fscache_cancel_op(&op->op); - if (ret == 0) { - ret = -ERESTARTSYS; - goto error; - } - - /* it's been removed from the pending queue by another - * party, so we should get to run shortly */ - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); - } - _debug("<<< GO"); - } + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_alloc_op_waits), + __fscache_stat(&fscache_n_allocs_object_dead)); + if (ret < 0) + goto error; /* ask the cache to honour the operation */ fscache_stat(&fscache_n_cop_allocate_page); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 9e15289eb5c..05f77caf4a2 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -39,6 +39,7 @@ atomic_t fscache_n_allocs_ok; atomic_t fscache_n_allocs_wait; atomic_t fscache_n_allocs_nobufs; atomic_t fscache_n_allocs_intr; +atomic_t fscache_n_allocs_object_dead; atomic_t fscache_n_alloc_ops; atomic_t fscache_n_alloc_op_waits; @@ -49,6 +50,7 @@ atomic_t fscache_n_retrievals_nodata; atomic_t fscache_n_retrievals_nobufs; atomic_t fscache_n_retrievals_intr; atomic_t fscache_n_retrievals_nomem; +atomic_t fscache_n_retrievals_object_dead; atomic_t fscache_n_retrieval_ops; atomic_t fscache_n_retrieval_op_waits; @@ -188,9 +190,10 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_allocs_wait), atomic_read(&fscache_n_allocs_nobufs), atomic_read(&fscache_n_allocs_intr)); - seq_printf(m, "Allocs : ops=%u owt=%u\n", + seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n", atomic_read(&fscache_n_alloc_ops), - 
atomic_read(&fscache_n_alloc_op_waits)); + atomic_read(&fscache_n_alloc_op_waits), + atomic_read(&fscache_n_allocs_object_dead)); seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" " int=%u oom=%u\n", @@ -201,9 +204,10 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_retrievals_nobufs), atomic_read(&fscache_n_retrievals_intr), atomic_read(&fscache_n_retrievals_nomem)); - seq_printf(m, "Retrvls: ops=%u owt=%u\n", + seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n", atomic_read(&fscache_n_retrieval_ops), - atomic_read(&fscache_n_retrieval_op_waits)); + atomic_read(&fscache_n_retrieval_op_waits), + atomic_read(&fscache_n_retrievals_object_dead)); seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", atomic_read(&fscache_n_stores), -- cgit v1.2.3 From 868411be3f445a83fafbd734f3e426400138add5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:48 +0000 Subject: FS-Cache: Actually requeue an object when requested FS-Cache objects have an FSCACHE_OBJECT_EV_REQUEUE event that can theoretically be raised to ask the state machine to requeue the object for further processing before the work function returns to the slow-work facility. However, fscache_object_work_execute() was clearing that bit before checking the event mask to see whether the object has any pending events that require it to be requeued immediately. Instead, the bit should be cleared after the check and enqueue. Signed-off-by: David Howells --- fs/fscache/object.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index c85c9f58216..f3f952cf887 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -353,13 +353,12 @@ static void fscache_object_slow_work_execute(struct slow_work *work) _enter("{OBJ%x}", object->debug_id); - clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); - start = jiffies; fscache_object_state_machine(object); fscache_hist(fscache_objs_histogram, start); if (object->events & object->event_mask) fscache_enqueue_object(object); + clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); } /* -- cgit v1.2.3 From a17754fb8c28af19cd70dcbec6d5b0773b94e0c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:52 +0000 Subject: CacheFiles: Don't write a full page if there's only a partial page to cache cachefiles_write_page() writes a full page to the backing file for the last page of the netfs file, even if the netfs file's last page is only a partial page. This causes the EOF on the backing file to be extended beyond the EOF of the netfs, and thus the backing file will be truncated by cachefiles_attr_changed() called from cachefiles_lookup_object(). So we need to limit the write we make to the backing file on that last page such that it doesn't push the EOF too far. Also, if a backing file that has a partial page at the end is expanded, we discard the partial page and refetch it on the basis that we then have a hole in the file with invalid data, and should the power go out... A better way to deal with this could be to record a note that the partial page contains invalid data until the correct data is written into it. This isn't a problem for netfs's that discard the whole backing file if the file size changes (such as NFS). 
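As a rough worked example of the write clamping added to cachefiles_write_page() below (backing_write_len() is an invented helper; only the arithmetic mirrors the patch):

static size_t backing_write_len(pgoff_t index, loff_t eof)
{
	loff_t pos = (loff_t)index << PAGE_SHIFT;	/* start of this page */
	size_t len = PAGE_SIZE;

	/* if the netfs EOF (the store limit) falls inside this page, don't
	 * write past it, otherwise the backing file is extended beyond the
	 * netfs file and later gets truncated again */
	if ((eof & ~PAGE_MASK) && eof - pos < PAGE_SIZE)
		len = eof - pos;
	return len;
}

For instance, with 4096-byte pages and a 10000-byte netfs file, the last page has index 2: pos = 8192 and eof = 10000, so only 1808 bytes are written instead of a full page, leaving the backing file's EOF at 10000 rather than pushing it out to 12288.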
Signed-off-by: David Howells --- fs/cachefiles/interface.c | 20 +++++++++++++++++--- fs/cachefiles/rdwr.c | 23 +++++++++++++++++++---- 2 files changed, 36 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index dd7f852746c..8e67abf0598 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -404,12 +404,26 @@ static int cachefiles_attr_changed(struct fscache_object *_object) if (oi_size == ni_size) return 0; - newattrs.ia_size = ni_size; - newattrs.ia_valid = ATTR_SIZE; - cachefiles_begin_secure(cache, &saved_cred); mutex_lock(&object->backer->d_inode->i_mutex); + + /* if there's an extension to a partial page at the end of the backing + * file, we need to discard the partial page so that we pick up new + * data after it */ + if (oi_size & ~PAGE_MASK && ni_size > oi_size) { + _debug("discard tail %llx", oi_size); + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = oi_size & PAGE_MASK; + ret = notify_change(object->backer, &newattrs); + if (ret < 0) + goto truncate_failed; + } + + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = ni_size; ret = notify_change(object->backer, &newattrs); + +truncate_failed: mutex_unlock(&object->backer->d_inode->i_mutex); cachefiles_end_secure(cache, saved_cred); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 3304646dae8..5a84fd7109a 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -803,7 +803,8 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) struct cachefiles_cache *cache; mm_segment_t old_fs; struct file *file; - loff_t pos; + loff_t pos, eof; + size_t len; void *data; int ret; @@ -837,15 +838,29 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) ret = -EIO; if (file->f_op->write) { pos = (loff_t) page->index << PAGE_SHIFT; + + /* we mustn't write more data than we have, so we have + * to beware of a partial page at EOF */ + eof = object->fscache.store_limit_l; + len = PAGE_SIZE; + if (eof & ~PAGE_MASK) { + ASSERTCMP(pos, <, eof); + if (eof - pos < PAGE_SIZE) { + _debug("cut short %llx to %llx", + pos, eof); + len = eof - pos; + ASSERTCMP(pos + len, ==, eof); + } + } + data = kmap(page); old_fs = get_fs(); set_fs(KERNEL_DS); ret = file->f_op->write( - file, (const void __user *) data, PAGE_SIZE, - &pos); + file, (const void __user *) data, len, &pos); set_fs(old_fs); kunmap(page); - if (ret != PAGE_SIZE) + if (ret != len) ret = -EIO; } fput(file); -- cgit v1.2.3 From 5e929b33c3935ecb029b3e495356b2b8af432efa Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:55 +0000 Subject: CacheFiles: Handle truncate unlocking the page we're reading Handle truncate unlocking the page we're attempting to read from the backing device before the read has completed. This was causing reports like the following to occur: Pid: 4765, comm: kslowd Not tainted 2.6.30.1 #1 Call Trace: [] ? cachefiles_read_waiter+0xd9/0x147 [cachefiles] [] ? __wait_on_bit+0x60/0x6f [] ? __wake_up_common+0x3f/0x71 [] ? __wake_up+0x30/0x44 [] ? __wake_up_bit+0x28/0x2d [] ? ext3_truncate+0x4d7/0x8ed [ext3] [] ? pagevec_lookup+0x17/0x1f [] ? unmap_mapping_range+0x59/0x1ff [] ? __wake_up+0x30/0x44 [] ? vmtruncate+0xc2/0xe2 [] ? inode_setattr+0x22/0x10a [] ? ext3_setattr+0x17b/0x1e6 [ext3] [] ? notify_change+0x186/0x2c9 [] ? cachefiles_attr_changed+0x133/0x1cd [cachefiles] [] ? cachefiles_lookup_object+0xcf/0x12a [cachefiles] [] ? fscache_lookup_object+0x110/0x122 [fscache] [] ? 
fscache_object_slow_work_execute+0x590/0x6bc [fscache] [] ? slow_work_thread+0x285/0x43a [] ? autoremove_wake_function+0x0/0x2e [] ? slow_work_thread+0x0/0x43a [] ? kthread+0x54/0x81 [] ? child_rip+0xa/0x20 [] ? kthread+0x0/0x81 [] ? child_rip+0x0/0x20 CacheFiles: I/O Error: Readpage failed on backing file 200000000000810 FS-Cache: Cache cachefiles stopped due to I/O error Reported-by: Christian Kujau Reported-by: Takashi Iwai Reported-by: Duc Le Minh Signed-off-by: David Howells --- fs/cachefiles/rdwr.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 5a84fd7109a..1d833256386 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -40,8 +40,10 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, _debug("--- monitor %p %lx ---", page, page->flags); - if (!PageUptodate(page) && !PageError(page)) - dump_stack(); + if (!PageUptodate(page) && !PageError(page)) { + /* unlocked, not uptodate and not erronous? */ + _debug("page probably truncated"); + } /* remove from the waitqueue */ list_del(&wait->task_list); @@ -60,6 +62,84 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, return 0; } +/* + * handle a probably truncated page + * - check to see if the page is still relevant and reissue the read if + * possible + * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we + * must wait again and 0 if successful + */ +static int cachefiles_read_reissue(struct cachefiles_object *object, + struct cachefiles_one_read *monitor) +{ + struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct page *backpage = monitor->back_page, *backpage2; + int ret; + + kenter("{ino=%lx},{%lx,%lx}", + object->backer->d_inode->i_ino, + backpage->index, backpage->flags); + + /* skip if the page was truncated away completely */ + if (backpage->mapping != bmapping) { + kleave(" = -ENODATA [mapping]"); + return -ENODATA; + } + + backpage2 = find_get_page(bmapping, backpage->index); + if (!backpage2) { + kleave(" = -ENODATA [gone]"); + return -ENODATA; + } + + if (backpage != backpage2) { + put_page(backpage2); + kleave(" = -ENODATA [different]"); + return -ENODATA; + } + + /* the page is still there and we already have a ref on it, so we don't + * need a second */ + put_page(backpage2); + + INIT_LIST_HEAD(&monitor->op_link); + add_page_wait_queue(backpage, &monitor->monitor); + + if (trylock_page(backpage)) { + ret = -EIO; + if (PageError(backpage)) + goto unlock_discard; + ret = 0; + if (PageUptodate(backpage)) + goto unlock_discard; + + kdebug("reissue read"); + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto unlock_discard; + } + + /* but the page may have been read before the monitor was installed, so + * the monitor may miss the event - so we have to ensure that we do get + * one in such a case */ + if (trylock_page(backpage)) { + _debug("jumpstart %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + + /* it'll reappear on the todo list */ + kleave(" = -EINPROGRESS"); + return -EINPROGRESS; + +unlock_discard: + unlock_page(backpage); + spin_lock_irq(&object->work_lock); + list_del(&monitor->op_link); + spin_unlock_irq(&object->work_lock); + kleave(" = %d", ret); + return ret; +} + /* * copy data from backing pages to netfs pages to complete a read operation * - driven by FS-Cache's thread pool @@ -92,20 +172,26 @@ static void cachefiles_read_copier(struct 
fscache_operation *_op) _debug("- copy {%lu}", monitor->back_page->index); - error = -EIO; + recheck: if (PageUptodate(monitor->back_page)) { copy_highpage(monitor->netfs_page, monitor->back_page); pagevec_add(&pagevec, monitor->netfs_page); fscache_mark_pages_cached(monitor->op, &pagevec); error = 0; - } - - if (error) + } else if (!PageError(monitor->back_page)) { + /* the page has probably been truncated */ + error = cachefiles_read_reissue(object, monitor); + if (error == -EINPROGRESS) + goto next; + goto recheck; + } else { cachefiles_io_error_obj( object, "Readpage failed on backing file %lx", (unsigned long) monitor->back_page->flags); + error = -EIO; + } page_cache_release(monitor->back_page); @@ -114,6 +200,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op) fscache_put_retrieval(op); kfree(monitor); + next: /* let the thread pool have some air occasionally */ max--; if (max < 0 || need_resched()) { -- cgit v1.2.3 From 6511de33c877a53b3df545bc06c29e0f272837ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:58 +0000 Subject: CacheFiles: Mark parent directory locks as I_MUTEX_PARENT to keep lockdep happy Mark parent directory locks as I_MUTEX_PARENT in the callers of cachefiles_bury_object() so that lockdep doesn't complain when that invokes vfs_unlink(): ============================================= [ INFO: possible recursive locking detected ] 2.6.32-rc6-cachefs #47 --------------------------------------------- kslowd002/3089 is trying to acquire lock: (&sb->s_type->i_mutex_key#7){+.+.+.}, at: [] vfs_unlink+0x8b/0x128 but task is already holding lock: (&sb->s_type->i_mutex_key#7){+.+.+.}, at: [] cachefiles_walk_to_object+0x1b0/0x831 [cachefiles] other info that might help us debug this: 1 lock held by kslowd002/3089: #0: (&sb->s_type->i_mutex_key#7){+.+.+.}, at: [] cachefiles_walk_to_object+0x1b0/0x831 [cachefiles] stack backtrace: Pid: 3089, comm: kslowd002 Not tainted 2.6.32-rc6-cachefs #47 Call Trace: [] __lock_acquire+0x1649/0x16e3 [] ? inode_has_perm+0x5f/0x61 [] lock_acquire+0x57/0x6d [] ? vfs_unlink+0x8b/0x128 [] mutex_lock_nested+0x54/0x292 [] ? vfs_unlink+0x8b/0x128 [] ? selinux_inode_permission+0x8e/0x90 [] ? security_inode_permission+0x1c/0x1e [] ? inode_permission+0x99/0xa5 [] vfs_unlink+0x8b/0x128 [] ? kfree+0xed/0xf9 [] cachefiles_bury_object+0xb6/0x420 [cachefiles] [] ? trace_hardirqs_on+0xd/0xf [] ? cachefiles_check_object_xattr+0x233/0x293 [cachefiles] [] cachefiles_walk_to_object+0x4ff/0x831 [cachefiles] [] ? finish_task_switch+0x0/0xb2 [] cachefiles_lookup_object+0xac/0x12a [cachefiles] [] fscache_lookup_object+0x1c7/0x214 [fscache] [] fscache_object_state_machine+0xa5/0x52d [fscache] [] fscache_object_slow_work_execute+0x5f/0xa0 [fscache] [] slow_work_execute+0x18f/0x2d1 [] slow_work_thread+0x1c5/0x308 [] ? autoremove_wake_function+0x0/0x34 [] ? slow_work_thread+0x0/0x308 [] kthread+0x7a/0x82 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? kthread+0x0/0x82 [] ? 
child_rip+0x0/0x20 Signed-off-by: David Howells --- fs/cachefiles/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 4ce818ae39e..3df86952ca6 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -254,7 +254,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); ret = cachefiles_bury_object(cache, dir, object->dentry); dput(dir); @@ -307,7 +307,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); -- cgit v1.2.3 From d0e27b7808dc667f3015be0b6888f6d680e222c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:12:02 +0000 Subject: CacheFiles: Better showing of debugging information in active object problems Show more debugging information if cachefiles_mark_object_active() is asked to activate an active object. This may happen, for instance, if the netfs tries to register an object with the same key multiple times. The code is changed to (a) get the appropriate object lock to protect the cookie pointer whilst we dereference it, and (b) get and display the cookie key if available. Signed-off-by: David Howells --- fs/cachefiles/namei.c | 102 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 3df86952ca6..00a0cda8f47 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -27,6 +27,76 @@ static int cachefiles_wait_bit(void *flags) return 0; } +#define CACHEFILES_KEYBUF_SIZE 512 + +/* + * dump debugging info about an object + */ +static noinline +void __cachefiles_printk_object(struct cachefiles_object *object, + const char *prefix, + u8 *keybuf) +{ + struct fscache_cookie *cookie; + unsigned keylen, loop; + + printk(KERN_ERR "%sobject: OBJ%x\n", + prefix, object->fscache.debug_id); + printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n", + prefix, fscache_object_states[object->fscache.state], + object->fscache.flags, object->fscache.work.flags, + object->fscache.events, + object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); + printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", + prefix, object->fscache.n_ops, object->fscache.n_in_progress, + object->fscache.n_exclusive); + printk(KERN_ERR "%sparent=%p\n", + prefix, object->fscache.parent); + + spin_lock(&object->fscache.lock); + cookie = object->fscache.cookie; + if (cookie) { + printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n", + prefix, + object->fscache.cookie, + object->fscache.cookie->parent, + object->fscache.cookie->netfs_data, + object->fscache.cookie->flags); + if (keybuf) + keylen = cookie->def->get_key(cookie->netfs_data, keybuf, + CACHEFILES_KEYBUF_SIZE); + else + keylen = 0; + } else { + printk(KERN_ERR "%scookie=NULL\n", prefix); + keylen = 0; + } + spin_unlock(&object->fscache.lock); + + if (keylen) { + printk(KERN_ERR "%skey=[%u] '", prefix, keylen); + for (loop = 0; loop < keylen; loop++) + printk("%02x", keybuf[loop]); + printk("'\n"); + } +} + +/* + * dump debugging info about a pair of objects + */ +static noinline void cachefiles_printk_object(struct cachefiles_object *object, + struct cachefiles_object 
*xobject) +{ + u8 *keybuf; + + keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); + if (object) + __cachefiles_printk_object(object, "", keybuf); + if (xobject) + __cachefiles_printk_object(xobject, "x", keybuf); + kfree(keybuf); +} + /* * record the fact that an object is now active */ @@ -42,8 +112,11 @@ static void cachefiles_mark_object_active(struct cachefiles_cache *cache, try_again: write_lock(&cache->active_lock); - if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) + if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { + printk(KERN_ERR "CacheFiles: Error: Object already active\n"); + cachefiles_printk_object(object, NULL); BUG(); + } dentry = object->dentry; _p = &cache->active_nodes.rb_node; @@ -76,32 +149,7 @@ wait_for_old_object: printk(KERN_ERR "\n"); printk(KERN_ERR "CacheFiles: Error:" " Unexpected object collision\n"); - printk(KERN_ERR "xobject: OBJ%x\n", - xobject->fscache.debug_id); - printk(KERN_ERR "xobjstate=%s\n", - fscache_object_states[xobject->fscache.state]); - printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags); - printk(KERN_ERR "xobjevent=%lx [%lx]\n", - xobject->fscache.events, xobject->fscache.event_mask); - printk(KERN_ERR "xops=%u inp=%u exc=%u\n", - xobject->fscache.n_ops, xobject->fscache.n_in_progress, - xobject->fscache.n_exclusive); - printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n", - xobject->fscache.cookie, - xobject->fscache.cookie->parent, - xobject->fscache.cookie->netfs_data, - xobject->fscache.cookie->flags); - printk(KERN_ERR "xparent=%p\n", - xobject->fscache.parent); - printk(KERN_ERR "object: OBJ%x\n", - object->fscache.debug_id); - printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n", - object->fscache.cookie, - object->fscache.cookie->parent, - object->fscache.cookie->netfs_data, - object->fscache.cookie->flags); - printk(KERN_ERR "parent=%p\n", - object->fscache.parent); + cachefiles_printk_object(object, xobject); BUG(); } atomic_inc(&xobject->usage); -- cgit v1.2.3 From fee096deb4f33897937b974cb2c5168bab7935be Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:12:05 +0000 Subject: CacheFiles: Catch an overly long wait for an old active object Catch an overly long wait for an old, dying active object when we want to replace it with a new one. The probability is that all the slow-work threads are hogged, and the delete can't get a look in. What we do instead is: (1) if there's nothing in the slow work queue, we sleep until either the dying object has finished dying or there is something in the slow work queue behind which we can queue our object. (2) if there is something in the slow work queue, we return ETIMEDOUT to fscache_lookup_object(), which then puts us back on the slow work queue, presumably behind the deletion that we're blocked by. We are then deferred for a while until we work our way back through the queue - without blocking a slow-work thread unnecessarily. A backtrace similar to the following may appear in the log without this patch: INFO: task kslowd004:5711 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kslowd004 D 0000000000000000 0 5711 2 0x00000080 ffff88000340bb80 0000000000000046 ffff88002550d000 0000000000000000 ffff88002550d000 0000000000000007 ffff88000340bfd8 ffff88002550d2a8 000000000000ddf0 00000000000118c0 00000000000118c0 ffff88002550d2a8 Call Trace: [] ? trace_hardirqs_on+0xd/0xf [] ? 
cachefiles_wait_bit+0x0/0xd [cachefiles] [] cachefiles_wait_bit+0x9/0xd [cachefiles] [] __wait_on_bit+0x43/0x76 [] ? ext3_xattr_get+0x1ec/0x270 [] out_of_line_wait_on_bit+0x69/0x74 [] ? cachefiles_wait_bit+0x0/0xd [cachefiles] [] ? wake_bit_function+0x0/0x2e [] cachefiles_mark_object_active+0x203/0x23b [cachefiles] [] cachefiles_walk_to_object+0x558/0x827 [cachefiles] [] cachefiles_lookup_object+0xac/0x12a [cachefiles] [] fscache_lookup_object+0x1c7/0x214 [fscache] [] fscache_object_state_machine+0xa5/0x52d [fscache] [] fscache_object_slow_work_execute+0x5f/0xa0 [fscache] [] slow_work_execute+0x18f/0x2d1 [] slow_work_thread+0x1c5/0x308 [] ? autoremove_wake_function+0x0/0x34 [] ? slow_work_thread+0x0/0x308 [] kthread+0x7a/0x82 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? kthread+0x0/0x82 [] ? child_rip+0x0/0x20 1 lock held by kslowd004/5711: #0: (&sb->s_type->i_mutex_key#7/1){+.+.+.}, at: [] cachefiles_walk_to_object+0x1b3/0x827 [cachefiles] Signed-off-by: David Howells --- fs/cachefiles/interface.c | 6 ++-- fs/cachefiles/namei.c | 87 ++++++++++++++++++++++++++++++++++++----------- fs/fscache/internal.h | 1 + fs/fscache/object.c | 10 +++++- fs/fscache/stats.c | 4 ++- 5 files changed, 85 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 8e67abf0598..9d3c426044a 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -114,8 +114,9 @@ nomem_lookup_data: /* * attempt to look up the nominated node in this cache + * - return -ETIMEDOUT to be scheduled again */ -static void cachefiles_lookup_object(struct fscache_object *_object) +static int cachefiles_lookup_object(struct fscache_object *_object) { struct cachefiles_lookup_data *lookup_data; struct cachefiles_object *parent, *object; @@ -145,13 +146,14 @@ static void cachefiles_lookup_object(struct fscache_object *_object) object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) cachefiles_attr_changed(&object->fscache); - if (ret < 0) { + if (ret < 0 && ret != -ETIMEDOUT) { printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n", ret); fscache_object_lookup_error(&object->fscache); } _leave(" [%d]", ret); + return ret; } /* diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 00a0cda8f47..14ac4806e29 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -21,12 +21,6 @@ #include #include "internal.h" -static int cachefiles_wait_bit(void *flags) -{ - schedule(); - return 0; -} - #define CACHEFILES_KEYBUF_SIZE 512 /* @@ -100,8 +94,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object, /* * record the fact that an object is now active */ -static void cachefiles_mark_object_active(struct cachefiles_cache *cache, - struct cachefiles_object *object) +static int cachefiles_mark_object_active(struct cachefiles_cache *cache, + struct cachefiles_object *object) { struct cachefiles_object *xobject; struct rb_node **_p, *_parent = NULL; @@ -139,8 +133,8 @@ try_again: rb_insert_color(&object->active_node, &cache->active_nodes); write_unlock(&cache->active_lock); - _leave(""); - return; + _leave(" = 0"); + return 0; /* an old object from a previous incarnation is hogging the slot - we * need to wait for it to be destroyed */ @@ -155,13 +149,64 @@ wait_for_old_object: atomic_inc(&xobject->usage); write_unlock(&cache->active_lock); - _debug(">>> wait"); - wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE, - cachefiles_wait_bit, TASK_UNINTERRUPTIBLE); - _debug("<<< waited"); + if 
(test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { + wait_queue_head_t *wq; + + signed long timeout = 60 * HZ; + wait_queue_t wait; + bool requeue; + + /* if the object we're waiting for is queued for processing, + * then just put ourselves on the queue behind it */ + if (slow_work_is_queued(&xobject->fscache.work)) { + _debug("queue OBJ%x behind OBJ%x immediately", + object->fscache.debug_id, + xobject->fscache.debug_id); + goto requeue; + } + + /* otherwise we sleep until either the object we're waiting for + * is done, or the slow-work facility wants the thread back to + * do other work */ + wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); + init_wait(&wait); + requeue = false; + do { + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) + break; + requeue = slow_work_sleep_till_thread_needed( + &object->fscache.work, &timeout); + } while (timeout > 0 && !requeue); + finish_wait(wq, &wait); + + if (requeue && + test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { + _debug("queue OBJ%x behind OBJ%x after wait", + object->fscache.debug_id, + xobject->fscache.debug_id); + goto requeue; + } + + if (timeout <= 0) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error: Overlong" + " wait for old active object to go away\n"); + cachefiles_printk_object(object, xobject); + goto requeue; + } + } + + ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); cache->cache.ops->put_object(&xobject->fscache); goto try_again; + +requeue: + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + cache->cache.ops->put_object(&xobject->fscache); + _leave(" = -ETIMEDOUT"); + return -ETIMEDOUT; } /* @@ -466,12 +511,15 @@ lookup_again: } /* note that we're now using this object */ - cachefiles_mark_object_active(cache, object); + ret = cachefiles_mark_object_active(cache, object); mutex_unlock(&dir->d_inode->i_mutex); dput(dir); dir = NULL; + if (ret == -ETIMEDOUT) + goto mark_active_timed_out; + _debug("=== OBTAINED_OBJECT ==="); if (object->new) { @@ -515,6 +563,10 @@ create_error: cachefiles_io_error(cache, "Create/mkdir failed"); goto error; +mark_active_timed_out: + _debug("mark active timed out"); + goto release_dentry; + check_error: _debug("check error %d", ret); write_lock(&cache->active_lock); @@ -522,7 +574,7 @@ check_error: clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); write_unlock(&cache->active_lock); - +release_dentry: dput(object->dentry); object->dentry = NULL; goto error_out; @@ -543,9 +595,6 @@ error: error_out2: dput(dir); error_out: - if (ret == -ENOSPC) - ret = -ENOBUFS; - _leave(" = error %d", -ret); return ret; } diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 5b49a373689..0ca2566e038 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -215,6 +215,7 @@ extern atomic_t fscache_n_object_no_alloc; extern atomic_t fscache_n_object_lookups; extern atomic_t fscache_n_object_lookups_negative; extern atomic_t fscache_n_object_lookups_positive; +extern atomic_t fscache_n_object_lookups_timed_out; extern atomic_t fscache_n_object_created; extern atomic_t fscache_n_object_avail; extern atomic_t fscache_n_object_dead; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index f3f952cf887..e513ac599c8 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -468,6 +468,7 @@ static void fscache_lookup_object(struct fscache_object *object) { struct fscache_cookie *cookie = object->cookie; struct fscache_object 
*parent; + int ret; _enter(""); @@ -493,12 +494,19 @@ static void fscache_lookup_object(struct fscache_object *object) fscache_stat(&fscache_n_object_lookups); fscache_stat(&fscache_n_cop_lookup_object); - object->cache->ops->lookup_object(object); + ret = object->cache->ops->lookup_object(object); fscache_stat_d(&fscache_n_cop_lookup_object); if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + if (ret == -ETIMEDOUT) { + /* probably stuck behind another object, so move this one to + * the back of the queue */ + fscache_stat(&fscache_n_object_lookups_timed_out); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } + _leave(""); } diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 05f77caf4a2..46435f3aae6 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -98,6 +98,7 @@ atomic_t fscache_n_object_no_alloc; atomic_t fscache_n_object_lookups; atomic_t fscache_n_object_lookups_negative; atomic_t fscache_n_object_lookups_positive; +atomic_t fscache_n_object_lookups_timed_out; atomic_t fscache_n_object_created; atomic_t fscache_n_object_avail; atomic_t fscache_n_object_dead; @@ -160,10 +161,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_acquires_nobufs), atomic_read(&fscache_n_acquires_oom)); - seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n", + seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n", atomic_read(&fscache_n_object_lookups), atomic_read(&fscache_n_object_lookups_negative), atomic_read(&fscache_n_object_lookups_positive), + atomic_read(&fscache_n_object_lookups_timed_out), atomic_read(&fscache_n_object_created)); seq_printf(m, "Updates: n=%u nul=%u run=%u\n", -- cgit v1.2.3 From 14e69647c868459bcb910f771851ca7c699efd21 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:12:08 +0000 Subject: CacheFiles: Don't log lookup/create failing with ENOBUFS Don't log the CacheFiles lookup/create object routines failing with ENOBUFS as under high memory load or high cache load they can do this quite a lot. This error simply means that the requested object cannot be created on disk due to lack of space, or due to failure of the backing filesystem to find sufficient resources. Signed-off-by: David Howells --- fs/cachefiles/interface.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 9d3c426044a..27089311fbe 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -147,8 +147,9 @@ static int cachefiles_lookup_object(struct fscache_object *_object) cachefiles_attr_changed(&object->fscache); if (ret < 0 && ret != -ETIMEDOUT) { - printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n", - ret); + if (ret != -ENOBUFS) + printk(KERN_WARNING + "CacheFiles: Lookup failed error %d\n", ret); fscache_object_lookup_error(&object->fscache); } -- cgit v1.2.3 From 0109d7e614e016a842569628116f54847a177f6e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Nov 2009 21:50:36 +0000 Subject: SLOW_WORK: Fix CIFS to pass THIS_MODULE to slow_work_register_user() As of the patch: SLOW_WORK: Wait for outstanding work items belonging to a module to clear Wait for outstanding slow work items belonging to a module to clear when unregistering that module as a user of the facility. This prevents the put_ref code of a work item from being taken away before it returns. slow_work_register_user() takes a module pointer as an argument.
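
To illustrate the calling convention this change relies on, here is a minimal sketch of a hypothetical slow-work user, assuming the 2.6.32-era API in which both registration and unregistration take the owning module (this is not the CIFS code itself):

#include <linux/module.h>
#include <linux/slow-work.h>

static int __init example_init(void)
{
        /* pin this module as a slow-work user so its put_ref callbacks
         * cannot vanish while work items are still outstanding */
        return slow_work_register_user(THIS_MODULE);
}

static void __exit example_exit(void)
{
        /* waits for this module's outstanding slow work items to clear */
        slow_work_unregister_user(THIS_MODULE);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
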
CIFS must now pass THIS_MODULE as that argument, lest the following error be observed: fs/cifs/cifsfs.c: In function 'init_cifs': fs/cifs/cifsfs.c:1040: error: too few arguments to function 'slow_work_register_user' Signed-off-by: David Howells --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 9a5e4f5f312..29f1da761bb 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1037,7 +1037,7 @@ init_cifs(void) if (rc) goto out_unregister_key_type; #endif - rc = slow_work_register_user(); + rc = slow_work_register_user(THIS_MODULE); if (rc) goto out_unregister_resolver_key; -- cgit v1.2.3 From 1c2ea8a2c0b71cc5e07f518370d458d692c9b21a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Nov 2009 21:50:40 +0000 Subject: SLOW_WORK: Fix GFS2 to #include <linux/module.h> before using THIS_MODULE GFS2 has been altered to pass THIS_MODULE to slow_work_register_user(), but hasn't been altered to #include <linux/module.h> to provide it, resulting in the following error: fs/gfs2/recovery.c:596: error: 'THIS_MODULE' undeclared here (not in a function) Add the missing #include. Signed-off-by: David Howells --- fs/gfs2/recovery.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index b2bb779f09e..09fa3196557 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -7,6 +7,7 @@ * of the GNU General Public License version 2. */ +#include #include #include #include -- cgit v1.2.3 From 4fa9f4ede88b4e2ff135b6e5717499d734508c62 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Nov 2009 21:50:44 +0000 Subject: FS-Cache: Provide nop fscache_stat_d() if CONFIG_FSCACHE_STATS=n Provide nop fscache_stat_d() macro if CONFIG_FSCACHE_STATS=n lest errors like the following occur: fs/fscache/cache.c: In function 'fscache_withdraw_cache': fs/fscache/cache.c:386: error: implicit declaration of function 'fscache_stat_d' fs/fscache/cache.c:386: error: 'fscache_n_cop_sync_cache' undeclared (first use in this function) fs/fscache/cache.c:386: error: (Each undeclared identifier is reported only once fs/fscache/cache.c:386: error: for each function it appears in.) fs/fscache/cache.c:392: error: 'fscache_n_cop_dissociate_pages' undeclared (first use in this function) Signed-off-by: David Howells --- fs/fscache/internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 0ca2566e038..edd7434ab6e 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -259,6 +259,7 @@ extern const struct file_operations fscache_stats_fops; #define __fscache_stat(stat) (NULL) #define fscache_stat(stat) do {} while (0) +#define fscache_stat_d(stat) do {} while (0) #endif /* -- cgit v1.2.3 From 8e6c0332d5032aef2d3bc8f41771f999112c8c66 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 24 Nov 2009 22:17:59 +0000 Subject: [CIFS] fix oops in cifs_lookup during net boot Fixes bugzilla.kernel.org bug number 14641 Lookup called during network boot (network root filesystem for diskless workstation) has case where nd is null in lookup. This patch fixes that in cifs_lookup.
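
The defensive pattern is easy to sketch. Assuming only that ->lookup() may be invoked with a NULL nameidata during a network/root-filesystem boot, a hypothetical helper (not the actual cifs_lookup() code) would guard every nd dereference first:

#include <linux/fs.h>
#include <linux/namei.h>

/* hypothetical helper: report whether an exclusive-create intent is
 * present; nd can legitimately be NULL during a network/root-fs boot */
static bool example_lookup_is_excl(struct nameidata *nd)
{
        if (!nd)        /* no lookup intent available, take the safe path */
                return false;
        return (nd->flags & LOOKUP_EXCL) != 0;
}
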
(Shirish noted that 2.6.30 and 2.6.31 stable need the same check) Signed-off-by: Shirish Pargaonkar Acked-by: Jeff Layton Tested-by: Vladimir Stavrinov CC: Stable Signed-off-by: Steve French --- fs/cifs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 627a60a6c1b..32771f581b6 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -643,7 +643,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, * O_EXCL: optimize away the lookup, but don't hash the dentry. Let * the VFS handle the create. */ - if (nd->flags & LOOKUP_EXCL) { + if (nd && (nd->flags & LOOKUP_EXCL)) { d_instantiate(direntry, NULL); return 0; } @@ -675,7 +675,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, * reduction in network traffic in the other paths. */ if (pTcon->unix_ext) { - if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && + if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && (nd->intent.open.flags & O_CREAT)) { rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, -- cgit v1.2.3 From cea62343956c24452700c06cf028b72414c58a74 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 24 Nov 2009 22:49:37 +0000 Subject: [CIFS] Duplicate data on appending to some Samba servers SMB writes are sent with a starting offset and length. When the server supports the newer SMB trans2 posix open (rather than using the SMB NTCreateX) a file can be opened with SMB_O_APPEND flag, and for that case Samba server assumes that the offset sent in SMBWriteX is unneeded since the write should go to the end of the file - which can cause problems if the write was cached (since the beginning part of a page could be written twice by the client mm). Jeff suggested that masking the flag on posix open on the client is easiest for the time being. Note that recent Samba server also had an unrelated problem with SMB NTCreateX and append (see samba bugzilla bug number 6898) which should not affect current Linux clients (unless cifs Unix Extensions are disabled). The cifs client did not send the O_APPEND flag on posix open before 2.6.29 so the fix is unneeded on early kernels. In the future, for the non-cached case (O_DIRECT, and forcedirectio mounts) it would be possible and useful to send O_APPEND on posix open (for Windows case: FILE_APPEND_DATA but not FILE_WRITE_DATA on SMB NTCreateX) but for cached writes although the vfs sets the offset to end of file it may fragment a write across pages - so we can't send O_APPEND on open (could result in sending part of a page twice). 
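
The client-side workaround amounts to a flag-translation rule: never map O_APPEND onto the SMB posix-open flags, so the server keeps honouring the offset sent in each SMBWriteX. A minimal sketch with hypothetical names and wire values (the real change is the two-line deletion in cifs_posix_open() below):

#include <linux/types.h>
#include <linux/fcntl.h>

#define EX_SMB_O_CREAT  0x0010  /* hypothetical wire values */
#define EX_SMB_O_TRUNC  0x0020

/* translate VFS open flags into wire flags; O_APPEND is deliberately
 * left out so cached writes land at the offset the client sends */
static __u32 example_posix_open_flags(int oflags)
{
        __u32 posix_flags = 0;

        if (oflags & O_CREAT)
                posix_flags |= EX_SMB_O_CREAT;
        if (oflags & O_TRUNC)
                posix_flags |= EX_SMB_O_TRUNC;
        /* no O_APPEND mapping, see the explanation above */
        return posix_flags;
}
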
CC: Stable Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/dir.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 32771f581b6..d3a6b07e335 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -214,8 +214,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode, posix_flags |= SMB_O_EXCL; if (oflags & O_TRUNC) posix_flags |= SMB_O_TRUNC; - if (oflags & O_APPEND) - posix_flags |= SMB_O_APPEND; if (oflags & O_SYNC) posix_flags |= SMB_O_SYNC; if (oflags & O_DIRECTORY) -- cgit v1.2.3 From 2f81e752da4781fc276689fc14391346d0dbbe78 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 25 Nov 2009 00:11:31 +0000 Subject: [CIFS] Fix sparse warning Also update CHANGES file Signed-off-by: Steve French --- fs/cifs/CHANGES | 9 +++++++++ fs/cifs/dir.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 145540a316a..094ea65afc8 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,12 @@ +Version 1.61 +------------ +Fix append problem to Samba servers (files opened with O_APPEND could +have duplicated data). Fix oops in cifs_lookup. Workaround problem +mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. +Disable use of server inode numbers when server only +partially supports them (e.g. for one server querying inode numbers on +FindFirst fails but QPathInfo queries works). + Version 1.60 ------------- Fix memory leak in reconnect. Fix oops in DFS mount error path. diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index d3a6b07e335..1f42f772865 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -643,7 +643,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, */ if (nd && (nd->flags & LOOKUP_EXCL)) { d_instantiate(direntry, NULL); - return 0; + return NULL; } /* can not grab the rename sem here since it would -- cgit v1.2.3 From 1b7323965a8c6eee9dc4e345a7ae4bff1dc93149 Mon Sep 17 00:00:00 2001 From: Csaba Henk Date: Fri, 27 Nov 2009 19:30:14 +0530 Subject: fuse: reject O_DIRECT flag also in fuse_create The comment in fuse_open about O_DIRECT: "VFS checks this, but only _after_ ->open()" also holds for fuse_create, however, the same kind of check was missing there. As an impact of this bug, open(newfile, O_RDWR|O_CREAT|O_DIRECT) fails, but a stub newfile will remain if the fuse server handled the implied FUSE_CREATE request appropriately. Other impact: in the above situation ima_file_free() will complain to open/free imbalance if CONFIG_IMA is set. 
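
The shape of the fix is straightforward: the create path refuses O_DIRECT up front, mirroring the check fuse_open already performs, because the VFS only validates O_DIRECT after ->create()/->open() has run. A minimal sketch with a hypothetical handler (not the actual fuse_create_open() code):

#include <linux/fs.h>
#include <linux/errno.h>

/* hypothetical create handler: refuse O_DIRECT before any request is
 * sent, so no stub file is left behind on the server side */
static int example_create(struct inode *dir, struct dentry *entry,
                          int mode, int flags)
{
        if (flags & O_DIRECT)
                return -EINVAL; /* same answer the VFS would give later */

        /* ... build and send the real create request here ... */
        return 0;
}
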
Signed-off-by: Csaba Henk Signed-off-by: Miklos Szeredi Cc: Harshavardhana Cc: stable@kernel.org --- fs/fuse/dir.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8ada78aade5..4787ae6c5c1 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -385,6 +385,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (fc->no_create) return -ENOSYS; + if (flags & O_DIRECT) + return -EINVAL; + forget_req = fuse_get_req(fc); if (IS_ERR(forget_req)) return PTR_ERR(forget_req); -- cgit v1.2.3 From 199bc9ff5ca5e4b3bcaff8927b2983c65f34c263 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 30 Nov 2009 09:06:40 +0000 Subject: jffs2: Fix memory corruption in jffs2_read_inode_range() In 2.6.23 kernel, commit a32ea1e1f925399e0d81ca3f7394a44a6dafa12c ("Fix read/truncate race") fixed a race in the generic code, and as a side effect, now do_generic_file_read() can ask us to readpage() past the i_size. This seems to be correctly handled by the block routines (e.g. block_read_full_page() fills the page with zeroes in case if somebody is trying to read past the last inode's block). JFFS2 doesn't handle this; it assumes that it won't be asked to read pages which don't exist -- and thus that there will be at least _one_ valid 'frag' on the page it's being asked to read. It will fill any holes with the following memset: memset(buf, 0, min(end, frag->ofs + frag->size) - offset); When the 'closest smaller match' returned by jffs2_lookup_node_frag() is actually on a previous page and ends before 'offset', that results in: memset(buf, 0, ); Hopefully, in most cases the corruption is fatal, and quickly causing random oopses, like this: root@10.0.0.4:~/ltp-fs-20090531# ./testcases/kernel/fs/ftest/ftest01 Unable to handle kernel paging request for data at address 0x00000008 Faulting instruction address: 0xc01cd980 Oops: Kernel access of bad area, sig: 11 [#1] [...] NIP [c01cd980] rb_insert_color+0x38/0x184 LR [c0043978] enqueue_hrtimer+0x88/0xc4 Call Trace: [c6c63b60] [c004f9a8] tick_sched_timer+0xa0/0xe4 (unreliable) [c6c63b80] [c0043978] enqueue_hrtimer+0x88/0xc4 [c6c63b90] [c0043a48] __run_hrtimer+0x94/0xbc [c6c63bb0] [c0044628] hrtimer_interrupt+0x140/0x2b8 [c6c63c10] [c000f8e8] timer_interrupt+0x13c/0x254 [c6c63c30] [c001352c] ret_from_except+0x0/0x14 --- Exception: 901 at memset+0x38/0x5c LR = jffs2_read_inode_range+0x144/0x17c [c6c63cf0] [00000000] (null) (unreliable) This patch fixes the issue, plus fixes all LTP tests on NAND/UBI with JFFS2 filesystem that were failing since 2.6.23 (seems like the bug above also broke the truncation). Reported-By: Anton Vorontsov Tested-By: Anton Vorontsov Signed-off-by: David Woodhouse Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/jffs2/read.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c index cfe05c1966a..3f39be1b045 100644 --- a/fs/jffs2/read.c +++ b/fs/jffs2/read.c @@ -164,12 +164,15 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, /* XXX FIXME: Where a single physical node actually shows up in two frags, we read it twice. Don't do that. */ - /* Now we're pointing at the first frag which overlaps our page */ + /* Now we're pointing at the first frag which overlaps our page + * (or perhaps is before it, if we've been asked to read off the + * end of the file). 
*/ while(offset < end) { D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end)); - if (unlikely(!frag || frag->ofs > offset)) { + if (unlikely(!frag || frag->ofs > offset || + frag->ofs + frag->size <= offset)) { uint32_t holesize = end - offset; - if (frag) { + if (frag && frag->ofs > offset) { D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset)); holesize = min(holesize, frag->ofs - offset); } -- cgit v1.2.3 From 6f054164322bc6c1233402b9ed6b40d4af39a98f Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 1 Dec 2009 13:38:45 +0000 Subject: 9p: fix build breakage introduced by FS-Cache While building 2.6.32-rc8-git2 for Fedora I noticed the following thinko in commit 201a15428bd54f83eccec8b7c64a04b8f9431204 ("FS-Cache: Handle pages pending storage that get evicted under OOM conditions"): fs/9p/cache.c: In function '__v9fs_fscache_release_page': fs/9p/cache.c:346: error: 'vnode' undeclared (first use in this function) fs/9p/cache.c:346: error: (Each undeclared identifier is reported only once fs/9p/cache.c:346: error: for each function it appears in.) make[2]: *** [fs/9p/cache.o] Error 1 Fix the 9P filesystem to correctly construct the argument to fscache_maybe_release_page(). Signed-off-by: Kyle McMartin Signed-off-by: Xiaotian Feng [from identical patch] Signed-off-by: Stefan Lippers-Hollmann [from identical patch] Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/9p/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/cache.c b/fs/9p/cache.c index bcc5357a906..e777961939f 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -343,7 +343,7 @@ int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) BUG_ON(!vcookie->fscache); - return fscache_maybe_release_page(vnode->cache, page, gfp); + return fscache_maybe_release_page(vcookie->fscache, page, gfp); } void __v9fs_fscache_invalidate_page(struct page *page) -- cgit v1.2.3 From 3350b2acdd39d23db52710045536b943fe38a35c Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Tue, 1 Dec 2009 14:09:24 +0000 Subject: CacheFiles: Update IMA counters when using dentry_open When IMA is active, using dentry_open without updating the IMA counters will result in free/open imbalance errors when fput is eventually called. Signed-off-by: Marc Dionne Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/cachefiles/rdwr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 1d833256386..a6c8c6fe8df 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -11,6 +11,7 @@ #include #include +#include #include "internal.h" /* @@ -922,6 +923,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) if (IS_ERR(file)) { ret = PTR_ERR(file); } else { + ima_counts_get(file); ret = -EIO; if (file->f_op->write) { pos = (loff_t) page->index << PAGE_SHIFT; -- cgit v1.2.3
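
To summarise the pairing this last patch establishes, here is a minimal sketch of a hypothetical helper, assuming the 2.6.32-era dentry_open() and ima_counts_get() interfaces named in the hunk above (error handling trimmed): any file obtained through dentry_open() has its IMA counters bumped before I/O, so the accounting balances when fput() later triggers ima_file_free().

#include <linux/fs.h>
#include <linux/err.h>
#include <linux/ima.h>
#include <linux/dcache.h>
#include <linux/mount.h>
#include <linux/cred.h>

/* hypothetical helper: open a backing dentry for writing and keep the
 * IMA open/free accounting balanced */
static struct file *example_open_backer(struct dentry *dentry,
                                        struct vfsmount *mnt,
                                        const struct cred *cred)
{
        struct file *file;

        /* dentry_open() consumes the dentry and vfsmount references */
        file = dentry_open(dget(dentry), mntget(mnt), O_RDWR, cred);
        if (!IS_ERR(file))
                ima_counts_get(file);   /* balanced by ima_file_free() at fput() */
        return file;
}
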