From ff392c497b43ddedbab5627b53928a654cc5486e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:36 -0500 Subject: xfs: prevent kernel crash due to corrupted inode log format Andras Korn reported an oops on log replay causes by a corrupted xfs_inode_log_format_t passing a 0 size to kmem_zalloc. This patch handles to small or too large numbers of log regions gracefully by rejecting the log replay with a useful error message. Signed-off-by: Christoph Hellwig Reported-by: Andras Korn Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/xfs_log_recover.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b1047de2fff..61af610d79b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1455,10 +1455,19 @@ xlog_recover_add_to_trans( item = item->ri_prev; if (item->ri_total == 0) { /* first region to be added */ - item->ri_total = in_f->ilf_size; - ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); - item->ri_buf = kmem_zalloc((item->ri_total * - sizeof(xfs_log_iovec_t)), KM_SLEEP); + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xlog_warn( + "XFS: bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + return XFS_ERROR(EIO); + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); } ASSERT(item->ri_total > item->ri_cnt); /* Description region is ri_buf[0] */ -- cgit v1.2.3 From 7d46be4a25fdfb503c20bad60a618adebfe2ac5c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:35 -0500 Subject: xfs: prevent lockdep false positive in xfs_iget_cache_miss The inode can't be locked by anyone else as we just created it a few lines above and it's not been added to any lookup data structure yet. So use a trylock that must succeed to get around the lockdep warnings. Signed-off-by: Christoph Hellwig Reported-by: Alexander Beregalov Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_iget.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index e2fb6210d4c..478e587087f 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -246,9 +246,6 @@ xfs_iget_cache_miss( goto out_destroy; } - if (lock_flags) - xfs_ilock(ip, lock_flags); - /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload @@ -256,7 +253,16 @@ xfs_iget_cache_miss( */ if (radix_tree_preload(GFP_KERNEL)) { error = EAGAIN; - goto out_unlock; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); } mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); @@ -284,7 +290,6 @@ xfs_iget_cache_miss( out_preload_end: write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); -out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); out_destroy: -- cgit v1.2.3 From c141b2928fe20396a9ecdec85526e4b66ae96c90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:37 -0500 Subject: xfs: only issues a cache flush on unmount if barriers are enabled Currently we unconditionally issue a flush from xfs_free_buftarg, but since 2.6.29-rc1 this gives a warning in the style of end_request: I/O error, dev vdb, sector 0 Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 12 ++++++++++-- fs/xfs/linux-2.6/xfs_buf.h | 2 +- fs/xfs/linux-2.6/xfs_super.c | 10 +++++----- 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index cb329edc925..aa1016bb913 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -34,6 +34,12 @@ #include #include +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_ag.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" + static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); @@ -1435,10 +1441,12 @@ xfs_unregister_buftarg( void xfs_free_buftarg( - xfs_buftarg_t *btp) + struct xfs_mount *mp, + struct xfs_buftarg *btp) { xfs_flush_buftarg(btp, 1); - xfs_blkdev_issue_flush(btp); + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_blkdev_issue_flush(btp); xfs_free_bufhash(btp); iput(btp->bt_mapping->host); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 288ae7c4c80..9b4d666ad31 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp) * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); -extern void xfs_free_buftarg(xfs_buftarg_t *); +extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index c71e226da7f..32ae5028e96 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -734,15 +734,15 @@ xfs_close_devices( { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - xfs_free_buftarg(mp->m_logdev_targp); + xfs_free_buftarg(mp, mp->m_logdev_targp); xfs_blkdev_put(logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - xfs_free_buftarg(mp->m_rtdev_targp); + xfs_free_buftarg(mp, mp->m_rtdev_targp); xfs_blkdev_put(rtdev); } - xfs_free_buftarg(mp->m_ddev_targp); + xfs_free_buftarg(mp, mp->m_ddev_targp); } /* @@ -811,9 +811,9 @@ xfs_open_devices( out_free_rtdev_targ: if (mp->m_rtdev_targp) - xfs_free_buftarg(mp->m_rtdev_targp); + xfs_free_buftarg(mp, mp->m_rtdev_targp); out_free_ddev_targ: - xfs_free_buftarg(mp->m_ddev_targp); + xfs_free_buftarg(mp, mp->m_ddev_targp); out_close_rtdev: if (rtdev) xfs_blkdev_put(rtdev); -- cgit v1.2.3 From 4184ea7f908d95f329febc3665cf66da8568b467 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 10 Mar 2009 12:39:20 -0400 Subject: Btrfs: Fix locking around adding new space_info Storage allocated to different raid levels in btrfs is tracked by a btrfs_space_info structure, and all of the current space_infos are collected into a list_head. Most filesystems have 3 or 4 of these structs total, and the list is only changed when new raid levels are added or at unmount time. This commit adds rcu locking on the list head, and properly frees things at unmount time. It also clears the space_info->full flag whenever new space is added to the FS. The locking for the space info list goes like this: reads: protected by rcu_read_lock() writes: protected by the chunk_mutex At unmount time we don't need special locking because all the readers are gone. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 +++++++++ fs/btrfs/extent-tree.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/volumes.c | 2 ++ 3 files changed, 53 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 82491ba8fa4..5e1d4e30e9d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -784,7 +784,14 @@ struct btrfs_fs_info { struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; + + /* + * the space_info list is almost entirely read only. It only changes + * when we add a new raid type to the FS, and that happens + * very rarely. RCU is used to protect it. + */ struct list_head space_info; + spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; @@ -1797,6 +1804,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + int btrfs_check_metadata_free_space(struct btrfs_root *root); int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9abf81f71c4..fefe83ad205 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "compat.h" #include "hash.h" #include "crc32c.h" @@ -330,13 +331,33 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, { struct list_head *head = &info->space_info; struct btrfs_space_info *found; - list_for_each_entry(found, head, list) { - if (found->flags == flags) + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags == flags) { + rcu_read_unlock(); return found; + } } + rcu_read_unlock(); return NULL; } +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. + */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -1903,7 +1924,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - list_add(&found->list, &info->space_info); INIT_LIST_HEAD(&found->block_groups); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); @@ -1917,6 +1937,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->full = 0; found->force_alloc = 0; *space_info = found; + list_add_rcu(&found->list, &info->space_info); return 0; } @@ -6320,6 +6341,7 @@ out: int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; struct rb_node *n; spin_lock(&info->block_group_cache_lock); @@ -6341,6 +6363,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) spin_lock(&info->block_group_cache_lock); } spin_unlock(&info->block_group_cache_lock); + + /* now that all the block groups are freed, go through and + * free all the space_info structs. This is only called during + * the final stages of unmount, and so we know nobody is + * using them. We call synchronize_rcu() once before we start, + * just to be on the safe side. + */ + synchronize_rcu(); + + while(!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + + list_del(&space_info->list); + kfree(space_info); + } return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1316139bf9e..7aa3810d7f6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1459,6 +1459,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, device->fs_devices->total_rw_bytes += diff; device->total_bytes = new_size; + btrfs_clear_space_info_full(device->dev_root->fs_info); + return btrfs_update_device(trans, device); } -- cgit v1.2.3 From 913d952eb573c3d1f7487e83b5590e13e7cae2bd Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 10 Mar 2009 13:17:18 -0400 Subject: Btrfs: Clear space_info full when adding new devices The full flag on the space info structs tells the allocator not to try and allocate more chunks because the devices in the FS are fully allocated. When more devices are added, we need to clear the full flag so the allocator knows it has more space available. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7aa3810d7f6..dd06e18e5aa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1374,6 +1374,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_add_device(trans, root, device); } + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + unlock_chunks(root); btrfs_commit_transaction(trans, root); -- cgit v1.2.3 From 395a87bfefbc400011417e9eaae33169f9f036c0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 10 Mar 2009 18:18:47 -0400 Subject: ext4: fix header check in ext4_ext_search_right() for deep extent trees. The ext4_ext_search_right() function is confusing; it uses a "depth" variable which is 0 at the root and maximum at the leaves, but the on-disk metadata uses a "depth" (actually eh_depth) which is opposite: maximum at the root, and 0 at the leaves. The ext4_ext_check_header() function is given a depth and checks the header agaisnt that depth; it expects the on-disk semantics, but we are giving it the opposite in the while loop in this function. We should be giving it the on-disk notion of "depth" which we can get from (p_depth - depth) - and if you look, the last (more commonly hit) call to ext4_ext_check_header() does just this. Sending in the wrong depth results in (incorrect) messages about corruption: EXT4-fs error (device sdb1): ext4_ext_search_right: bad header in inode #2621457: unexpected eh_depth - magic f30a, entries 340, max 340(0), depth 1(2) http://bugzilla.kernel.org/show_bug.cgi?id=12821 Reported-by: David Dindorp Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e2eab196875..e0aa4fe4f59 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1122,7 +1122,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent_idx *ix; struct ext4_extent *ex; ext4_fsblk_t block; - int depth, ee_len; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; BUG_ON(path == NULL); depth = path->p_depth; @@ -1179,7 +1180,8 @@ got_index: if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, depth)) { + /* subtract from p_depth to get proper eh_depth */ + if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } -- cgit v1.2.3 From ef95d31e6de6be9602ce950b85fb7ab8af46ae42 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Mar 2009 20:33:17 -0400 Subject: NFS: Fix misparsing of nfsv4 fs_locations attribute (take 2) The changeset ea31a4437c59219bf3ea946d58984b01a45a289c (nfs: Fix misparsing of nfsv4 fs_locations attribute) causes the mountpath that is calculated at the beginning of try_location() to be clobbered when we later strncpy a non-nul terminated hostname using an incorrect buffer length. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 30befc39b3c..2a2a0a7143a 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -21,7 +21,9 @@ #define NFSDBG_FACILITY NFSDBG_VFS /* - * Check if fs_root is valid + * Convert the NFSv4 pathname components into a standard posix path. + * + * Note that the resulting string will be placed at the end of the buffer */ static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, char *buffer, ssize_t buflen) @@ -99,21 +101,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, { struct vfsmount *mnt = ERR_PTR(-ENOENT); char *mnt_path; - int page2len; + unsigned int maxbuflen; unsigned int s; mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) return mnt; mountdata->mnt_path = mnt_path; - page2 += strlen(mnt_path) + 1; - page2len = PAGE_SIZE - strlen(mnt_path) - 1; + maxbuflen = mnt_path - 1 - page2; for (s = 0; s < location->nservers; s++) { const struct nfs4_string *buf = &location->servers[s]; struct sockaddr_storage addr; - if (buf->len <= 0 || buf->len >= PAGE_SIZE) + if (buf->len <= 0 || buf->len >= maxbuflen) continue; mountdata->addr = (struct sockaddr *)&addr; @@ -126,8 +127,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, continue; nfs_set_port(mountdata->addr, NFS_PORT); - strncpy(page2, buf->data, page2len); - page2[page2len] = '\0'; + memcpy(page2, buf->data, buf->len); + page2[buf->len] = '\0'; mountdata->hostname = page2; snprintf(page, PAGE_SIZE, "%s:%s", -- cgit v1.2.3 From ae46141ff08f1965b17c531b571953c39ce8b9e2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Mar 2009 20:33:18 -0400 Subject: NFSv3: Fix posix ACL code Fix a memory leak due to allocation in the XDR layer. In cases where the RPC call needs to be retransmitted, we end up allocating new pages without clearing the old ones. Fix this by moving the allocation into nfs3_proc_setacls(). Also fix an issue discovered by Kevin Rudd, whereby the amount of memory reserved for the acls in the xdr_buf->head was miscalculated, and causing corruption. Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 27 +++++++++++++++++++++------ fs/nfs/nfs3xdr.c | 34 +++++++++++++--------------------- 2 files changed, 34 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index cef62557c87..6bbf0e6daad 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, { struct nfs_server *server = NFS_SERVER(inode); struct nfs_fattr fattr; - struct page *pages[NFSACL_MAXPAGES] = { }; + struct page *pages[NFSACL_MAXPAGES]; struct nfs3_setaclargs args = { .inode = inode, .mask = NFS_ACL, @@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, .rpc_argp = &args, .rpc_resp = &fattr, }; - int status, count; + int status; status = -EOPNOTSUPP; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, if (S_ISDIR(inode->i_mode)) { args.mask |= NFS_DFACL; args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); } dprintk("NFS call setacl\n"); @@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, nfs_zap_acl_cache(inode); dprintk("NFS reply setacl: %d\n", status); - /* pages may have been allocated at the xdr layer. */ - for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) - __free_page(args.pages[count]); - switch (status) { case 0: status = nfs_refresh_inode(inode, &fattr); @@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, case -ENOTSUPP: status = -EOPNOTSUPP; } +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } out: return status; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 11cdddec143..6cdeacffde4 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -82,8 +82,10 @@ #define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) #define ACL3_getaclargs_sz (NFS3_fh_sz+1) -#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) -#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) /* @@ -703,28 +705,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, struct nfs3_setaclargs *args) { struct xdr_buf *buf = &req->rq_snd_buf; - unsigned int base, len_in_head, len = nfsacl_size( - (args->mask & NFS_ACL) ? args->acl_access : NULL, - (args->mask & NFS_DFACL) ? args->acl_default : NULL); - int count, err; + unsigned int base; + int err; p = xdr_encode_fhandle(p, NFS_FH(args->inode)); *p++ = htonl(args->mask); - base = (char *)p - (char *)buf->head->iov_base; - /* put as much of the acls into head as possible. */ - len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); - len -= len_in_head; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); - - for (count = 0; (count << PAGE_SHIFT) < len; count++) { - args->pages[count] = alloc_page(GFP_KERNEL); - if (!args->pages[count]) { - while (count) - __free_page(args->pages[--count]); - return -ENOMEM; - } - } - xdr_encode_pages(buf, args->pages, 0, len); + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + base = req->rq_slen; + + if (args->npages != 0) + xdr_encode_pages(buf, args->pages, 0, args->len); + else + req->rq_slen += args->len; err = nfsacl_encode(buf, base, args->inode, (args->mask & NFS_ACL) ? -- cgit v1.2.3 From 57df675c60c5cf0748ddba9c7f85afde1530d74d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 Mar 2009 20:33:20 -0400 Subject: NLM: Fix GRANT callback address comparison when IPv6 is enabled The NFS mount command may pass an AF_INET server address to lockd. If lockd happens to be using a PF_INET6 listener, the nlm_cmp_addr() in nlmclnt_grant() will fail to match requests from that host because they will all have a mapped IPv4 AF_INET6 address. Adopt the same solution used in nfs_sockaddr_match_ipaddr() for NFSv4 callbacks: if either address is AF_INET, map it to an AF_INET6 address before doing the comparison. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/clntlock.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 1f3b0fc0d35..aedc47a264c 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -139,6 +139,55 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) return 0; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap, + struct in6_addr *addr_mapped) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; + + switch (sap->sa_family) { + case AF_INET6: + return &((const struct sockaddr_in6 *)sap)->sin6_addr; + case AF_INET: + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped); + return addr_mapped; + } + + return NULL; +} + +/* + * If lockd is using a PF_INET6 listener, all incoming requests appear + * to come from AF_INET6 remotes. The address of AF_INET remotes are + * mapped to AF_INET6 automatically by the network layer. In case the + * user passed an AF_INET server address at mount time, ensure both + * addresses are AF_INET6 before comparing them. + */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + const struct in6_addr *addr1; + const struct in6_addr *addr2; + struct in6_addr addr1_mapped; + struct in6_addr addr2_mapped; + + addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped); + if (likely(addr1 != NULL)) { + addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped); + if (likely(addr2 != NULL)) + return ipv6_addr_equal(addr1, addr2); + } + + return 0; +} +#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + return nlm_cmp_addr(nlm_addr(host), sap); +} +#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ + /* * The server lockd has called us back to tell us the lock was granted */ @@ -166,7 +215,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) + if (!nlmclnt_cmp_addr(block->b_host, addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; -- cgit v1.2.3 From a71ee337b31271e701f689d544b6153b75609bc5 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Tue, 10 Mar 2009 20:33:21 -0400 Subject: NFS: Handle -ESTALE error in access() Hi Trond, I have been looking at a bugreport where trying to open applications on KDE on a NFS mounted home fails temporarily. There have been multiple reports on different kernel versions pointing to this common issue: http://bugzilla.kernel.org/show_bug.cgi?id=12557 https://bugs.launchpad.net/ubuntu/+source/linux/+bug/269954 http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=508866.html This issue can be reproducible consistently by doing this on a NFS mounted home (KDE): 1. Open 2 xterm sessions 2. From one of the xterm session, do "ssh -X " 3. "stat ~/.Xauthority" on the remote SSH session 4. Close the two xterm sessions 5. On the server do a "stat ~/.Xauthority" 6. Now on the client, try to open xterm This will fail. Even if the filehandle had become stale, the NFS client should invalidate the cache/inode and should repeat LOOKUP. Looking at the packet capture when the failure occurs shows that there were two subsequent ACCESS() calls with the same filehandle and both fails with -ESTALE error. I have tested the fix below. Now the client issue a LOOKUP after the ACCESS() call fails with -ESTALE. If all this makes sense to you, can you consider this for inclusion? Thanks, If the server returns an -ESTALE error due to stale filehandle in response to an ACCESS() call, we need to invalidate the cache and inode so that LOOKUP() can be retried. Without this change, the nfs client retries ACCESS() with the same filehandle, fails again and could lead to temporary failure of applications running on nfs mounted home. Signed-off-by: Suresh Jayaraman Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e35c8199f82..672368f865c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1892,8 +1892,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) cache.cred = cred; cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); - if (status != 0) + if (status != 0) { + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } return status; + } nfs_access_add_cache(inode, &cache); out: if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) -- cgit v1.2.3 From d7371c41b0cda782256b1df759df4e8d4724584c Mon Sep 17 00:00:00 2001 From: Ian Dall Date: Tue, 10 Mar 2009 20:33:22 -0400 Subject: Bug 11061, NFS mounts dropped Addresses: http://bugzilla.kernel.org/show_bug.cgi?id=11061 sockaddr structures can't be reliably compared using memcmp() because there are padding bytes in the structure which can't be guaranteed to be the same even when the sockaddr structures refer to the same socket. Instead compare all the relevant fields. In the case of IPv6 sin6_flowinfo is not compared because it only affects QoS and sin6_scope_id is only compared if the address is "link local" because "link local" addresses need only be unique to a specific link. Signed-off-by: Ian Dall Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 9b728f3565a..06654b831d1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -272,6 +272,65 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, } #endif +/* + * Test if two ip4 socket addresses refer to the same socket, by + * comparing relevant fields. The padding bytes specifically, are + * not compared. + * + * The caller should ensure both socket addresses are AF_INET. + */ +static int nfs_sockaddr_cmp_ip4(const struct sockaddr_in * saddr1, + const struct sockaddr_in * saddr2) +{ + if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) + return 0; + return saddr1->sin_port == saddr2->sin_port; +} + +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_cmp_ip6 (const struct sockaddr_in6 * saddr1, + const struct sockaddr_in6 * saddr2) +{ + if (!ipv6_addr_equal(&saddr1->sin6_addr, + &saddr1->sin6_addr)) + return 0; + if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + saddr1->sin6_scope_id != saddr2->sin6_scope_id) + return 0; + return saddr1->sin6_port == saddr2->sin6_port; +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields. + */ +static int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_cmp_ip4((const struct sockaddr_in *) sa1, + (const struct sockaddr_in *) sa2); + case AF_INET6: + return nfs_sockaddr_cmp_ip6((const struct sockaddr_in6 *) sa1, + (const struct sockaddr_in6 *) sa2); + } + return 0; +} + /* * Find a client by IP address and protocol version * - returns NULL if no such client @@ -344,8 +403,10 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp) static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; + const struct sockaddr *sap = data->addr; list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; @@ -358,7 +419,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat continue; /* Match the full socket address */ - if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0) + if (!nfs_sockaddr_cmp(sap, clap)) continue; atomic_inc(&clp->cl_count); -- cgit v1.2.3 From ad3bdefe877afb47480418fdb05ecd42842de65e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 11 Mar 2009 09:00:04 +0800 Subject: proc: fix kflags to uflags copying in /proc/kpageflags Fix kpf_copy_bit(src,dst) to be kpf_copy_bit(dst,src) to match the actual call patterns, e.g. kpf_copy_bit(kflags, KPF_LOCKED, PG_locked). This misplacement of src/dst only affected reporting of PG_writeback, PG_reclaim and PG_buddy. For others kflags==uflags so not affected. Signed-off-by: Wu Fengguang Reviewed-by: KOSAKI Motohiro Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/proc/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 2d1345112a4..e9983837d08 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -80,7 +80,7 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_RECLAIM 9 #define KPF_BUDDY 10 -#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos) +#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos) static ssize_t kpageflags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -- cgit v1.2.3 From 3a95ea1155c5d44aa58dde2f64f0ddafe27fd1fb Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 12 Mar 2009 02:03:23 +0900 Subject: Fix _fat_bmap() locking On swapon() path, it has already i_mutex. So, this uses i_alloc_sem instead of it. Signed-off-by: OGAWA Hirofumi Reported-by: Laurent GUERBY Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 6b74d09adbe..de0004fe6e0 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ - mutex_lock(&mapping->host->i_mutex); + down_read(&mapping->host->i_alloc_sem); blocknr = generic_block_bmap(mapping, block, fat_get_block); - mutex_unlock(&mapping->host->i_mutex); + up_read(&mapping->host->i_alloc_sem); return blocknr; } -- cgit v1.2.3 From 363911d027d1de1c6df79eb3f487f5476b9619f4 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 12 Mar 2009 03:23:48 +0000 Subject: Squashfs: Valid filesystems are flagged as bad by the corrupted fs patch The corrupted filesystem patch added a check against zlib trying to output too much data in the presence of data corruption. This check triggered if zlib_inflate asked to be called again (Z_OK) with avail_out == 0 and no more output buffers available. This check proves to be rather dumb, as it incorrectly catches the case where zlib has generated all the output, but there are still input bytes to be processed. This patch does a number of things. It removes the original check and replaces it with code to not move to the next output buffer if there are no more output buffers available, relying on zlib to error if it wants an extra output buffer in the case of data corruption. It also replaces the Z_NO_FLUSH flag with the more correct Z_SYNC_FLUSH flag, and makes the error messages more understandable to non-technical users. Signed-off-by: Phillip Lougher Reported-by: Stefan Lippers-Hollmann --- fs/squashfs/block.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 321728f48f2..2a796031034 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -184,15 +184,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, offset = 0; } - if (msblk->stream.avail_out == 0) { - if (page == pages) { - ERROR("zlib_inflate tried to " - "decompress too much data, " - "expected %d bytes. Zlib " - "data probably corrupt\n", - srclength); - goto release_mutex; - } + if (msblk->stream.avail_out == 0 && page < pages) { msblk->stream.next_out = buffer[page++]; msblk->stream.avail_out = PAGE_CACHE_SIZE; } @@ -209,25 +201,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, zlib_init = 1; } - zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH); + zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH); if (msblk->stream.avail_in == 0 && k < b) put_bh(bh[k++]); } while (zlib_err == Z_OK); if (zlib_err != Z_STREAM_END) { - ERROR("zlib_inflate returned unexpected result" - " 0x%x, srclength %d, avail_in %d," - " avail_out %d\n", zlib_err, srclength, - msblk->stream.avail_in, - msblk->stream.avail_out); + ERROR("zlib_inflate error, data probably corrupt\n"); goto release_mutex; } zlib_err = zlib_inflateEnd(&msblk->stream); if (zlib_err != Z_OK) { - ERROR("zlib_inflateEnd returned unexpected result 0x%x," - " srclength %d\n", zlib_err, srclength); + ERROR("zlib_inflate error, data probably corrupt\n"); goto release_mutex; } length = msblk->stream.total_out; -- cgit v1.2.3 From 2842c3b5449f31470b61db716f1926b594fb6156 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 12 Mar 2009 12:20:01 -0400 Subject: ext4: Print the find_group_flex() warning only once This is a short-term warning, and even printk_ratelimit() can result in too much noise in system logs. So only print it once as a warning. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 627f8c3337a..2d2b3585ee9 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -698,6 +698,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct inode *ret; ext4_group_t i; int free = 0; + static int once = 1; ext4_group_t flex_group; /* Cannot create files in a deleted directory */ @@ -719,7 +720,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { ret2 = find_group_other(sb, dir, &group); - if (ret2 == 0 && printk_ratelimit()) + if (ret2 == 0 && once) + once = 0; printk(KERN_NOTICE "ext4: find_group_flex " "failed, fallback succeeded dir %lu\n", dir->i_ino); -- cgit v1.2.3 From 9f4c899c0d90e1b51b6864834f3877b47c161a0e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 12 Mar 2009 14:51:32 -0400 Subject: NFS: Fix the fix to Bugzilla #11061, when IPv6 isn't defined... Stephen Rothwell reports: Today's linux-next build (powerpc ppc64_defconfig) failed like this: fs/built-in.o: In function `.nfs_get_client': client.c:(.text+0x115010): undefined reference to `.__ipv6_addr_type' Fix by moving the IPV6 specific parts of commit d7371c41b0cda782256b1df759df4e8d4724584c ("Bug 11061, NFS mounts dropped") into the '#ifdef IPV6..." section. Also fix up a couple of formatting issues. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 68 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 06654b831d1..574158ae239 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -255,6 +255,32 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, } return 0; } + +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2; + + if (!ipv6_addr_equal(&saddr1->sin6_addr, + &saddr1->sin6_addr)) + return 0; + if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + saddr1->sin6_scope_id != saddr2->sin6_scope_id) + return 0; + return saddr1->sin6_port == saddr2->sin6_port; +} #else static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, const struct sockaddr_in *sa2) @@ -270,6 +296,12 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, (const struct sockaddr_in *)sa2); } + +static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1, + const struct sockaddr * sa2) +{ + return 0; +} #endif /* @@ -279,37 +311,17 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, * * The caller should ensure both socket addresses are AF_INET. */ -static int nfs_sockaddr_cmp_ip4(const struct sockaddr_in * saddr1, - const struct sockaddr_in * saddr2) +static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, + const struct sockaddr *sa2) { + const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2; + if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) return 0; return saddr1->sin_port == saddr2->sin_port; } -/* - * Test if two ip6 socket addresses refer to the same socket by - * comparing relevant fields. The padding bytes specifically, are not - * compared. sin6_flowinfo is not compared because it only affects QoS - * and sin6_scope_id is only compared if the address is "link local" - * because "link local" addresses need only be unique to a specific - * link. Conversely, ordinary unicast addresses might have different - * sin6_scope_id. - * - * The caller should ensure both socket addresses are AF_INET6. - */ -static int nfs_sockaddr_cmp_ip6 (const struct sockaddr_in6 * saddr1, - const struct sockaddr_in6 * saddr2) -{ - if (!ipv6_addr_equal(&saddr1->sin6_addr, - &saddr1->sin6_addr)) - return 0; - if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - saddr1->sin6_scope_id != saddr2->sin6_scope_id) - return 0; - return saddr1->sin6_port == saddr2->sin6_port; -} - /* * Test if two socket addresses represent the same actual socket, * by comparing (only) relevant fields. @@ -322,11 +334,9 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, switch (sa1->sa_family) { case AF_INET: - return nfs_sockaddr_cmp_ip4((const struct sockaddr_in *) sa1, - (const struct sockaddr_in *) sa2); + return nfs_sockaddr_cmp_ip4(sa1, sa2); case AF_INET6: - return nfs_sockaddr_cmp_ip6((const struct sockaddr_in6 *) sa1, - (const struct sockaddr_in6 *) sa2); + return nfs_sockaddr_cmp_ip6(sa1, sa2); } return 0; } -- cgit v1.2.3 From e5bc49ba7439b9726006d031d440cba96819f0f8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 12 Mar 2009 14:31:28 -0700 Subject: pipe_rdwr_fasync: fix the error handling to prevent the leak/crash If the second fasync_helper() fails, pipe_rdwr_fasync() returns the error but leaves the file on ->fasync_readers. This was always wrong, but since 233e70f4228e78eb2f80dc6650f65d3ae3dbf17c "saner FASYNC handling on file close" we have the new problem. Because in this case setfl() doesn't set FASYNC bit, __fput() will not do ->fasync(0), and we leak fasync_struct with ->fa_file pointing to the freed file. Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Andi Kleen Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 3a48ba5179d..14f502b89cf 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -699,12 +699,12 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on) int retval; mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); - - if (retval >= 0) + if (retval >= 0) { retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); - + if (retval < 0) /* this can happen only if on == T */ + fasync_helper(-1, filp, 0, &pipe->fasync_readers); + } mutex_unlock(&inode->i_mutex); if (retval < 0) -- cgit v1.2.3 From a3cfbb53b1764a3d1f58ddc032737ab9edaa7d41 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 12 Mar 2009 14:31:29 -0700 Subject: vfs: add missing unlock in sget() In sget(), destroy_super(s) is called with s->s_umount held, which makes lockdep unhappy. Signed-off-by: Li Zefan Cc: Al Viro Acked-by: Peter Zijlstra Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 8349ed6b141..6ce501447ad 100644 --- a/fs/super.c +++ b/fs/super.c @@ -371,8 +371,10 @@ retry: continue; if (!grab_super(old)) goto retry; - if (s) + if (s) { + up_write(&s->s_umount); destroy_super(s); + } return old; } } @@ -387,6 +389,7 @@ retry: err = set(s, data); if (err) { spin_unlock(&sb_lock); + up_write(&s->s_umount); destroy_super(s); return ERR_PTR(err); } -- cgit v1.2.3 From 7ef0d7377cb287e08f3ae94cebc919448e1f5dff Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 12 Mar 2009 14:31:38 -0700 Subject: fs: new inode i_state corruption fix There was a report of a data corruption http://lkml.org/lkml/2008/11/14/121. There is a script included to reproduce the problem. During testing, I encountered a number of strange things with ext3, so I tried ext2 to attempt to reduce complexity of the problem. I found that fsstress would quickly hang in wait_on_inode, waiting for I_LOCK to be cleared, even though instrumentation showed that unlock_new_inode had already been called for that inode. This points to memory scribble, or synchronisation problme. i_state of I_NEW inodes is not protected by inode_lock because other processes are not supposed to touch them until I_LOCK (and I_NEW) is cleared. Adding WARN_ON(inode->i_state & I_NEW) to sites where we modify i_state revealed that generic_sync_sb_inodes is picking up new inodes from the inode lists and passing them to __writeback_single_inode without waiting for I_NEW. Subsequently modifying i_state causes corruption. In my case it would look like this: CPU0 CPU1 unlock_new_inode() __sync_single_inode() reg <- inode->i_state reg -> reg & ~(I_LOCK|I_NEW) reg <- inode->i_state reg -> inode->i_state reg -> reg | I_SYNC reg -> inode->i_state Non-atomic RMW on CPU1 overwrites CPU0 store and sets I_LOCK|I_NEW again. Fix for this is rather than wait for I_NEW inodes, just skip over them: inodes concurrently being created are not subject to data integrity operations, and should not significantly contribute to dirty memory either. After this change, I'm unable to reproduce any of the added warnings or hangs after ~1hour of running. Previously, the new warnings would start immediately and hang would happen in under 5 minutes. I'm also testing on ext3 now, and so far no problems there either. I don't know whether this fixes the problem reported above, but it fixes a real problem for me. Cc: "Jorge Boncompte [DTI2]" Reported-by: Adrian Hunter Cc: Jan Kara Cc: Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 9 ++++++++- fs/inode.c | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e5eaa62fd17..e3fe9918faa 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -274,6 +274,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) int ret; BUG_ON(inode->i_state & I_SYNC); + WARN_ON(inode->i_state & I_NEW); /* Set I_SYNC, reset I_DIRTY */ dirty = inode->i_state & I_DIRTY; @@ -298,6 +299,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); + WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { if (!(inode->i_state & I_DIRTY) && @@ -470,6 +472,11 @@ void generic_sync_sb_inodes(struct super_block *sb, break; } + if (inode->i_state & I_NEW) { + requeue_io(inode); + continue; + } + if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; if (!sb_is_blkdev_sb(sb)) @@ -531,7 +538,7 @@ void generic_sync_sb_inodes(struct super_block *sb, list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct address_space *mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE)) + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) continue; mapping = inode->i_mapping; if (mapping->nrpages == 0) diff --git a/fs/inode.c b/fs/inode.c index 913ab2d9a5d..826fb0b9d1c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -359,6 +359,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) invalidate_inode_buffers(inode); if (!atomic_read(&inode->i_count)) { list_move(&inode->i_list, dispose); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; count++; continue; @@ -460,6 +461,7 @@ static void prune_icache(int nr_to_scan) continue; } list_move(&inode->i_list, &freeable); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; nr_pruned++; } @@ -656,6 +658,7 @@ void unlock_new_inode(struct inode *inode) * just created it (so there can be no old holders * that haven't tested I_LOCK). */ + WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); inode->i_state &= ~(I_LOCK|I_NEW); wake_up_inode(inode); } @@ -1145,6 +1148,7 @@ void generic_delete_inode(struct inode *inode) list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -1186,16 +1190,19 @@ static void generic_forget_inode(struct inode *inode) spin_unlock(&inode_lock); return; } + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; spin_unlock(&inode_lock); write_inode_now(inode, 1); spin_lock(&inode_lock); + WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; inodes_stat.nr_unused--; hlist_del_init(&inode->i_hash); } list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); -- cgit v1.2.3 From 6c9fd1dc0a597e575617a7de7086c8a3efa8f524 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Fri, 6 Mar 2009 10:19:30 +0800 Subject: ocfs2: reserve xattr block for new directory with inline data If this is a new directory with inline data, we choose to reserve the entire inline area for directory contents and force an external xattr block. Signed-off-by: Tiger Yang Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4ddd788add6..c63efb5ef13 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -547,8 +547,12 @@ int ocfs2_calc_xattr_init(struct inode *dir, * when blocksize = 512, may reserve one more cluser for * xattr bucket, otherwise reserve one metadata block * for them is ok. + * If this is a new directory with inline data, + * we choose to reserve the entire inline area for + * directory contents and force an external xattr block. */ if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); if (ret) { -- cgit v1.2.3 From d9ae49d6e2b1ac9166e58ae3c9345135604beaa6 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Thu, 5 Mar 2009 11:06:15 +0800 Subject: ocfs2: tweak to get the maximum inline data size with xattr Replace max_inline_data with max_inline_data_with_xattr to ensure it correct when xattr inlined. Signed-off-by: Tiger Yang Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 7 +++++-- fs/ocfs2/namei.c | 3 ++- fs/ocfs2/ocfs2_fs.h | 6 ------ 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a067a6cffb0..8e1709a679b 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); if (size > PAGE_CACHE_SIZE || - size > ocfs2_max_inline_data(inode->i_sb)) { + size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu", (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1555,6 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, int ret, written = 0; loff_t end = pos + len; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = NULL; mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, @@ -1587,7 +1588,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, /* * Check whether the write can fit. */ - if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb)) + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + if (mmap_page || + end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) return 0; do_inline_write: diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 084aba86c3b..4b11762f249 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -532,7 +532,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); - fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb)); + fe->id2.i_data.id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(osb->sb, fe)); } else { fel = &fe->id2.i_list; fel->l_tree_depth = 0; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c7ae45aaa36..2332ef740f4 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1070,12 +1070,6 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) offsetof(struct ocfs2_dinode, id2.i_symlink); } -static inline int ocfs2_max_inline_data(struct super_block *sb) -{ - return sb->s_blocksize - - offsetof(struct ocfs2_dinode, id2.i_data.id_data); -} - static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, struct ocfs2_dinode *di) { -- cgit v1.2.3 From 74e77eb30d0ecbb12964d005b439c8b84a505b84 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Mar 2009 06:24:23 +0800 Subject: ocfs2: Fix a bug found by sparse check. We need to use le32_to_cpu to test rec->e_cpos in ocfs2_dinode_insert_check. Signed-off-by: Tao Ma Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 3a9e5deed74..19e3a96aa02 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -176,7 +176,8 @@ static int ocfs2_dinode_insert_check(struct inode *inode, BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != rec->e_cpos), + (OCFS2_I(inode)->ip_clusters != + le32_to_cpu(rec->e_cpos)), "Device %s, asking for sparse allocation: inode %llu, " "cpos %u, clusters %u\n", osb->dev_str, -- cgit v1.2.3 From 712e53e46a1da35fcd88c05aa0c675b10f7c0e9d Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Mar 2009 08:37:34 +0800 Subject: ocfs2: Use xs->bucket to set xattr value outside A long time ago, xs->base is allocated a 4K size and all the contents in the bucket are copied to the it. Now we use ocfs2_xattr_bucket to abstract xattr bucket and xs->base is initialized to the start of the bu_bhs[0]. So xs->base + offset will overflow when the value root is stored outside the first block. Then why we can survive the xattr test by now? It is because we always read the bucket contiguously now and kernel mm allocate continguous memory for us. We are lucky, but we should fix it. So just get the right value root as other callers do. Signed-off-by: Tao Ma Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c63efb5ef13..2563df89fc2 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -4795,19 +4795,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, char *val, int value_len) { - int offset; + int ret, offset, block_off; struct ocfs2_xattr_value_root *xv; struct ocfs2_xattr_entry *xe = xs->here; + struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); + void *base; BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); - offset = le16_to_cpu(xe->xe_name_offset) + - OCFS2_XATTR_SIZE(xe->xe_name_len); + ret = ocfs2_xattr_bucket_get_name_value(inode, xh, + xe - xh->xh_entries, + &block_off, + &offset); + if (ret) { + mlog_errno(ret); + goto out; + } - xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); + base = bucket_block(xs->bucket, block_off); + xv = (struct ocfs2_xattr_value_root *)(base + offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); - return __ocfs2_xattr_set_value_outside(inode, handle, - xv, val, value_len); + ret = __ocfs2_xattr_set_value_outside(inode, handle, + xv, val, value_len); + if (ret) + mlog_errno(ret); +out: + return ret; } static int ocfs2_rm_xattr_cluster(struct inode *inode, -- cgit v1.2.3 From 8d03c7a0c550e7ab24cadcef5e66656bfadec8b9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 14 Mar 2009 11:51:46 -0400 Subject: ext4: fix bogus BUG_ONs in in mballoc code Thiemo Nagel reported that: # dd if=/dev/zero of=image.ext4 bs=1M count=2 # mkfs.ext4 -v -F -b 1024 -m 0 -g 512 -G 4 -I 128 -N 1 \ -O large_file,dir_index,flex_bg,extent,sparse_super image.ext4 # mount -o loop image.ext4 mnt/ # dd if=/dev/zero of=mnt/file oopsed, with a BUG_ON in ext4_mb_normalize_request because size == EXT4_BLOCKS_PER_GROUP It appears to me (esp. after talking to Andreas) that the BUG_ON is bogus; a request of exactly EXT4_BLOCKS_PER_GROUP should be allowed, though larger sizes do indicate a problem. Fix that an another (apparently rare) codepath with a similar check. Reported-by: Thiemo Nagel Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4415beeb0b6..41f4348b62f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1447,7 +1447,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); @@ -3292,7 +3292,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ -- cgit v1.2.3 From 020fe22ff14320927f394de222cbb11708bcc7a8 Mon Sep 17 00:00:00 2001 From: Enrik Berkhan Date: Fri, 13 Mar 2009 13:51:56 -0700 Subject: nommu: ramfs: pages allocated to an inode's pagecache may get wrongly discarded The pages attached to a ramfs inode's pagecache by truncation from nothing - as done by SYSV SHM for example - may get discarded under memory pressure. The problem is that the pages are not marked dirty. Anything that creates data in an MMU-based ramfs will cause the pages holding that data will cause the set_page_dirty() aop to be called. For the NOMMU-based mmap, set_page_dirty() may be called by write(), but it won't be called by page-writing faults on writable mmaps, and it isn't called by ramfs_nommu_expand_for_mapping() when a file is being truncated from nothing to allocate a contiguous run. The solution is to mark the pages dirty at the point of allocation by the truncation code. Signed-off-by: Enrik Berkhan Signed-off-by: David Howells Cc: Peter Zijlstra Cc: Nick Piggin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index b9b567a2837..90d72bead55 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -114,6 +114,9 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) if (!pagevec_add(&lru_pvec, page)) __pagevec_lru_add_file(&lru_pvec); + /* prevent the page from being discarded on memory pressure */ + SetPageDirty(page); + unlock_page(page); } -- cgit v1.2.3 From 15e7b8767605dc0cb9bd4594caabfec392385210 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 13 Mar 2009 13:51:58 -0700 Subject: nommu: ramfs: don't leak pages when adding to page cache fails When a ramfs nommu mapping is expanded, contiguous pages are allocated and added to the pagecache. The caller's reference is then passed on by moving whole pagevecs to the file lru list. If the page cache adding fails, make sure that the error path also moves the pagevec contents which might still contain up to PAGEVEC_SIZE successfully added pages, of which we would leak references otherwise. Signed-off-by: Johannes Weiner Cc: David Howells Cc: Enrik Berkhan Cc: Nick Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 90d72bead55..5d7c7ececa6 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -129,6 +129,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) return -EFBIG; add_error: + pagevec_lru_add_file(&lru_pvec); page_cache_release(pages + loop); for (loop++; loop < npages; loop++) __free_page(pages + loop); -- cgit v1.2.3 From 84814d642a4f1f294bd675ab11aae1ca54c6cedb Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 13 Mar 2009 13:51:59 -0700 Subject: eCryptfs: don't encrypt file key with filename key eCryptfs has file encryption keys (FEK), file encryption key encryption keys (FEKEK), and filename encryption keys (FNEK). The per-file FEK is encrypted with one or more FEKEKs and stored in the header of the encrypted file. I noticed that the FEK is also being encrypted by the FNEK. This is a problem if a user wants to use a different FNEK than their FEKEK, as their file contents will still be accessible with the FNEK. This is a minimalistic patch which prevents the FNEKs signatures from being copied to the inode signatures list. Ultimately, it keeps the FEK from being encrypted with a FNEK. Signed-off-by: Tyler Hicks Cc: Serge Hallyn Acked-by: Dustin Kirkland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 2 ++ fs/ecryptfs/ecryptfs_kernel.h | 3 ++- fs/ecryptfs/keystore.c | 3 ++- fs/ecryptfs/main.c | 5 +++-- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f6caeb1d110..bdca1f4b3a3 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -946,6 +946,8 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( list_for_each_entry(global_auth_tok, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { + if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK) + continue; rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); if (rc) { printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index c11fc95714a..eb2267eca1f 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -328,6 +328,7 @@ struct ecryptfs_dentry_info { */ struct ecryptfs_global_auth_tok { #define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 +#define ECRYPTFS_AUTH_TOK_FNEK 0x00000002 u32 flags; struct list_head mount_crypt_stat_list; struct key *global_auth_tok_key; @@ -696,7 +697,7 @@ ecryptfs_write_header_metadata(char *virt, int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); int ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, - char *sig); + char *sig, u32 global_auth_tok_flags); int ecryptfs_get_global_auth_tok_for_sig( struct ecryptfs_global_auth_tok **global_auth_tok, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index ff539420cc6..e4a6223c314 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -2375,7 +2375,7 @@ struct kmem_cache *ecryptfs_global_auth_tok_cache; int ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, - char *sig) + char *sig, u32 global_auth_tok_flags) { struct ecryptfs_global_auth_tok *new_auth_tok; int rc = 0; @@ -2389,6 +2389,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, goto out; } memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_auth_tok->flags = global_auth_tok_flags; new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); list_add(&new_auth_tok->mount_crypt_stat_list, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 789cf2e1be1..aed56c25539 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -319,7 +319,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) case ecryptfs_opt_ecryptfs_sig: sig_src = args[0].from; rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, - sig_src); + sig_src, 0); if (rc) { printk(KERN_ERR "Error attempting to register " "global sig; rc = [%d]\n", rc); @@ -370,7 +370,8 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) ECRYPTFS_SIG_SIZE_HEX] = '\0'; rc = ecryptfs_add_global_auth_tok( mount_crypt_stat, - mount_crypt_stat->global_default_fnek_sig); + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_AUTH_TOK_FNEK); if (rc) { printk(KERN_ERR "Error attempting to register " "global fnek sig [%s]; rc = [%d]\n", -- cgit v1.2.3 From 87092698c665e0a358caf9825ae13114343027e8 Mon Sep 17 00:00:00 2001 From: un'ichi Nomura Date: Mon, 9 Mar 2009 10:40:52 +0100 Subject: block: Add gfp_mask parameter to bio_integrity_clone() Stricter gfp_mask might be required for clone allocation. For example, request-based dm may clone bio in interrupt context so it has to use GFP_ATOMIC. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Acked-by: Martin K. Petersen Cc: Alasdair G Kergon Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 5 +++-- fs/bio.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 549b0144da1..fe2b1aa2464 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -685,19 +685,20 @@ EXPORT_SYMBOL(bio_integrity_split); * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio + * @gfp_mask: Memory allocation mask * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - struct bio_set *bs) + gfp_t gfp_mask, struct bio_set *bs) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); if (bip == NULL) return -EIO; diff --git a/fs/bio.c b/fs/bio.c index 124b95c4d58..cf747378b97 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -463,7 +463,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); if (ret < 0) return NULL; -- cgit v1.2.3 From 059ea3318c8ede71851a52b4359fbf1ab0cec301 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Mar 2009 10:42:45 +0100 Subject: block: fix memory leak in bio_clone() If bio_integrity_clone() fails, bio_clone() returns NULL without freeing the newly allocated bio. Signed-off-by: Li Zefan Signed-off-by: Jens Axboe --- fs/bio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index cf747378b97..d4f06327c81 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -465,8 +465,10 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); - if (ret < 0) + if (ret < 0) { + bio_put(b); return NULL; + } } return b; -- cgit v1.2.3 From ee6f779b9e0851e2f7da292a9f58e0095edf615a Mon Sep 17 00:00:00 2001 From: Zhang Le Date: Mon, 16 Mar 2009 14:44:31 +0800 Subject: filp->f_pos not correctly updated in proc_task_readdir filp->f_pos only get updated at the end of the function. Thus d_off of those dirents who are in the middle will be 0, and this will cause a problem in glibc's readdir implementation, specifically endless loop. Because when overflow occurs, f_pos will be set to next dirent to read, however it will be 0, unless the next one is the last one. So it will start over again and again. There is a sample program in man 2 gendents. This is the output of the program running on a multithread program's task dir before this patch is applied: $ ./a.out /proc/3807/task --------------- nread=128 --------------- i-node# file type d_reclen d_off d_name 506442 directory 16 1 . 506441 directory 16 0 .. 506443 directory 16 0 3807 506444 directory 16 0 3809 506445 directory 16 0 3812 506446 directory 16 0 3861 506447 directory 16 0 3862 506448 directory 16 8 3863 This is the output after this patch is applied $ ./a.out /proc/3807/task --------------- nread=128 --------------- i-node# file type d_reclen d_off d_name 506442 directory 16 1 . 506441 directory 16 2 .. 506443 directory 16 3 3807 506444 directory 16 4 3809 506445 directory 16 5 3812 506446 directory 16 6 3861 506447 directory 16 7 3862 506448 directory 16 8 3863 Signed-off-by: Zhang Le Acked-by: Al Viro Signed-off-by: Linus Torvalds --- fs/proc/base.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0c9de19a163..cc6ea2329e7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3066,7 +3066,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi int retval = -ENOENT; ino_t ino; int tid; - unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ struct pid_namespace *ns; task = get_proc_task(inode); @@ -3083,18 +3082,18 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi goto out_no_task; retval = 0; - switch (pos) { + switch (filp->f_pos) { case 0: ino = inode->i_ino; - if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) goto out; - pos++; + filp->f_pos++; /* fall through */ case 1: ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) + if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) goto out; - pos++; + filp->f_pos++; /* fall through */ } @@ -3104,9 +3103,9 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi ns = filp->f_dentry->d_sb->s_fs_info; tid = (int)filp->f_version; filp->f_version = 0; - for (task = first_tid(leader, tid, pos - 2, ns); + for (task = first_tid(leader, tid, filp->f_pos - 2, ns); task; - task = next_tid(task), pos++) { + task = next_tid(task), filp->f_pos++) { tid = task_pid_nr_ns(task, ns); if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { /* returning this tgid failed, save it as the first @@ -3117,7 +3116,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi } } out: - filp->f_pos = pos; put_task_struct(leader); out_no_task: return retval; -- cgit v1.2.3 From d33a1976fbee1ee321d6f014333d8f03a39d526c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 16 Mar 2009 23:25:40 -0400 Subject: ext4: fix bb_prealloc_list corruption due to wrong group locking This is for Red Hat bug 490026: EXT4 panic, list corruption in ext4_mb_new_inode_pa ext4_lock_group(sb, group) is supposed to protect this list for each group, and a common code flow to remove an album is like this: ext4_get_group_no_and_offset(sb, pa->pa_pstart, &grp, NULL); ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); so it's critical that we get the right group number back for this prealloc context, to lock the right group (the one associated with this pa) and prevent concurrent list manipulation. however, ext4_mb_put_pa() passes in (pa->pa_pstart - 1) with a comment, "-1 is to protect from crossing allocation group". This makes sense for the group_pa, where pa_pstart is advanced by the length which has been used (in ext4_mb_release_context()), and when the entire length has been used, pa_pstart has been advanced to the first block of the next group. However, for inode_pa, pa_pstart is never advanced; it's just set once to the first block in the group and not moved after that. So in this case, if we subtract one in ext4_mb_put_pa(), we are actually locking the *previous* group, and opening the race with the other threads which do not subtract off the extra block. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 41f4348b62f..9f61e62f435 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3589,6 +3589,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; + ext4_fsblk_t grp_blk; if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) return; @@ -3603,8 +3604,12 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); - /* -1 is to protect from crossing allocation group */ - ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); + grp_blk = pa->pa_pstart; + /* If linear, pa_pstart may be in the next group when pa is used up */ + if (pa->pa_linear) + grp_blk--; + + ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); /* * possible race: -- cgit v1.2.3 From ee568b25ee9e160b32d1aef73d8b2ee9c05d34db Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Mar 2009 10:02:35 -0700 Subject: Avoid 64-bit "switch()" statements on 32-bit architectures Commit ee6f779b9e0851e2f7da292a9f58e0095edf615a ("filp->f_pos not correctly updated in proc_task_readdir") changed the proc code to use filp->f_pos directly, rather than through a temporary variable. In the process, that caused the operations to be done on the full 64 bits, even though the offset is never that big. That's all fine and dandy per se, but for some unfathomable reason gcc generates absolutely horrid code when using 64-bit values in switch() statements. To the point of actually calling out to gcc helper functions like __cmpdi2 rather than just doing the trivial comparisons directly the way gcc does for normal compares. At which point we get link failures, because we really don't want to support that kind of crazy code. Fix this by just casting the f_pos value to "unsigned long", which is plenty big enough for /proc, and avoids the gcc code generation issue. Reported-by: Alexey Dobriyan Cc: Zhang Le Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index cc6ea2329e7..beaa0ce3b82 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3082,7 +3082,7 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi goto out_no_task; retval = 0; - switch (filp->f_pos) { + switch ((unsigned long)filp->f_pos) { case 0: ino = inode->i_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) -- cgit v1.2.3 From 84f09f46b4ee9e4e9b6381f8af31817516d2091b Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 4 Mar 2009 23:05:35 +0200 Subject: NFSD: provide encode routine for OP_OPENATTR Although this operation is unsupported by our implementation we still need to provide an encode routine for it to merely encode its (error) status back in the compound reply. Thanks for Bill Baker at sun.com for testing with the Sun OpenSolaris' client, finding, and reporting this bug at Connectathon 2009. This bug was introduced in 2.6.27 Signed-off-by: Benny Halevy Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f65953be39c..9250067943d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2596,6 +2596,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, + [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, -- cgit v1.2.3 From a8e7d49aa7be728c4ae241a75a2a124cdcabc0c5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 19 Mar 2009 11:32:05 -0700 Subject: Fix race in create_empty_buffers() vs __set_page_dirty_buffers() Nick Piggin noticed this (very unlikely) race between setting a page dirty and creating the buffers for it - we need to hold the mapping private_lock until we've set the page dirty bit in order to make sure that create_empty_buffers() might not build up a set of buffers without the dirty bits set when the page is dirty. I doubt anybody has ever hit this race (and it didn't solve the issue Nick was looking at), but as Nick says: "Still, it does appear to solve a real race, which we should close." Acked-by: Nick Piggin Signed-off-by: Linus Torvalds --- fs/buffer.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 9f697419ed8..891e1c78e4f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -760,15 +760,9 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. */ -static int __set_page_dirty(struct page *page, +static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { - if (unlikely(!mapping)) - return !TestSetPageDirty(page); - - if (TestSetPageDirty(page)) - return 0; - spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); @@ -785,8 +779,6 @@ static int __set_page_dirty(struct page *page, } spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - - return 1; } /* @@ -816,6 +808,7 @@ static int __set_page_dirty(struct page *page, */ int __set_page_dirty_buffers(struct page *page) { + int newly_dirty; struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) @@ -831,9 +824,12 @@ int __set_page_dirty_buffers(struct page *page) bh = bh->b_this_page; } while (bh != head); } + newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); - return __set_page_dirty(page, mapping, 1); + if (newly_dirty) + __set_page_dirty(page, mapping, 1); + return newly_dirty; } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -1262,8 +1258,11 @@ void mark_buffer_dirty(struct buffer_head *bh) return; } - if (!test_set_buffer_dirty(bh)) - __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); + if (!test_set_buffer_dirty(bh)) { + struct page *page = bh->b_page; + if (!TestSetPageDirty(page)) + __set_page_dirty(page, page_mapping(page), 0); + } } /* -- cgit v1.2.3 From 87c3a86e1c220121d0ced59d1a71e78ed9abc6dd Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 18 Mar 2009 17:04:19 -0700 Subject: eventfd: remove fput() call from possible IRQ context Remove a source of fput() call from inside IRQ context. Myself, like Eric, wasn't able to reproduce an fput() call from IRQ context, but Jeff said he was able to, with the attached test program. Independently from this, the bug is conceptually there, so we might be better off fixing it. This patch adds an optimization similar to the one we already do on ->ki_filp, on ->ki_eventfd. Playing with ->f_count directly is not pretty in general, but the alternative here would be to add a brand new delayed fput() infrastructure, that I'm not sure is worth it. Signed-off-by: Davide Libenzi Cc: Benjamin LaHaise Cc: Trond Myklebust Cc: Eric Dumazet Signed-off-by: Jeff Moyer Cc: Zach Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 8fa77e23394..4a9d4d641fb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -443,7 +443,7 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) req->private = NULL; req->ki_iovec = NULL; INIT_LIST_HEAD(&req->ki_run_list); - req->ki_eventfd = ERR_PTR(-EINVAL); + req->ki_eventfd = NULL; /* Check if the completion queue has enough free space to * accept an event from this io. @@ -485,8 +485,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) { assert_spin_locked(&ctx->ctx_lock); - if (!IS_ERR(req->ki_eventfd)) - fput(req->ki_eventfd); if (req->ki_dtor) req->ki_dtor(req); if (req->ki_iovec != &req->ki_inline_vec) @@ -508,8 +506,11 @@ static void aio_fput_routine(struct work_struct *data) list_del(&req->ki_list); spin_unlock_irq(&fput_lock); - /* Complete the fput */ - __fput(req->ki_filp); + /* Complete the fput(s) */ + if (req->ki_filp != NULL) + __fput(req->ki_filp); + if (req->ki_eventfd != NULL) + __fput(req->ki_eventfd); /* Link the iocb into the context's free list */ spin_lock_irq(&ctx->ctx_lock); @@ -527,12 +528,14 @@ static void aio_fput_routine(struct work_struct *data) */ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) { + int schedule_putreq = 0; + dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", req, atomic_long_read(&req->ki_filp->f_count)); assert_spin_locked(&ctx->ctx_lock); - req->ki_users --; + req->ki_users--; BUG_ON(req->ki_users < 0); if (likely(req->ki_users)) return 0; @@ -540,10 +543,23 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) req->ki_cancel = NULL; req->ki_retry = NULL; - /* Must be done under the lock to serialise against cancellation. - * Call this aio_fput as it duplicates fput via the fput_work. + /* + * Try to optimize the aio and eventfd file* puts, by avoiding to + * schedule work in case it is not __fput() time. In normal cases, + * we would not be holding the last reference to the file*, so + * this function will be executed w/out any aio kthread wakeup. */ - if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) + schedule_putreq++; + else + req->ki_filp = NULL; + if (req->ki_eventfd != NULL) { + if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count))) + schedule_putreq++; + else + req->ki_eventfd = NULL; + } + if (unlikely(schedule_putreq)) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); @@ -1009,7 +1025,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (!IS_ERR(iocb->ki_eventfd)) + if (iocb->ki_eventfd != NULL) eventfd_signal(iocb->ki_eventfd, 1); put_rq: @@ -1608,6 +1624,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); + req->ki_eventfd = NULL; goto out_put_req; } } -- cgit v1.2.3 From 65c24491b4fef017c64e39ec64384fde5e05e0a0 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 18 Mar 2009 17:04:21 -0700 Subject: aio: lookup_ioctx can return the wrong value when looking up a bogus context The libaio test harness turned up a problem whereby lookup_ioctx on a bogus io context was returning the 1 valid io context from the list (harness/cases/3.p). Because of that, an extra put_iocontext was done, and when the process exited, it hit a BUG_ON in the put_iocontext macro called from exit_aio (since we expect a users count of 1 and instead get 0). The problem was introduced by "aio: make the lookup_ioctx() lockless" (commit abf137dd7712132ee56d5b3143c2ff61a72a5faa). Thanks to Zach for pointing out that hlist_for_each_entry_rcu will not return with a NULL tpos at the end of the loop, even if the entry was not found. Signed-off-by: Jeff Moyer Acked-by: Zach Brown Acked-by: Jens Axboe Cc: Benjamin LaHaise Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 4a9d4d641fb..76da1253795 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -587,7 +587,7 @@ int aio_put_req(struct kiocb *req) static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct mm_struct *mm = current->mm; - struct kioctx *ctx = NULL; + struct kioctx *ctx, *ret = NULL; struct hlist_node *n; rcu_read_lock(); @@ -595,12 +595,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { if (ctx->user_id == ctx_id && !ctx->dead) { get_ioctx(ctx); + ret = ctx; break; } } rcu_read_unlock(); - return ctx; + return ret; } /* -- cgit v1.2.3 From 8faece5f906725c10e7a1f6caf84452abadbdc7b Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Mar 2009 01:25:09 -0500 Subject: eCryptfs: Allocate a variable number of pages for file headers When allocating the memory used to store the eCryptfs header contents, a single, zeroed page was being allocated with get_zeroed_page(). However, the size of an eCryptfs header is either PAGE_CACHE_SIZE or ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE (8192), whichever is larger, and is stored in the file's private_data->crypt_stat->num_header_bytes_at_front field. ecryptfs_write_metadata_to_contents() was using num_header_bytes_at_front to decide how many bytes should be written to the lower filesystem for the file header. Unfortunately, at least 8K was being written from the page, despite the chance of the single, zeroed page being smaller than 8K. This resulted in random areas of kernel memory being written between the 0x1000 and 0x1FFF bytes offsets in the eCryptfs file headers if PAGE_SIZE was 4K. This patch allocates a variable number of pages, calculated with num_header_bytes_at_front, and passes the number of allocated pages along to ecryptfs_write_metadata_to_contents(). Thanks to Florian Streibelt for reporting the data leak and working with me to find the problem. 2.6.28 is the only kernel release with this vulnerability. Corresponds to CVE-2009-0787 Signed-off-by: Tyler Hicks Acked-by: Dustin Kirkland Reviewed-by: Eric Sandeen Reviewed-by: Eugene Teo Cc: Greg KH Cc: dann frazier Cc: Serge E. Hallyn Cc: Florian Streibelt Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index bdca1f4b3a3..75bee99de0f 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1324,14 +1324,13 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, } static int -ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, - struct dentry *ecryptfs_dentry, - char *virt) +ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, + char *virt, size_t virt_len) { int rc; rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, - 0, crypt_stat->num_header_bytes_at_front); + 0, virt_len); if (rc) printk(KERN_ERR "%s: Error attempting to write header " "information to lower file; rc = [%d]\n", __func__, @@ -1341,7 +1340,6 @@ ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, static int ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, - struct ecryptfs_crypt_stat *crypt_stat, char *page_virt, size_t size) { int rc; @@ -1351,6 +1349,17 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, return rc; } +static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, + unsigned int order) +{ + struct page *page; + + page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (page) + return (unsigned long) page_address(page); + return 0; +} + /** * ecryptfs_write_metadata * @ecryptfs_dentry: The eCryptfs dentry @@ -1367,7 +1376,9 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + unsigned int order; char *virt; + size_t virt_len; size_t size = 0; int rc = 0; @@ -1383,33 +1394,35 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = -EINVAL; goto out; } + virt_len = crypt_stat->num_header_bytes_at_front; + order = get_order(virt_len); /* Released in this function */ - virt = (char *)get_zeroed_page(GFP_KERNEL); + virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); if (!virt) { printk(KERN_ERR "%s: Out of memory\n", __func__); rc = -ENOMEM; goto out; } - rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size, - crypt_stat, ecryptfs_dentry); + rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat, + ecryptfs_dentry); if (unlikely(rc)) { printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", __func__, rc); goto out_free; } if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, - crypt_stat, virt, size); + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, + size); else - rc = ecryptfs_write_metadata_to_contents(crypt_stat, - ecryptfs_dentry, virt); + rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + virt_len); if (rc) { printk(KERN_ERR "%s: Error writing metadata out to lower file; " "rc = [%d]\n", __func__, rc); goto out_free; } out_free: - free_page((unsigned long)virt); + free_pages((unsigned long)virt, order); out: return rc; } -- cgit v1.2.3 From 2aac0cf88681bfa092f731553bc7fbd23516be73 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Mar 2009 02:23:57 -0500 Subject: eCryptfs: NULL crypt_stat dereference during lookup If ecryptfs_encrypted_view or ecryptfs_xattr_metadata were being specified as mount options, a NULL pointer dereference of crypt_stat was possible during lookup. This patch moves the crypt_stat assignment into ecryptfs_lookup_and_interpose_lower(), ensuring that crypt_stat will not be NULL before we attempt to dereference it. Thanks to Dan Carpenter and his static analysis tool, smatch, for finding this bug. Signed-off-by: Tyler Hicks Acked-by: Dustin Kirkland Cc: Dan Carpenter Cc: Serge Hallyn Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 10 ++++++---- fs/ecryptfs/ecryptfs_kernel.h | 1 - fs/ecryptfs/inode.c | 32 ++++++++++++-------------------- 3 files changed, 18 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 75bee99de0f..8b65f289ee0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -2221,17 +2221,19 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, struct dentry *ecryptfs_dir_dentry, const char *name, size_t name_size) { + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; char *decoded_name; size_t decoded_name_size; size_t packet_size; int rc = 0; - if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { - struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - &ecryptfs_superblock_to_private( - ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; const char *orig_name = name; size_t orig_name_size = name_size; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index eb2267eca1f..ac749d4d644 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -620,7 +620,6 @@ int ecryptfs_interpose(struct dentry *hidden_dentry, u32 flags); int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, - struct ecryptfs_crypt_stat *crypt_stat, struct inode *ecryptfs_dir_inode, struct nameidata *ecryptfs_nd); int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5697899a168..55b3145b807 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -246,7 +246,6 @@ out: */ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, - struct ecryptfs_crypt_stat *crypt_stat, struct inode *ecryptfs_dir_inode, struct nameidata *ecryptfs_nd) { @@ -254,6 +253,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct vfsmount *lower_mnt; struct inode *lower_inode; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_crypt_stat *crypt_stat; char *page_virt = NULL; u64 file_size; int rc = 0; @@ -314,6 +314,11 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, goto out_free_kmem; } } + crypt_stat = &ecryptfs_inode_to_private( + ecryptfs_dentry->d_inode)->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); rc = ecryptfs_read_and_validate_header_region(page_virt, ecryptfs_dentry->d_inode); if (rc) { @@ -362,9 +367,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, { char *encrypted_and_encoded_name = NULL; size_t encrypted_and_encoded_name_size; - struct ecryptfs_crypt_stat *crypt_stat = NULL; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; - struct ecryptfs_inode_info *inode_info; struct dentry *lower_dir_dentry, *lower_dentry; int rc = 0; @@ -388,26 +391,15 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, } if (lower_dentry->d_inode) goto lookup_and_interpose; - inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); - if (inode_info) { - crypt_stat = &inode_info->crypt_stat; - /* TODO: lock for crypt_stat comparison */ - if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) - ecryptfs_set_default_sizes(crypt_stat); - } - if (crypt_stat) - mount_crypt_stat = crypt_stat->mount_crypt_stat; - else - mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; - if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) - && !(mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if (!(mount_crypt_stat + && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) goto lookup_and_interpose; dput(lower_dentry); rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, - crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, + NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, ecryptfs_dentry->d_name.len); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " @@ -426,7 +418,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, } lookup_and_interpose: rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, - crypt_stat, ecryptfs_dir_inode, + ecryptfs_dir_inode, ecryptfs_nd); goto out; out_d_drop: -- cgit v1.2.3 From f762dd68218665bb87d4e4a0eeac86fde7530293 Mon Sep 17 00:00:00 2001 From: Gertjan van Wingerde Date: Sat, 21 Mar 2009 23:18:57 +0100 Subject: Update my email address Update all previous incarnations of my email address to the correct one. Signed-off-by: Gertjan van Wingerde Signed-off-by: Linus Torvalds --- fs/minix/inode.c | 2 +- fs/ufs/super.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/minix/inode.c b/fs/minix/inode.c index d1d1eb84679..618865b3128 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -3,7 +3,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * Copyright (C) 1996 Gertjan van Wingerde (gertjan@cs.vu.nl) + * Copyright (C) 1996 Gertjan van Wingerde * Minix V2 fs support. * * Modified for 680x0 by Andreas Schwab diff --git a/fs/ufs/super.c b/fs/ufs/super.c index e65212dfb60..261a1c2f22d 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -41,7 +41,7 @@ * Stefan Reinauer * * Module usage counts added on 96/04/29 by - * Gertjan van Wingerde + * Gertjan van Wingerde * * Clean swab support on 19970406 by * Francois-Rene Rideau -- cgit v1.2.3