From 143f412eb4c7cc48b9eb4381f9133b7d36c68075 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Mar 2006 21:20:46 -0800 Subject: [PATCH] NFS: Fix a potential panic in O_DIRECT Based on an original patch by Mike O'Connor and Greg Banks of SGI. Mike states: A normal user can panic an NFS client and cause a local DoS with 'judicious'(?) use of O_DIRECT. Any O_DIRECT write to an NFS file where the user buffer starts with a valid mapped page and contains an unmapped page, will crash in this way. I haven't followed the code, but O_DIRECT reads with similar user buffers will probably also crash albeit in different ways. Details: when nfs_get_user_pages() calls get_user_pages(), it detects and correctly handles get_user_pages() returning an error, which happens if the first page covered by the user buffer's address range is unmapped. However, if the first page is mapped but some subsequent page isn't, get_user_pages() will return a positive number which is less than the number of pages requested (this behaviour is sort of analagous to a short write() call and appears to be intentional). nfs_get_user_pages() doesn't detect this and hands off the array of pages (whose last few elements are random rubbish from the newly allocated array memory) to it's caller, whence they go to nfs_direct_write_seg(), which then totally ignores the nr_pages it's given, and calculates its own idea of how many pages are in the array from the user buffer length. Needless to say, when it comes to transmit those uninitialised page* pointers, we see a crash in the network stack. Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/direct.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 04ab2fc360e..4e9b3a1b36c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -57,6 +57,7 @@ #define NFSDBG_FACILITY NFSDBG_VFS #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) +static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty); static kmem_cache_t *nfs_direct_cachep; /* @@ -107,6 +108,15 @@ nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, page_count, (rw == READ), 0, *pages, NULL); up_read(¤t->mm->mmap_sem); + /* + * If we got fewer pages than expected from get_user_pages(), + * the user buffer runs off the end of a mapping; return EFAULT. + */ + if (result >= 0 && result < page_count) { + nfs_free_user_pages(*pages, result, 0); + *pages = NULL; + result = -EFAULT; + } } return result; } -- cgit v1.2.3 From c12e87f4652b1ba3be168b4f63a440399b941928 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Mar 2006 21:20:47 -0800 Subject: [PATCH] NFSv4: fix mount segfault on errors returned that are < -1000 It turns out that nfs4_proc_get_root() may return raw NFSv4 errors instead of mapping them to kernel errors. Problem spotted by Neil Horman Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 984ca3454d0..f8c0066e02e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1430,7 +1430,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, if (status == 0) status = nfs4_do_fsinfo(server, fhandle, info); out: - return status; + return nfs4_map_errors(status); } static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) -- cgit v1.2.3 From 30f4e20a0d3492668f5065af582b5af2d1e4256b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Mar 2006 21:20:49 -0800 Subject: [PATCH] NLM: Ensure we do not Oops in the case of an unlock In theory, NLM specs assure us that the server will only reply LCK_GRANTED or LCK_DENIED_GRACE_PERIOD to our NLM_UNLOCK request. In practice, we should not assume this to be the case, and the code will currently Oops if we do. Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/lockd/clntproc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 220058d8616..970b6a6aa33 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -662,12 +662,18 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) * reclaimed while we're stuck in the unlock call. */ fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED; + /* + * Note: the server is supposed to either grant us the unlock + * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either + * case, we want to unlock. + */ + do_vfs_lock(fl); + if (req->a_flags & RPC_TASK_ASYNC) { status = nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); /* Hrmf... Do the unlock early since locks_remove_posix() * really expects us to free the lock synchronously */ - do_vfs_lock(fl); if (status < 0) { nlmclnt_release_lockargs(req); kfree(req); @@ -680,7 +686,6 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) if (status < 0) return status; - do_vfs_lock(fl); if (resp->status == NLM_LCK_GRANTED) return 0; -- cgit v1.2.3 From a488edc914aa1d766a4e2c982b5ae03d5657ec1b Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Tue, 14 Mar 2006 13:44:00 -0600 Subject: [PATCH] JFS: Take logsync lock before testing mp->lsn This fixes a race where lsn could be cleared before taking the lock Signed-off-by: Dave Kleikamp Signed-off-by: Linus Torvalds --- fs/jfs/jfs_dmap.c | 7 ++----- fs/jfs/jfs_imap.c | 6 ++---- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 2967b739341..79b5404db10 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -532,10 +532,10 @@ dbUpdatePMap(struct inode *ipbmap, lastlblkno = lblkno; + LOGSYNC_LOCK(log, flags); if (mp->lsn != 0) { /* inherit older/smaller lsn */ logdiff(diffp, mp->lsn, log); - LOGSYNC_LOCK(log, flags); if (difft < diffp) { mp->lsn = lsn; @@ -548,20 +548,17 @@ dbUpdatePMap(struct inode *ipbmap, logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } else { mp->log = log; mp->lsn = lsn; /* insert bp after tblock in logsync list */ - LOGSYNC_LOCK(log, flags); - log->count++; list_add(&mp->synclist, &tblk->synclist); mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } + LOGSYNC_UNLOCK(log, flags); } /* write the last buffer. */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 31b4aa13dd4..4efa0d0eec3 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -2844,11 +2844,11 @@ diUpdatePMap(struct inode *ipimap, */ lsn = tblk->lsn; log = JFS_SBI(tblk->sb)->log; + LOGSYNC_LOCK(log, flags); if (mp->lsn != 0) { /* inherit older/smaller lsn */ logdiff(difft, lsn, log); logdiff(diffp, mp->lsn, log); - LOGSYNC_LOCK(log, flags); if (difft < diffp) { mp->lsn = lsn; /* move mp after tblock in logsync list */ @@ -2860,17 +2860,15 @@ diUpdatePMap(struct inode *ipimap, logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } else { mp->log = log; mp->lsn = lsn; /* insert mp after tblock in logsync list */ - LOGSYNC_LOCK(log, flags); log->count++; list_add(&mp->synclist, &tblk->synclist); mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } + LOGSYNC_UNLOCK(log, flags); write_metapage(mp); return (0); } -- cgit v1.2.3 From 3fb962bde48c413bfa419ec4413037e87955dcb6 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 15 Mar 2006 15:14:45 +1100 Subject: Fix a direct I/O locking issue revealed by the new mutex code. Affects only XFS (i.e. DIO_OWN_LOCKING case) - currently it is not possible to get i_mutex locking correct when using DIO_OWN direct I/O locking in a filesystem due to indeterminism in the possible return code/lock/unlock combinations. This can cause a direct read to attempt a double i_mutex unlock inside XFS. We're now ensuring __blockdev_direct_IO always exits with the inode i_mutex (still) held for a direct reader. Tested with the three different locking modes (via direct block device access, ext3 and XFS) - both reading and writing; cannot find any regressions resulting from this change, and it clearly fixes the mutex_unlock warning originally reported here: http://marc.theaimsgroup.com/?l=linux-kernel&m=114189068126253&w=2 Signed-off-by: Nathan Scott Acked-by: Christoph Hellwig --- fs/direct-io.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 848044af7e1..27f3e787fac 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1155,15 +1155,16 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * For writes, i_mutex is not held on entry; it is never taken. * * DIO_LOCKING (simple locking for regular files) - * For writes we are called under i_mutex and return with i_mutex held, even though - * it is internally dropped. + * For writes we are called under i_mutex and return with i_mutex held, even + * though it is internally dropped. * For reads, i_mutex is not held on entry, but it is taken and dropped before * returning. * * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of * uninitialised data, allowing parallel direct readers and writers) * For writes we are called without i_mutex, return without it, never touch it. - * For reads, i_mutex is held on entry and will be released before returning. + * For reads we are called under i_mutex and return with i_mutex held, even + * though it may be internally dropped. * * Additional i_alloc_sem locking requirements described inline below. */ @@ -1182,7 +1183,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - int reader_with_isem = (rw == READ && dio_lock_type == DIO_OWN_LOCKING); + int release_i_mutex = 0; + int acquire_i_mutex = 0; if (rw & WRITE) current->flags |= PF_SYNCWRITE; @@ -1225,7 +1227,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * writers need to grab i_alloc_sem only (i_mutex is already held) * For regular files using DIO_OWN_LOCKING, * neither readers nor writers take any locks here - * (i_mutex is already held and release for writers here) */ dio->lock_type = dio_lock_type; if (dio_lock_type != DIO_NO_LOCKING) { @@ -1236,7 +1237,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, mapping = iocb->ki_filp->f_mapping; if (dio_lock_type != DIO_OWN_LOCKING) { mutex_lock(&inode->i_mutex); - reader_with_isem = 1; + release_i_mutex = 1; } retval = filemap_write_and_wait_range(mapping, offset, @@ -1248,7 +1249,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (dio_lock_type == DIO_OWN_LOCKING) { mutex_unlock(&inode->i_mutex); - reader_with_isem = 0; + acquire_i_mutex = 1; } } @@ -1269,11 +1270,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, nr_segs, blkbits, get_blocks, end_io, dio); if (rw == READ && dio_lock_type == DIO_LOCKING) - reader_with_isem = 0; + release_i_mutex = 0; out: - if (reader_with_isem) + if (release_i_mutex) mutex_unlock(&inode->i_mutex); + else if (acquire_i_mutex) + mutex_lock(&inode->i_mutex); if (rw & WRITE) current->flags &= ~PF_SYNCWRITE; return retval; -- cgit v1.2.3 From 4983da07f1e2e8dc81cb9d640fbf35b899cdbdf2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 14 Mar 2006 19:50:19 -0800 Subject: [PATCH] page migration: fail if page is in a vma flagged VM_LOCKED page migration currently simply retries a couple of times if try_to_unmap() fails without inspecting the return code. However, SWAP_FAIL indicates that the page is in a vma that has the VM_LOCKED flag set (if ignore_refs ==1). We can check for that return code and avoid retrying the migration. migrate_page_remove_references() now needs to return a reason why the failure occured. So switch migrate_page_remove_references to use -Exx style error messages. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 62cfd17dc5f..a9b39940200 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,6 +3060,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page) { struct address_space *mapping = page->mapping; struct buffer_head *bh, *head; + int rc; if (!mapping) return -EAGAIN; @@ -3069,8 +3070,9 @@ int buffer_migrate_page(struct page *newpage, struct page *page) head = page_buffers(page); - if (migrate_page_remove_references(newpage, page, 3)) - return -EAGAIN; + rc = migrate_page_remove_references(newpage, page, 3); + if (rc) + return rc; bh = head; do { -- cgit v1.2.3 From f13b83580acef03a36c785dccc534ccdd7e43084 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 15 Mar 2006 17:37:32 +0100 Subject: [PATCH] fs/namespace.c:dup_namespace(): fix a use after free The Coverity checker spotted the following bug in dup_namespace(): <-- snip --> if (!new_ns->root) { up_write(&namespace_sem); kfree(new_ns); goto out; } ... out: return new_ns; <-- snip --> Callers expect a non-NULL result to not be freed. Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- fs/namespace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 058a44865be..39c81a8d631 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1338,7 +1338,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); if (!new_ns) - goto out; + return NULL; atomic_set(&new_ns->count, 1); INIT_LIST_HEAD(&new_ns->list); @@ -1352,7 +1352,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) if (!new_ns->root) { up_write(&namespace_sem); kfree(new_ns); - goto out; + return NULL; } spin_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); @@ -1393,7 +1393,6 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) if (altrootmnt) mntput(altrootmnt); -out: return new_ns; } -- cgit v1.2.3 From 2d7f2ea9c989853310c7f6e8be52cc090cc8e66b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 15 Mar 2006 21:41:59 +0000 Subject: [PATCH] Fix ext2 readdir f_pos re-validation logic This fixes not one, but _two_, silly (but admittedly hard to hit) bugs in the ext2 filesystem "readdir()" function. It also cleans up the code to avoid the unnecessary goto mess. The bugs were related to re-valiating the f_pos value after somebody had either done an "lseek()" on the directory to an invalid offset, or when the offset had become invalid due to a file being unlinked in the directory. The code would not only set the f_version too eagerly, it would also not update f_pos appropriately for when the offset fixup took place. When that happened, we'd occasionally subsequently fail the readdir() even when we shouldn't (no real harm done, but an ugly printk, and obviously you would end up not necessarily seeing all entries). Thanks to Masoud Sharbiani who noticed the problem and had a test-case for it, and also fixed up a thinko in the first version of this patch. Signed-off-by: Al Viro Acked-by: Masoud Sharbiani Signed-off-by: Linus Torvalds --- fs/ext2/dir.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 7442bdd1267..b3dbd716cd3 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -256,11 +256,10 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); unsigned char *types = NULL; - int need_revalidate = (filp->f_version != inode->i_version); - int ret; + int need_revalidate = filp->f_version != inode->i_version; if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) - goto success; + return 0; if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) types = ext2_filetype_table; @@ -275,12 +274,15 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) "bad page in #%lu", inode->i_ino); filp->f_pos += PAGE_CACHE_SIZE - offset; - ret = -EIO; - goto done; + return -EIO; } kaddr = page_address(page); - if (need_revalidate) { - offset = ext2_validate_entry(kaddr, offset, chunk_mask); + if (unlikely(need_revalidate)) { + if (offset) { + offset = ext2_validate_entry(kaddr, offset, chunk_mask); + filp->f_pos = (n<f_version = inode->i_version; need_revalidate = 0; } de = (ext2_dirent *)(kaddr+offset); @@ -289,9 +291,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) if (de->rec_len == 0) { ext2_error(sb, __FUNCTION__, "zero-length directory entry"); - ret = -EIO; ext2_put_page(page); - goto done; + return -EIO; } if (de->inode) { int over; @@ -306,19 +307,14 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) le32_to_cpu(de->inode), d_type); if (over) { ext2_put_page(page); - goto success; + return 0; } } filp->f_pos += le16_to_cpu(de->rec_len); } ext2_put_page(page); } - -success: - ret = 0; -done: - filp->f_version = inode->i_version; - return ret; + return 0; } /* -- cgit v1.2.3