From 143f412eb4c7cc48b9eb4381f9133b7d36c68075 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 13 Mar 2006 21:20:46 -0800
Subject: [PATCH] NFS: Fix a potential panic in O_DIRECT

Based on an original patch by Mike O'Connor and Greg Banks of SGI.

Mike states:

A normal user can panic an NFS client and cause a local DoS with
'judicious'(?) use of O_DIRECT.  Any O_DIRECT write to an NFS file where the
user buffer starts with a valid mapped page and contains an unmapped page,
will crash in this way.  I haven't followed the code, but O_DIRECT reads with
similar user buffers will probably also crash albeit in different ways.

Details: when nfs_get_user_pages() calls get_user_pages(), it detects and
correctly handles get_user_pages() returning an error, which happens if the
first page covered by the user buffer's address range is unmapped.  However,
if the first page is mapped but some subsequent page isn't, get_user_pages()
will return a positive number which is less than the number of pages requested
(this behaviour is sort of analagous to a short write() call and appears to be
intentional).  nfs_get_user_pages() doesn't detect this and hands off the
array of pages (whose last few elements are random rubbish from the newly
allocated array memory) to it's caller, whence they go to
nfs_direct_write_seg(), which then totally ignores the nr_pages it's given,
and calculates its own idea of how many pages are in the array from the user
buffer length.  Needless to say, when it comes to transmit those uninitialised
page* pointers, we see a crash in the network stack.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/direct.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 04ab2fc360e..4e9b3a1b36c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -57,6 +57,7 @@
 #define NFSDBG_FACILITY		NFSDBG_VFS
 #define MAX_DIRECTIO_SIZE	(4096UL << PAGE_SHIFT)
 
+static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty);
 static kmem_cache_t *nfs_direct_cachep;
 
 /*
@@ -107,6 +108,15 @@ nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
 					page_count, (rw == READ), 0,
 					*pages, NULL);
 		up_read(&current->mm->mmap_sem);
+		/*
+		 * If we got fewer pages than expected from get_user_pages(),
+		 * the user buffer runs off the end of a mapping; return EFAULT.
+		 */
+		if (result >= 0 && result < page_count) {
+			nfs_free_user_pages(*pages, result, 0);
+			*pages = NULL;
+			result = -EFAULT;
+		}
 	}
 	return result;
 }
-- 
cgit v1.2.3


From c12e87f4652b1ba3be168b4f63a440399b941928 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 13 Mar 2006 21:20:47 -0800
Subject: [PATCH] NFSv4: fix mount segfault on errors returned that are < -1000

It turns out that nfs4_proc_get_root() may return raw NFSv4 errors instead of
mapping them to kernel errors.  Problem spotted by Neil Horman
<nhorman@tuxdriver.com>

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 984ca3454d0..f8c0066e02e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1430,7 +1430,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 	if (status == 0)
 		status = nfs4_do_fsinfo(server, fhandle, info);
 out:
-	return status;
+	return nfs4_map_errors(status);
 }
 
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
-- 
cgit v1.2.3


From 30f4e20a0d3492668f5065af582b5af2d1e4256b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 13 Mar 2006 21:20:49 -0800
Subject: [PATCH] NLM: Ensure we do not Oops in the case of an unlock

In theory, NLM specs assure us that the server will only reply LCK_GRANTED or
LCK_DENIED_GRACE_PERIOD to our NLM_UNLOCK request.

In practice, we should not assume this to be the case, and the code will
currently Oops if we do.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/lockd/clntproc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 220058d8616..970b6a6aa33 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -662,12 +662,18 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	 * reclaimed while we're stuck in the unlock call. */
 	fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED;
 
+	/*
+	 * Note: the server is supposed to either grant us the unlock
+	 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
+	 * case, we want to unlock.
+	 */
+	do_vfs_lock(fl);
+
 	if (req->a_flags & RPC_TASK_ASYNC) {
 		status = nlmclnt_async_call(req, NLMPROC_UNLOCK,
 					&nlmclnt_unlock_ops);
 		/* Hrmf... Do the unlock early since locks_remove_posix()
 		 * really expects us to free the lock synchronously */
-		do_vfs_lock(fl);
 		if (status < 0) {
 			nlmclnt_release_lockargs(req);
 			kfree(req);
@@ -680,7 +686,6 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	if (status < 0)
 		return status;
 
-	do_vfs_lock(fl);
 	if (resp->status == NLM_LCK_GRANTED)
 		return 0;
 
-- 
cgit v1.2.3


From a488edc914aa1d766a4e2c982b5ae03d5657ec1b Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@austin.ibm.com>
Date: Tue, 14 Mar 2006 13:44:00 -0600
Subject: [PATCH] JFS: Take logsync lock before testing mp->lsn

This fixes a race where lsn could be cleared before taking the lock

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jfs/jfs_dmap.c | 7 ++-----
 fs/jfs/jfs_imap.c | 6 ++----
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 2967b739341..79b5404db10 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -532,10 +532,10 @@ dbUpdatePMap(struct inode *ipbmap,
 
 		lastlblkno = lblkno;
 
+		LOGSYNC_LOCK(log, flags);
 		if (mp->lsn != 0) {
 			/* inherit older/smaller lsn */
 			logdiff(diffp, mp->lsn, log);
-			LOGSYNC_LOCK(log, flags);
 			if (difft < diffp) {
 				mp->lsn = lsn;
 
@@ -548,20 +548,17 @@ dbUpdatePMap(struct inode *ipbmap,
 			logdiff(diffp, mp->clsn, log);
 			if (difft > diffp)
 				mp->clsn = tblk->clsn;
-			LOGSYNC_UNLOCK(log, flags);
 		} else {
 			mp->log = log;
 			mp->lsn = lsn;
 
 			/* insert bp after tblock in logsync list */
-			LOGSYNC_LOCK(log, flags);
-
 			log->count++;
 			list_add(&mp->synclist, &tblk->synclist);
 
 			mp->clsn = tblk->clsn;
-			LOGSYNC_UNLOCK(log, flags);
 		}
+		LOGSYNC_UNLOCK(log, flags);
 	}
 
 	/* write the last buffer. */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 31b4aa13dd4..4efa0d0eec3 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2844,11 +2844,11 @@ diUpdatePMap(struct inode *ipimap,
 	 */
 	lsn = tblk->lsn;
 	log = JFS_SBI(tblk->sb)->log;
+	LOGSYNC_LOCK(log, flags);
 	if (mp->lsn != 0) {
 		/* inherit older/smaller lsn */
 		logdiff(difft, lsn, log);
 		logdiff(diffp, mp->lsn, log);
-		LOGSYNC_LOCK(log, flags);
 		if (difft < diffp) {
 			mp->lsn = lsn;
 			/* move mp after tblock in logsync list */
@@ -2860,17 +2860,15 @@ diUpdatePMap(struct inode *ipimap,
 		logdiff(diffp, mp->clsn, log);
 		if (difft > diffp)
 			mp->clsn = tblk->clsn;
-		LOGSYNC_UNLOCK(log, flags);
 	} else {
 		mp->log = log;
 		mp->lsn = lsn;
 		/* insert mp after tblock in logsync list */
-		LOGSYNC_LOCK(log, flags);
 		log->count++;
 		list_add(&mp->synclist, &tblk->synclist);
 		mp->clsn = tblk->clsn;
-		LOGSYNC_UNLOCK(log, flags);
 	}
+	LOGSYNC_UNLOCK(log, flags);
 	write_metapage(mp);
 	return (0);
 }
-- 
cgit v1.2.3


From 3fb962bde48c413bfa419ec4413037e87955dcb6 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@bruce>
Date: Wed, 15 Mar 2006 15:14:45 +1100
Subject: Fix a direct I/O locking issue revealed by the new mutex code.
 Affects only XFS (i.e. DIO_OWN_LOCKING case) - currently it is not possible
 to get i_mutex locking correct when using DIO_OWN direct I/O locking in a
 filesystem due to indeterminism in the possible return code/lock/unlock
 combinations.  This can cause a direct read to attempt a double i_mutex
 unlock inside XFS.

We're now ensuring __blockdev_direct_IO always exits with the
inode i_mutex (still) held for a direct reader.

Tested with the three different locking modes (via direct block
device access, ext3 and XFS) - both reading and writing; cannot
find any regressions resulting from this change, and it clearly
fixes the mutex_unlock warning originally reported here:
http://marc.theaimsgroup.com/?l=linux-kernel&m=114189068126253&w=2

Signed-off-by: Nathan Scott <nathans@sgi.com>
Acked-by: Christoph Hellwig <hch@lst.de>
---
 fs/direct-io.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 848044af7e1..27f3e787fac 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1155,15 +1155,16 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
  * For writes, i_mutex is not held on entry; it is never taken.
  *
  * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_mutex and return with i_mutex held, even though
- * it is internally dropped.
+ * For writes we are called under i_mutex and return with i_mutex held, even
+ * though it is internally dropped.
  * For reads, i_mutex is not held on entry, but it is taken and dropped before
  * returning.
  *
  * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
  *	uninitialised data, allowing parallel direct readers and writers)
  * For writes we are called without i_mutex, return without it, never touch it.
- * For reads, i_mutex is held on entry and will be released before returning.
+ * For reads we are called under i_mutex and return with i_mutex held, even
+ * though it may be internally dropped.
  *
  * Additional i_alloc_sem locking requirements described inline below.
  */
@@ -1182,7 +1183,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
 	struct dio *dio;
-	int reader_with_isem = (rw == READ && dio_lock_type == DIO_OWN_LOCKING);
+	int release_i_mutex = 0;
+	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
 		current->flags |= PF_SYNCWRITE;
@@ -1225,7 +1227,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
 	 * For regular files using DIO_OWN_LOCKING,
 	 *	neither readers nor writers take any locks here
-	 *	(i_mutex is already held and release for writers here)
 	 */
 	dio->lock_type = dio_lock_type;
 	if (dio_lock_type != DIO_NO_LOCKING) {
@@ -1236,7 +1237,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 			mapping = iocb->ki_filp->f_mapping;
 			if (dio_lock_type != DIO_OWN_LOCKING) {
 				mutex_lock(&inode->i_mutex);
-				reader_with_isem = 1;
+				release_i_mutex = 1;
 			}
 
 			retval = filemap_write_and_wait_range(mapping, offset,
@@ -1248,7 +1249,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 
 			if (dio_lock_type == DIO_OWN_LOCKING) {
 				mutex_unlock(&inode->i_mutex);
-				reader_with_isem = 0;
+				acquire_i_mutex = 1;
 			}
 		}
 
@@ -1269,11 +1270,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 				nr_segs, blkbits, get_blocks, end_io, dio);
 
 	if (rw == READ && dio_lock_type == DIO_LOCKING)
-		reader_with_isem = 0;
+		release_i_mutex = 0;
 
 out:
-	if (reader_with_isem)
+	if (release_i_mutex)
 		mutex_unlock(&inode->i_mutex);
+	else if (acquire_i_mutex)
+		mutex_lock(&inode->i_mutex);
 	if (rw & WRITE)
 		current->flags &= ~PF_SYNCWRITE;
 	return retval;
-- 
cgit v1.2.3


From 4983da07f1e2e8dc81cb9d640fbf35b899cdbdf2 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Tue, 14 Mar 2006 19:50:19 -0800
Subject: [PATCH] page migration: fail if page is in a vma flagged VM_LOCKED

page migration currently simply retries a couple of times if try_to_unmap()
fails without inspecting the return code.

However, SWAP_FAIL indicates that the page is in a vma that has the
VM_LOCKED flag set (if ignore_refs ==1).  We can check for that return code
and avoid retrying the migration.

migrate_page_remove_references() now needs to return a reason why the
failure occured.  So switch migrate_page_remove_references to use -Exx
style error messages.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 62cfd17dc5f..a9b39940200 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3060,6 +3060,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	struct buffer_head *bh, *head;
+	int rc;
 
 	if (!mapping)
 		return -EAGAIN;
@@ -3069,8 +3070,9 @@ int buffer_migrate_page(struct page *newpage, struct page *page)
 
 	head = page_buffers(page);
 
-	if (migrate_page_remove_references(newpage, page, 3))
-		return -EAGAIN;
+	rc = migrate_page_remove_references(newpage, page, 3);
+	if (rc)
+		return rc;
 
 	bh = head;
 	do {
-- 
cgit v1.2.3


From f13b83580acef03a36c785dccc534ccdd7e43084 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 15 Mar 2006 17:37:32 +0100
Subject: [PATCH] fs/namespace.c:dup_namespace(): fix a use after free

The Coverity checker spotted the following bug in dup_namespace():

<--  snip  -->

        if (!new_ns->root) {
                up_write(&namespace_sem);
                kfree(new_ns);
                goto out;
        }
...
out:
        return new_ns;

<--  snip  -->

Callers expect a non-NULL result to not be freed.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 058a44865be..39c81a8d631 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1338,7 +1338,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
 
 	new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
 	if (!new_ns)
-		goto out;
+		return NULL;
 
 	atomic_set(&new_ns->count, 1);
 	INIT_LIST_HEAD(&new_ns->list);
@@ -1352,7 +1352,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
 	if (!new_ns->root) {
 		up_write(&namespace_sem);
 		kfree(new_ns);
-		goto out;
+		return NULL;
 	}
 	spin_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
@@ -1393,7 +1393,6 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
 	if (altrootmnt)
 		mntput(altrootmnt);
 
-out:
 	return new_ns;
 }
 
-- 
cgit v1.2.3


From 2d7f2ea9c989853310c7f6e8be52cc090cc8e66b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Wed, 15 Mar 2006 21:41:59 +0000
Subject: [PATCH] Fix ext2 readdir f_pos re-validation logic

This fixes not one, but _two_, silly (but admittedly hard to hit) bugs
in the ext2 filesystem "readdir()" function.  It also cleans up the code
to avoid the unnecessary goto mess.

The bugs were related to re-valiating the f_pos value after somebody had
either done an "lseek()" on the directory to an invalid offset, or when
the offset had become invalid due to a file being unlinked in the
directory.  The code would not only set the f_version too eagerly, it
would also not update f_pos appropriately for when the offset fixup took
place.

When that happened, we'd occasionally subsequently fail the readdir()
even when we shouldn't (no real harm done, but an ugly printk, and
obviously you would end up not necessarily seeing all entries).

Thanks to Masoud Sharbiani <masouds@google.com> who noticed the problem
and had a test-case for it, and also fixed up a thinko in the first
version of this patch.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Masoud Sharbiani <masouds@google.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/dir.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 7442bdd1267..b3dbd716cd3 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -256,11 +256,10 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
 	unsigned char *types = NULL;
-	int need_revalidate = (filp->f_version != inode->i_version);
-	int ret;
+	int need_revalidate = filp->f_version != inode->i_version;
 
 	if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
-		goto success;
+		return 0;
 
 	if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
 		types = ext2_filetype_table;
@@ -275,12 +274,15 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 				   "bad page in #%lu",
 				   inode->i_ino);
 			filp->f_pos += PAGE_CACHE_SIZE - offset;
-			ret = -EIO;
-			goto done;
+			return -EIO;
 		}
 		kaddr = page_address(page);
-		if (need_revalidate) {
-			offset = ext2_validate_entry(kaddr, offset, chunk_mask);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = ext2_validate_entry(kaddr, offset, chunk_mask);
+				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			filp->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (ext2_dirent *)(kaddr+offset);
@@ -289,9 +291,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 			if (de->rec_len == 0) {
 				ext2_error(sb, __FUNCTION__,
 					"zero-length directory entry");
-				ret = -EIO;
 				ext2_put_page(page);
-				goto done;
+				return -EIO;
 			}
 			if (de->inode) {
 				int over;
@@ -306,19 +307,14 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 						le32_to_cpu(de->inode), d_type);
 				if (over) {
 					ext2_put_page(page);
-					goto success;
+					return 0;
 				}
 			}
 			filp->f_pos += le16_to_cpu(de->rec_len);
 		}
 		ext2_put_page(page);
 	}
-
-success:
-	ret = 0;
-done:
-	filp->f_version = inode->i_version;
-	return ret;
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3