From 2785259631697ebb0749a3782cca206e2e542939 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 4 Feb 2008 23:48:37 -0800
Subject: nfs: use GFP_NOFS preloads for radix-tree insertion

NFS should use GFP_NOFS mode radix tree preloads rather than GFP_ATOMIC
allocations at radix-tree insertion-time.  This is important to reduce the
atomic memory requirement.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f55c437124a..7be42e6eb63 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -360,15 +360,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 /*
  * Insert a write request into an inode
  */
-static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int error;
 
 	error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
-	BUG_ON(error == -EEXIST);
-	if (error)
-		return error;
+	BUG_ON(error);
 	if (!nfsi->npages) {
 		igrab(inode);
 		if (nfs_have_delegation(inode, FMODE_WRITE))
@@ -378,8 +376,8 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	set_page_private(req->wb_page, (unsigned long)req);
 	nfsi->npages++;
 	kref_get(&req->wb_kref);
-	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
-	return 0;
+	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
+				NFS_PAGE_TAG_LOCKED);
 }
 
 /*
@@ -591,6 +589,13 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 		/* Loop over all inode entries and see if we find
 		 * A request for the page we wish to update
 		 */
+		if (new) {
+			if (radix_tree_preload(GFP_NOFS)) {
+				nfs_release_request(new);
+				return ERR_PTR(-ENOMEM);
+			}
+		}
+
 		spin_lock(&inode->i_lock);
 		req = nfs_page_find_request_locked(page);
 		if (req) {
@@ -601,28 +606,27 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 				error = nfs_wait_on_request(req);
 				nfs_release_request(req);
 				if (error < 0) {
-					if (new)
+					if (new) {
+						radix_tree_preload_end();
 						nfs_release_request(new);
+					}
 					return ERR_PTR(error);
 				}
 				continue;
 			}
 			spin_unlock(&inode->i_lock);
-			if (new)
+			if (new) {
+				radix_tree_preload_end();
 				nfs_release_request(new);
+			}
 			break;
 		}
 
 		if (new) {
-			int error;
 			nfs_lock_request_dontget(new);
-			error = nfs_inode_add_request(inode, new);
-			if (error) {
-				spin_unlock(&inode->i_lock);
-				nfs_unlock_request(new);
-				return ERR_PTR(error);
-			}
+			nfs_inode_add_request(inode, new);
 			spin_unlock(&inode->i_lock);
+			radix_tree_preload_end();
 			req = new;
 			goto zero_page;
 		}
-- 
cgit v1.2.3


From cbc20059259edee4ea24c923e6627c394830c972 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 14 Feb 2008 11:11:30 -0500
Subject: SUNRPC: Declare as const the rpc_message arguments to
 rpc_call_sync/async

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h | 9 +++++----
 net/sunrpc/clnt.c           | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 129a86e25d2..6fff7f82ef1 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -127,11 +127,12 @@ int		rpcb_getport_sync(struct sockaddr_in *, u32, u32, int);
 void		rpcb_getport_async(struct rpc_task *);
 
 void		rpc_call_start(struct rpc_task *);
-int		rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg,
-			       int flags, const struct rpc_call_ops *tk_ops,
+int		rpc_call_async(struct rpc_clnt *clnt,
+			       const struct rpc_message *msg, int flags,
+			       const struct rpc_call_ops *tk_ops,
 			       void *calldata);
-int		rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg,
-			      int flags);
+int		rpc_call_sync(struct rpc_clnt *clnt,
+			      const struct rpc_message *msg, int flags);
 struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred,
 			       int flags);
 void		rpc_restart_call(struct rpc_task *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 8c6a7f1a25e..a23512bb240 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -548,7 +548,7 @@ EXPORT_SYMBOL_GPL(rpc_run_task);
  * @msg: RPC call parameters
  * @flags: RPC call flags
  */
-int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
+int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags)
 {
 	struct rpc_task	*task;
 	struct rpc_task_setup task_setup_data = {
@@ -579,7 +579,7 @@ EXPORT_SYMBOL_GPL(rpc_call_sync);
  * @data: user call data
  */
 int
-rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
+rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags,
 	       const struct rpc_call_ops *tk_ops, void *data)
 {
 	struct rpc_task	*task;
-- 
cgit v1.2.3


From 4b5621f6b127bce9218998c187bd25bf7f9fc371 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 25 Feb 2008 15:56:29 -0800
Subject: NFS: Fix an f_mode/f_flags confusion in fs/nfs/write.c

O_SYNC is stored in filp->f_flags.
Thanks to Al Viro for pointing out the bug.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f55c437124a..80c61fdb272 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -734,7 +734,7 @@ int nfs_updatepage(struct file *file, struct page *page,
 	 */
 	if (nfs_write_pageuptodate(page, inode) &&
 			inode->i_flock == NULL &&
-			!(file->f_mode & O_SYNC)) {
+			!(file->f_flags & O_SYNC)) {
 		count = max(count + offset, nfs_page_length(page));
 		offset = 0;
 	}
-- 
cgit v1.2.3


From 383ba71938519959be8e0b598ec658f0c211ff45 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 19 Feb 2008 20:04:20 -0500
Subject: NFS: Fix a deadlock with lazy umount

We can't allow rpc callback functions like task->tk_ops->rpc_call_prepare()
and task->tk_ops->rpc_call_done() to call mntput() in any way, since
that will cause a deadlock when the call to rpc_shutdown_client() attempts
to wait on 'task' to complete.

We can avoid the above deadlock by moving calls to mntput to
task->tk_ops->rpc_release() callback, since at that time the task will be
marked as completed, and so rpc_shutdown_client won't attempt to wait on
it.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c |  5 +++--
 fs/nfs/inode.c  |  6 +++++-
 fs/nfs/read.c   |  7 +++++--
 fs/nfs/write.c  | 13 ++++++++++---
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 16844f98f50..e0170407a88 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -323,7 +323,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->inode = inode;
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
-		data->args.context = ctx;
+		data->args.context = get_nfs_open_context(ctx);
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
@@ -546,6 +546,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->args.fh = NFS_FH(data->inode);
 	data->args.offset = 0;
 	data->args.count = 0;
+	data->args.context = get_nfs_open_context(dreq->ctx);
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
@@ -728,7 +729,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->inode = inode;
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
-		data->args.context = ctx;
+		data->args.context = get_nfs_open_context(ctx);
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 966a8850aa3..a499fb58d85 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -521,8 +521,12 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 
 static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait)
 {
-	struct inode *inode = ctx->path.dentry->d_inode;
+	struct inode *inode;
 
+	if (ctx == NULL)
+		return;
+
+	inode = ctx->path.dentry->d_inode;
 	if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
 		return;
 	list_del(&ctx->list);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 3d7d9631e12..fab0d3720a0 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -73,7 +73,10 @@ static void nfs_readdata_free(struct nfs_read_data *rdata)
 
 void nfs_readdata_release(void *data)
 {
-        nfs_readdata_free(data);
+	struct nfs_read_data *rdata = data;
+
+	put_nfs_open_context(rdata->args.context);
+	nfs_readdata_free(rdata);
 }
 
 static
@@ -186,7 +189,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	data->args.pgbase = req->wb_pgbase + offset;
 	data->args.pages  = data->pagevec;
 	data->args.count  = count;
-	data->args.context = req->wb_context;
+	data->args.context = get_nfs_open_context(req->wb_context);
 
 	data->res.fattr   = &data->fattr;
 	data->res.count   = count;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 80c61fdb272..69b4158d9a1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -105,8 +105,11 @@ static void nfs_writedata_free(struct nfs_write_data *wdata)
 	call_rcu_bh(&wdata->task.u.tk_rcu, nfs_writedata_rcu_free);
 }
 
-void nfs_writedata_release(void *wdata)
+void nfs_writedata_release(void *data)
 {
+	struct nfs_write_data *wdata = data;
+
+	put_nfs_open_context(wdata->args.context);
 	nfs_writedata_free(wdata);
 }
 
@@ -816,7 +819,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	data->args.pgbase = req->wb_pgbase + offset;
 	data->args.pages  = data->pagevec;
 	data->args.count  = count;
-	data->args.context = req->wb_context;
+	data->args.context = get_nfs_open_context(req->wb_context);
 	data->args.stable  = NFS_UNSTABLE;
 	if (how & FLUSH_STABLE) {
 		data->args.stable = NFS_DATA_SYNC;
@@ -1153,8 +1156,11 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-void nfs_commit_release(void *wdata)
+void nfs_commit_release(void *data)
 {
+	struct nfs_write_data *wdata = data;
+
+	put_nfs_open_context(wdata->args.context);
 	nfs_commit_free(wdata);
 }
 
@@ -1197,6 +1203,7 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 	/* Note: we always request a commit of the entire inode */
 	data->args.offset = 0;
 	data->args.count  = 0;
+	data->args.context = get_nfs_open_context(first->wb_context);
 	data->res.count   = 0;
 	data->res.fattr   = &data->fattr;
 	data->res.verf    = &data->verf;
-- 
cgit v1.2.3


From 32bfb5c0f495dd88ef6bac4b76885d0820563739 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 19 Feb 2008 20:04:21 -0500
Subject: SUNRPC: Allow the rpc_release() callback to be run on another
 workqueue

A lot of the work done by the rpc_release() callback is inappropriate for
rpciod as it will often involve things like starting a new rpc call in
order to clean up state after an interrupted NFSv4 open() call, or
calls to mntput(), etc.

This patch allows the caller of rpc_run_task() to specify that the
rpc_release callback should run on a different workqueue than the default
rpciod_workqueue.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h |  1 +
 net/sunrpc/sched.c           | 29 +++++++++++++++++++++--------
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index f689f02e679..fefb0ab5218 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -123,6 +123,7 @@ struct rpc_task_setup {
 	const struct rpc_message *rpc_message;
 	const struct rpc_call_ops *callback_ops;
 	void *callback_data;
+	struct workqueue_struct *workqueue;
 	unsigned short flags;
 	signed char priority;
 };
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 4c669121e60..3e0b22382a3 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -326,7 +326,7 @@ static void rpc_make_runnable(struct rpc_task *task)
 		int status;
 
 		INIT_WORK(&task->u.tk_work, rpc_async_schedule);
-		status = queue_work(task->tk_workqueue, &task->u.tk_work);
+		status = queue_work(rpciod_workqueue, &task->u.tk_work);
 		if (status < 0) {
 			printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
 			task->tk_status = status;
@@ -832,7 +832,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 	task->tk_owner = current->tgid;
 
 	/* Initialize workqueue for async tasks */
-	task->tk_workqueue = rpciod_workqueue;
+	task->tk_workqueue = task_setup_data->workqueue;
 
 	task->tk_client = task_setup_data->rpc_client;
 	if (task->tk_client != NULL) {
@@ -868,7 +868,7 @@ rpc_alloc_task(void)
 	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
 }
 
-static void rpc_free_task(struct rcu_head *rcu)
+static void rpc_free_task_rcu(struct rcu_head *rcu)
 {
 	struct rpc_task *task = container_of(rcu, struct rpc_task, u.tk_rcu);
 	dprintk("RPC: %5u freeing task\n", task->tk_pid);
@@ -898,12 +898,23 @@ out:
 	return task;
 }
 
-
-void rpc_put_task(struct rpc_task *task)
+static void rpc_free_task(struct rpc_task *task)
 {
 	const struct rpc_call_ops *tk_ops = task->tk_ops;
 	void *calldata = task->tk_calldata;
 
+	if (task->tk_flags & RPC_TASK_DYNAMIC)
+		call_rcu_bh(&task->u.tk_rcu, rpc_free_task_rcu);
+	rpc_release_calldata(tk_ops, calldata);
+}
+
+static void rpc_async_release(struct work_struct *work)
+{
+	rpc_free_task(container_of(work, struct rpc_task, u.tk_work));
+}
+
+void rpc_put_task(struct rpc_task *task)
+{
 	if (!atomic_dec_and_test(&task->tk_count))
 		return;
 	/* Release resources */
@@ -915,9 +926,11 @@ void rpc_put_task(struct rpc_task *task)
 		rpc_release_client(task->tk_client);
 		task->tk_client = NULL;
 	}
-	if (task->tk_flags & RPC_TASK_DYNAMIC)
-		call_rcu_bh(&task->u.tk_rcu, rpc_free_task);
-	rpc_release_calldata(tk_ops, calldata);
+	if (task->tk_workqueue != NULL) {
+		INIT_WORK(&task->u.tk_work, rpc_async_release);
+		queue_work(task->tk_workqueue, &task->u.tk_work);
+	} else
+		rpc_free_task(task);
 }
 EXPORT_SYMBOL_GPL(rpc_put_task);
 
-- 
cgit v1.2.3


From 5746006f1d17d9d5a3015051ea54de4341cb31f9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 19 Feb 2008 20:04:22 -0500
Subject: NFS: Add an nfsiod workqueue

NFS post-rpciod cleanups often involve tasks that cannot be safely
performed within the rpciod context (due to deadlock concerns). We
therefore add a dedicated NFS workqueue that can perform tasks like
cleaning up state after an interrupted NFSv4 open() call, or calling
put_nfs_open_context() after an asynchronous read or write call.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c    | 37 +++++++++++++++++++++++++++++++++++++
 fs/nfs/internal.h |  1 +
 2 files changed, 38 insertions(+)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a499fb58d85..36cb99985d2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1219,6 +1219,36 @@ static void nfs_destroy_inodecache(void)
 	kmem_cache_destroy(nfs_inode_cachep);
 }
 
+struct workqueue_struct *nfsiod_workqueue;
+
+/*
+ * start up the nfsiod workqueue
+ */
+static int nfsiod_start(void)
+{
+	struct workqueue_struct *wq;
+	dprintk("RPC:       creating workqueue nfsiod\n");
+	wq = create_singlethread_workqueue("nfsiod");
+	if (wq == NULL)
+		return -ENOMEM;
+	nfsiod_workqueue = wq;
+	return 0;
+}
+
+/*
+ * Destroy the nfsiod workqueue
+ */
+static void nfsiod_stop(void)
+{
+	struct workqueue_struct *wq;
+
+	wq = nfsiod_workqueue;
+	if (wq == NULL)
+		return;
+	nfsiod_workqueue = NULL;
+	destroy_workqueue(wq);
+}
+
 /*
  * Initialize NFS
  */
@@ -1226,6 +1256,10 @@ static int __init init_nfs_fs(void)
 {
 	int err;
 
+	err = nfsiod_start();
+	if (err)
+		goto out6;
+
 	err = nfs_fs_proc_init();
 	if (err)
 		goto out5;
@@ -1272,6 +1306,8 @@ out3:
 out4:
 	nfs_fs_proc_exit();
 out5:
+	nfsiod_stop();
+out6:
 	return err;
 }
 
@@ -1287,6 +1323,7 @@ static void __exit exit_nfs_fs(void)
 #endif
 	unregister_nfs_fs();
 	nfs_fs_proc_exit();
+	nfsiod_stop();
 }
 
 /* Not quite true; I just maintain it */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 0f5619611b8..985dc293103 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -143,6 +143,7 @@ extern struct rpc_procinfo nfs4_procedures[];
 extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 
 /* inode.c */
+extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
 extern int nfs_write_inode(struct inode *,int);
-- 
cgit v1.2.3


From 101070ca2fe67186f5f5517b66cb4757b17f4e29 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 19 Feb 2008 20:04:23 -0500
Subject: NFS: Ensure that the asynchronous RPC calls complete on nfsiod.

We want to ensure that rpc_call_ops that involve mntput() are run on nfsiod
rather than on rpciod, so that they don't deadlock when the resulting
umount calls rpc_shutdown_client(). Hence we specify that read, write and
commit calls must complete on nfsiod.
Ditto for NFSv4 open, lock, locku and close asynchronous calls.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c   | 4 ++++
 fs/nfs/nfs4proc.c | 6 ++++++
 fs/nfs/read.c     | 1 +
 fs/nfs/write.c    | 2 ++
 4 files changed, 13 insertions(+)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e0170407a88..e44200579c8 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -280,6 +280,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		.rpc_client = NFS_CLIENT(inode),
 		.rpc_message = &msg,
 		.callback_ops = &nfs_read_direct_ops,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
 	unsigned int pgbase;
@@ -446,6 +447,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(inode),
 		.callback_ops = &nfs_write_direct_ops,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
 
@@ -537,6 +539,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 		.rpc_message = &msg,
 		.callback_ops = &nfs_commit_direct_ops,
 		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
 
@@ -683,6 +686,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		.rpc_client = NFS_CLIENT(inode),
 		.rpc_message = &msg,
 		.callback_ops = &nfs_write_direct_ops,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
 	size_t wsize = NFS_SERVER(inode)->wsize;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7ce07862c2f..b6db833f33e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -51,6 +51,7 @@
 
 #include "nfs4_fs.h"
 #include "delegation.h"
+#include "internal.h"
 #include "iostat.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
@@ -773,6 +774,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_open_confirm_ops,
 		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
 	int status;
@@ -910,6 +912,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_open_ops,
 		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
 	int status;
@@ -1315,6 +1318,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 		.rpc_client = server->client,
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_close_ops,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
 	int status = -ENOMEM;
@@ -3235,6 +3239,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 		.rpc_client = NFS_CLIENT(lsp->ls_state->inode),
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_locku_ops,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
 
@@ -3419,6 +3424,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		.rpc_client = NFS_CLIENT(state->inode),
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_lock_ops,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
 	int ret;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index fab0d3720a0..87546cd277d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -177,6 +177,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC | swap_flags,
 	};
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 69b4158d9a1..84495ef9072 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -803,6 +803,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
 		.flags = flags,
 		.priority = priority,
 	};
@@ -1187,6 +1188,7 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 		.rpc_message = &msg,
 		.callback_ops = &nfs_commit_ops,
 		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
 		.flags = flags,
 		.priority = priority,
 	};
-- 
cgit v1.2.3


From fde95c7554aa77f9a242f32b0b5f8f15395abf52 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 22 Feb 2008 15:09:26 -0500
Subject: SUNRPC: Clean up rpc_run_timer()

All RPC timeout callback functions are expected to wake the task up. We can
enforce this by moving the wakeup back into rpc_run_timer.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/sched.c | 67 ++++++++++++++++++++++++++++--------------------------
 net/sunrpc/xprt.c  |  2 --
 2 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 3e0b22382a3..9433a113862 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -38,7 +38,6 @@ static struct kmem_cache	*rpc_buffer_slabp __read_mostly;
 static mempool_t	*rpc_task_mempool __read_mostly;
 static mempool_t	*rpc_buffer_mempool __read_mostly;
 
-static void			__rpc_default_timer(struct rpc_task *task);
 static void			rpc_async_schedule(struct work_struct *);
 static void			 rpc_release_task(struct rpc_task *task);
 
@@ -66,25 +65,13 @@ __rpc_disable_timer(struct rpc_task *task)
 }
 
 /*
- * Run a timeout function.
- * We use the callback in order to allow __rpc_wake_up_task()
- * and friends to disable the timer synchronously on SMP systems
- * without calling del_timer_sync(). The latter could cause a
- * deadlock if called while we're holding spinlocks...
+ * Default timeout handler if none specified by user
  */
-static void rpc_run_timer(struct rpc_task *task)
+static void
+__rpc_default_timer(struct rpc_task *task)
 {
-	void (*callback)(struct rpc_task *);
-
-	callback = task->tk_timeout_fn;
-	task->tk_timeout_fn = NULL;
-	if (callback && RPC_IS_QUEUED(task)) {
-		dprintk("RPC: %5u running timer\n", task->tk_pid);
-		callback(task);
-	}
-	smp_mb__before_clear_bit();
-	clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate);
-	smp_mb__after_clear_bit();
+	dprintk("RPC: %5u timeout (default timer)\n", task->tk_pid);
+	task->tk_status = -ETIMEDOUT;
 }
 
 /*
@@ -415,17 +402,6 @@ static void __rpc_wake_up_task(struct rpc_task *task)
 	}
 }
 
-/*
- * Default timeout handler if none specified by user
- */
-static void
-__rpc_default_timer(struct rpc_task *task)
-{
-	dprintk("RPC: %5u timeout (default timer)\n", task->tk_pid);
-	task->tk_status = -ETIMEDOUT;
-	rpc_wake_up_task(task);
-}
-
 /*
  * Wake up the specified task
  */
@@ -578,9 +554,37 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_status);
 
+/*
+ * Run a timeout function.
+ */
+static void rpc_run_timer(unsigned long ptr)
+{
+	struct rpc_task *task = (struct rpc_task *)ptr;
+	void (*callback)(struct rpc_task *);
+
+	if (!rpc_start_wakeup(task))
+		goto out;
+	if (RPC_IS_QUEUED(task)) {
+		struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq;
+		callback = task->tk_timeout_fn;
+
+		dprintk("RPC: %5u running timer\n", task->tk_pid);
+		if (callback != NULL)
+			callback(task);
+		/* Note: we're already in a bh-safe context */
+		spin_lock(&queue->lock);
+		__rpc_do_wake_up_task(task);
+		spin_unlock(&queue->lock);
+	}
+	rpc_finish_wakeup(task);
+out:
+	smp_mb__before_clear_bit();
+	clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate);
+	smp_mb__after_clear_bit();
+}
+
 static void __rpc_atrun(struct rpc_task *task)
 {
-	rpc_wake_up_task(task);
 }
 
 /*
@@ -816,8 +820,7 @@ EXPORT_SYMBOL_GPL(rpc_free);
 static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data)
 {
 	memset(task, 0, sizeof(*task));
-	setup_timer(&task->tk_timer, (void (*)(unsigned long))rpc_run_timer,
-			(unsigned long)task);
+	setup_timer(&task->tk_timer, rpc_run_timer, (unsigned long)task);
 	atomic_set(&task->tk_count, 1);
 	task->tk_flags  = task_setup_data->flags;
 	task->tk_ops = task_setup_data->callback_ops;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index d5553b8179f..96c212ddc41 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -777,8 +777,6 @@ static void xprt_timer(struct rpc_task *task)
 			xprt->ops->timer(task);
 		task->tk_status = -ETIMEDOUT;
 	}
-	task->tk_timeout = 0;
-	rpc_wake_up_task(task);
 	spin_unlock(&xprt->transport_lock);
 }
 
-- 
cgit v1.2.3


From 96ef13b283934fbf60b732e6c4ce23e8babd0042 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 22 Feb 2008 15:46:41 -0500
Subject: SUNRPC: Add a new helper rpc_wake_up_queued_task()

In all cases where we currently use rpc_wake_up_task(), we almost always
know on which waitqueue the rpc_task is actually sleeping. This will allows
us to simplify the queue locking in a future patch.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h |  4 ++-
 net/sunrpc/clnt.c            |  2 +-
 net/sunrpc/sched.c           | 64 +++++++++++++++++++++-----------------------
 3 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index fefb0ab5218..83f67779cf0 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -33,7 +33,6 @@ struct rpc_wait_queue;
 struct rpc_wait {
 	struct list_head	list;		/* wait queue links */
 	struct list_head	links;		/* Links to related tasks */
-	struct rpc_wait_queue *	rpc_waitq;	/* RPC wait queue we're on */
 };
 
 /*
@@ -80,6 +79,7 @@ struct rpc_task {
 	struct workqueue_struct	*tk_workqueue;	/* Normally rpciod, but could
 						 * be any workqueue
 						 */
+	struct rpc_wait_queue 	*tk_waitqueue;	/* RPC wait queue we're on */
 	union {
 		struct work_struct	tk_work;	/* Async task work queue */
 		struct rpc_wait		tk_wait;	/* RPC wait */
@@ -233,6 +233,8 @@ void		rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
 void		rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *,
 					rpc_action action, rpc_action timer);
 void		rpc_wake_up_task(struct rpc_task *);
+void		rpc_wake_up_queued_task(struct rpc_wait_queue *,
+					struct rpc_task *);
 void		rpc_wake_up(struct rpc_wait_queue *);
 struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
 void		rpc_wake_up_status(struct rpc_wait_queue *, int);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 8c6a7f1a25e..fe95bd0ab1e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1535,7 +1535,7 @@ void rpc_show_tasks(void)
 				proc = -1;
 
 			if (RPC_IS_QUEUED(t))
-				rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
+				rpc_waitq = rpc_qname(t->tk_waitqueue);
 
 			printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n",
 				t->tk_pid, proc,
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9433a113862..9233ace076a 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -148,7 +148,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *
 		list_add(&task->u.tk_wait.list, &queue->tasks[0]);
 	else
 		list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
-	task->u.tk_wait.rpc_waitq = queue;
+	task->tk_waitqueue = queue;
 	queue->qlen++;
 	rpc_set_queued(task);
 
@@ -175,11 +175,8 @@ static void __rpc_remove_wait_queue_priority(struct rpc_task *task)
  * Remove request from queue.
  * Note: must be called with spin lock held.
  */
-static void __rpc_remove_wait_queue(struct rpc_task *task)
+static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
-	struct rpc_wait_queue *queue;
-	queue = task->u.tk_wait.rpc_waitq;
-
 	if (RPC_IS_PRIORITY(queue))
 		__rpc_remove_wait_queue_priority(task);
 	else
@@ -364,11 +361,12 @@ EXPORT_SYMBOL_GPL(rpc_sleep_on);
 
 /**
  * __rpc_do_wake_up_task - wake up a single rpc_task
+ * @queue: wait queue
  * @task: task to be woken up
  *
  * Caller must hold queue->lock, and have cleared the task queued flag.
  */
-static void __rpc_do_wake_up_task(struct rpc_task *task)
+static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
 	dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
 			task->tk_pid, jiffies);
@@ -383,7 +381,7 @@ static void __rpc_do_wake_up_task(struct rpc_task *task)
 	}
 
 	__rpc_disable_timer(task);
-	__rpc_remove_wait_queue(task);
+	__rpc_remove_wait_queue(queue, task);
 
 	rpc_make_runnable(task);
 
@@ -391,36 +389,38 @@ static void __rpc_do_wake_up_task(struct rpc_task *task)
 }
 
 /*
- * Wake up the specified task
+ * Wake up a queued task while the queue lock is being held
  */
-static void __rpc_wake_up_task(struct rpc_task *task)
+static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
+	if (!RPC_IS_QUEUED(task) || task->tk_waitqueue != queue)
+		return;
 	if (rpc_start_wakeup(task)) {
-		if (RPC_IS_QUEUED(task))
-			__rpc_do_wake_up_task(task);
+			__rpc_do_wake_up_task(queue, task);
 		rpc_finish_wakeup(task);
 	}
 }
 
 /*
- * Wake up the specified task
+ * Wake up a task on a specific queue
  */
-void rpc_wake_up_task(struct rpc_task *task)
+void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
 	rcu_read_lock_bh();
-	if (rpc_start_wakeup(task)) {
-		if (RPC_IS_QUEUED(task)) {
-			struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq;
-
-			/* Note: we're already in a bh-safe context */
-			spin_lock(&queue->lock);
-			__rpc_do_wake_up_task(task);
-			spin_unlock(&queue->lock);
-		}
-		rpc_finish_wakeup(task);
-	}
+	spin_lock(&queue->lock);
+	rpc_wake_up_task_queue_locked(queue, task);
+	spin_unlock(&queue->lock);
 	rcu_read_unlock_bh();
 }
+EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
+
+/*
+ * Wake up the specified task
+ */
+void rpc_wake_up_task(struct rpc_task *task)
+{
+	rpc_wake_up_queued_task(task->tk_waitqueue, task);
+}
 EXPORT_SYMBOL_GPL(rpc_wake_up_task);
 
 /*
@@ -471,7 +471,7 @@ new_queue:
 new_owner:
 	rpc_set_waitqueue_owner(queue, task->tk_owner);
 out:
-	__rpc_wake_up_task(task);
+	rpc_wake_up_task_queue_locked(queue, task);
 	return task;
 }
 
@@ -490,7 +490,7 @@ struct rpc_task * rpc_wake_up_next(struct rpc_wait_queue *queue)
 		task = __rpc_wake_up_next_priority(queue);
 	else {
 		task_for_first(task, &queue->tasks[0])
-			__rpc_wake_up_task(task);
+			rpc_wake_up_task_queue_locked(queue, task);
 	}
 	spin_unlock(&queue->lock);
 	rcu_read_unlock_bh();
@@ -515,7 +515,7 @@ void rpc_wake_up(struct rpc_wait_queue *queue)
 	head = &queue->tasks[queue->maxpriority];
 	for (;;) {
 		list_for_each_entry_safe(task, next, head, u.tk_wait.list)
-			__rpc_wake_up_task(task);
+			rpc_wake_up_task_queue_locked(queue, task);
 		if (head == &queue->tasks[0])
 			break;
 		head--;
@@ -543,7 +543,7 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
 	for (;;) {
 		list_for_each_entry_safe(task, next, head, u.tk_wait.list) {
 			task->tk_status = status;
-			__rpc_wake_up_task(task);
+			rpc_wake_up_task_queue_locked(queue, task);
 		}
 		if (head == &queue->tasks[0])
 			break;
@@ -562,10 +562,8 @@ static void rpc_run_timer(unsigned long ptr)
 	struct rpc_task *task = (struct rpc_task *)ptr;
 	void (*callback)(struct rpc_task *);
 
-	if (!rpc_start_wakeup(task))
-		goto out;
 	if (RPC_IS_QUEUED(task)) {
-		struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq;
+		struct rpc_wait_queue *queue = task->tk_waitqueue;
 		callback = task->tk_timeout_fn;
 
 		dprintk("RPC: %5u running timer\n", task->tk_pid);
@@ -573,11 +571,9 @@ static void rpc_run_timer(unsigned long ptr)
 			callback(task);
 		/* Note: we're already in a bh-safe context */
 		spin_lock(&queue->lock);
-		__rpc_do_wake_up_task(task);
+		rpc_wake_up_task_queue_locked(queue, task);
 		spin_unlock(&queue->lock);
 	}
-	rpc_finish_wakeup(task);
-out:
 	smp_mb__before_clear_bit();
 	clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate);
 	smp_mb__after_clear_bit();
-- 
cgit v1.2.3


From fda1393938035559b417dd5b26b9cc293a7aee00 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 22 Feb 2008 16:34:12 -0500
Subject: SUNRPC: Convert users of rpc_wake_up_task to use
 rpc_wake_up_queued_task

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c            |  2 +-
 include/linux/sunrpc/sched.h |  1 -
 net/sunrpc/clnt.c            |  2 +-
 net/sunrpc/sched.c           |  3 +--
 net/sunrpc/xprt.c            | 10 +++++-----
 5 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b6db833f33e..54743396b66 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2768,7 +2768,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
 			rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL);
 			nfs4_schedule_state_recovery(clp);
 			if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0)
-				rpc_wake_up_task(task);
+				rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
 			task->tk_status = 0;
 			return -EAGAIN;
 		case -NFS4ERR_DELAY:
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 83f67779cf0..7963ef0ffb8 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -232,7 +232,6 @@ void		rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
 void		rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
 void		rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *,
 					rpc_action action, rpc_action timer);
-void		rpc_wake_up_task(struct rpc_task *);
 void		rpc_wake_up_queued_task(struct rpc_wait_queue *,
 					struct rpc_task *);
 void		rpc_wake_up(struct rpc_wait_queue *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index fe95bd0ab1e..0e209af3979 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1066,7 +1066,7 @@ call_transmit(struct rpc_task *task)
 	if (task->tk_msg.rpc_proc->p_decode != NULL)
 		return;
 	task->tk_action = rpc_exit_task;
-	rpc_wake_up_task(task);
+	rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
 }
 
 /*
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9233ace076a..35acdc39bfc 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -417,11 +417,10 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
 /*
  * Wake up the specified task
  */
-void rpc_wake_up_task(struct rpc_task *task)
+static void rpc_wake_up_task(struct rpc_task *task)
 {
 	rpc_wake_up_queued_task(task->tk_waitqueue, task);
 }
-EXPORT_SYMBOL_GPL(rpc_wake_up_task);
 
 /*
  * Wake up the next task on a priority queue.
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 96c212ddc41..6e2772217e5 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -472,7 +472,7 @@ void xprt_write_space(struct rpc_xprt *xprt)
 	if (xprt->snd_task) {
 		dprintk("RPC:       write space: waking waiting task on "
 				"xprt %p\n", xprt);
-		rpc_wake_up_task(xprt->snd_task);
+		rpc_wake_up_queued_task(&xprt->pending, xprt->snd_task);
 	}
 	spin_unlock_bh(&xprt->transport_lock);
 }
@@ -602,8 +602,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
 	/* Try to schedule an autoclose RPC call */
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
 		queue_work(rpciod_workqueue, &xprt->task_cleanup);
-	else if (xprt->snd_task != NULL)
-		rpc_wake_up_task(xprt->snd_task);
+	xprt_wake_pending_tasks(xprt, -ENOTCONN);
 	spin_unlock_bh(&xprt->transport_lock);
 }
 EXPORT_SYMBOL_GPL(xprt_force_disconnect);
@@ -749,18 +748,19 @@ EXPORT_SYMBOL_GPL(xprt_update_rtt);
 void xprt_complete_rqst(struct rpc_task *task, int copied)
 {
 	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = req->rq_xprt;
 
 	dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
 			task->tk_pid, ntohl(req->rq_xid), copied);
 
-	task->tk_xprt->stat.recvs++;
+	xprt->stat.recvs++;
 	task->tk_rtt = (long)jiffies - req->rq_xtime;
 
 	list_del_init(&req->rq_list);
 	/* Ensure all writes are done before we update req->rq_received */
 	smp_wmb();
 	req->rq_received = req->rq_private_buf.len = copied;
-	rpc_wake_up_task(task);
+	rpc_wake_up_queued_task(&xprt->pending, task);
 }
 EXPORT_SYMBOL_GPL(xprt_complete_rqst);
 
-- 
cgit v1.2.3


From 5d00837b90340af9106dcd93af75fd664c8eb87f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 22 Feb 2008 16:34:17 -0500
Subject: SUNRPC: Run rpc timeout functions as callbacks instead of in softirqs

An audit of the current RPC timeout functions shows that they don't really
ever need to run in the softirq context. As long as the softirq is
able to signal that the wakeup is due to a timeout (which it can do by
setting task->tk_status to -ETIMEDOUT) then the callback functions can just
run as standard task->tk_callback functions (in the rpciod/process
context).

The only possible border-line case would be xprt_timer() for the case of
UDP, when the callback is used to reduce the size of the transport
congestion window. In testing, however, the effect of moving that update
to a callback would appear to be minor.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c              |  2 +-
 fs/nfs/nfs4state.c             |  2 +-
 include/linux/sunrpc/sched.h   |  4 +---
 net/sunrpc/auth_gss/auth_gss.c |  4 ++--
 net/sunrpc/rpcb_clnt.c         |  2 +-
 net/sunrpc/sched.c             | 50 +++++++++++++-----------------------------
 net/sunrpc/xprt.c              | 28 ++++++++++++-----------
 7 files changed, 36 insertions(+), 56 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 54743396b66..bbb0d58ee6a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2765,7 +2765,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_EXPIRED:
-			rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL);
+			rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
 			nfs4_schedule_state_recovery(clp);
 			if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0)
 				rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index b962397004c..a2ef02824aa 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -731,7 +731,7 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
 		list_add_tail(&seqid->list, &sequence->list);
 	if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
 		goto unlock;
-	rpc_sleep_on(&sequence->wait, task, NULL, NULL);
+	rpc_sleep_on(&sequence->wait, task, NULL);
 	status = -EAGAIN;
 unlock:
 	spin_unlock(&sequence->lock);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 7963ef0ffb8..503a937bdca 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -56,12 +56,10 @@ struct rpc_task {
 	__u8			tk_cred_retry;
 
 	/*
-	 * timeout_fn   to be executed by timer bottom half
 	 * callback	to be executed after waking up
 	 * action	next procedure for async tasks
 	 * tk_ops	caller callbacks
 	 */
-	void			(*tk_timeout_fn)(struct rpc_task *);
 	void			(*tk_callback)(struct rpc_task *);
 	void			(*tk_action)(struct rpc_task *);
 	const struct rpc_call_ops *tk_ops;
@@ -231,7 +229,7 @@ void		rpc_execute(struct rpc_task *);
 void		rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
 void		rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
 void		rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *,
-					rpc_action action, rpc_action timer);
+					rpc_action action);
 void		rpc_wake_up_queued_task(struct rpc_wait_queue *,
 					struct rpc_task *);
 void		rpc_wake_up(struct rpc_wait_queue *);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 6dac3879228..dc6391bcda1 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -408,13 +408,13 @@ gss_refresh_upcall(struct rpc_task *task)
 	}
 	spin_lock(&inode->i_lock);
 	if (gss_cred->gc_upcall != NULL)
-		rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL, NULL);
+		rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL);
 	else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) {
 		task->tk_timeout = 0;
 		gss_cred->gc_upcall = gss_msg;
 		/* gss_upcall_callback will release the reference to gss_upcall_msg */
 		atomic_inc(&gss_msg->count);
-		rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback, NULL);
+		rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback);
 	} else
 		err = gss_msg->msg.errno;
 	spin_unlock(&inode->i_lock);
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 3164a0871cf..f480c718b40 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -298,7 +298,7 @@ void rpcb_getport_async(struct rpc_task *task)
 
 	/* Put self on queue before sending rpcbind request, in case
 	 * rpcb_getport_done completes before we return from rpc_run_task */
-	rpc_sleep_on(&xprt->binding, task, NULL, NULL);
+	rpc_sleep_on(&xprt->binding, task, NULL);
 
 	/* Someone else may have bound if we slept */
 	if (xprt_bound(xprt)) {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 35acdc39bfc..caf12fd6b6a 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -56,29 +56,18 @@ struct workqueue_struct *rpciod_workqueue;
  * queue->lock and bh_disabled in order to avoid races within
  * rpc_run_timer().
  */
-static inline void
+static void
 __rpc_disable_timer(struct rpc_task *task)
 {
 	dprintk("RPC: %5u disabling timer\n", task->tk_pid);
-	task->tk_timeout_fn = NULL;
 	task->tk_timeout = 0;
 }
 
-/*
- * Default timeout handler if none specified by user
- */
-static void
-__rpc_default_timer(struct rpc_task *task)
-{
-	dprintk("RPC: %5u timeout (default timer)\n", task->tk_pid);
-	task->tk_status = -ETIMEDOUT;
-}
-
 /*
  * Set up a timer for the current task.
  */
-static inline void
-__rpc_add_timer(struct rpc_task *task, rpc_action timer)
+static void
+__rpc_add_timer(struct rpc_task *task)
 {
 	if (!task->tk_timeout)
 		return;
@@ -86,10 +75,6 @@ __rpc_add_timer(struct rpc_task *task, rpc_action timer)
 	dprintk("RPC: %5u setting alarm for %lu ms\n",
 			task->tk_pid, task->tk_timeout * 1000 / HZ);
 
-	if (timer)
-		task->tk_timeout_fn = timer;
-	else
-		task->tk_timeout_fn = __rpc_default_timer;
 	set_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate);
 	mod_timer(&task->tk_timer, jiffies + task->tk_timeout);
 }
@@ -297,7 +282,6 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
  */
 static void rpc_make_runnable(struct rpc_task *task)
 {
-	BUG_ON(task->tk_timeout_fn);
 	rpc_clear_queued(task);
 	if (rpc_test_and_set_running(task))
 		return;
@@ -327,7 +311,7 @@ static void rpc_make_runnable(struct rpc_task *task)
  * as it's on a wait queue.
  */
 static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
-			rpc_action action, rpc_action timer)
+			rpc_action action)
 {
 	dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
 			task->tk_pid, rpc_qname(q), jiffies);
@@ -341,11 +325,11 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
 
 	BUG_ON(task->tk_callback != NULL);
 	task->tk_callback = action;
-	__rpc_add_timer(task, timer);
+	__rpc_add_timer(task);
 }
 
 void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
-				rpc_action action, rpc_action timer)
+				rpc_action action)
 {
 	/* Mark the task as being activated if so needed */
 	rpc_set_active(task);
@@ -354,7 +338,7 @@ void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
 	 * Protect the queue operations.
 	 */
 	spin_lock_bh(&q->lock);
-	__rpc_sleep_on(q, task, action, timer);
+	__rpc_sleep_on(q, task, action);
 	spin_unlock_bh(&q->lock);
 }
 EXPORT_SYMBOL_GPL(rpc_sleep_on);
@@ -559,20 +543,15 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_status);
 static void rpc_run_timer(unsigned long ptr)
 {
 	struct rpc_task *task = (struct rpc_task *)ptr;
-	void (*callback)(struct rpc_task *);
+	struct rpc_wait_queue *queue = task->tk_waitqueue;
 
-	if (RPC_IS_QUEUED(task)) {
-		struct rpc_wait_queue *queue = task->tk_waitqueue;
-		callback = task->tk_timeout_fn;
-
-		dprintk("RPC: %5u running timer\n", task->tk_pid);
-		if (callback != NULL)
-			callback(task);
-		/* Note: we're already in a bh-safe context */
-		spin_lock(&queue->lock);
+	spin_lock(&queue->lock);
+	if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue) {
+		dprintk("RPC: %5u timeout\n", task->tk_pid);
+		task->tk_status = -ETIMEDOUT;
 		rpc_wake_up_task_queue_locked(queue, task);
-		spin_unlock(&queue->lock);
 	}
+	spin_unlock(&queue->lock);
 	smp_mb__before_clear_bit();
 	clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate);
 	smp_mb__after_clear_bit();
@@ -580,6 +559,7 @@ static void rpc_run_timer(unsigned long ptr)
 
 static void __rpc_atrun(struct rpc_task *task)
 {
+	task->tk_status = 0;
 }
 
 /*
@@ -588,7 +568,7 @@ static void __rpc_atrun(struct rpc_task *task)
 void rpc_delay(struct rpc_task *task, unsigned long delay)
 {
 	task->tk_timeout = delay;
-	rpc_sleep_on(&delay_queue, task, NULL, __rpc_atrun);
+	rpc_sleep_on(&delay_queue, task, __rpc_atrun);
 }
 EXPORT_SYMBOL_GPL(rpc_delay);
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 6e2772217e5..9bf118c5431 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -188,9 +188,9 @@ out_sleep:
 	task->tk_timeout = 0;
 	task->tk_status = -EAGAIN;
 	if (req && req->rq_ntrans)
-		rpc_sleep_on(&xprt->resend, task, NULL, NULL);
+		rpc_sleep_on(&xprt->resend, task, NULL);
 	else
-		rpc_sleep_on(&xprt->sending, task, NULL, NULL);
+		rpc_sleep_on(&xprt->sending, task, NULL);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
@@ -238,9 +238,9 @@ out_sleep:
 	task->tk_timeout = 0;
 	task->tk_status = -EAGAIN;
 	if (req && req->rq_ntrans)
-		rpc_sleep_on(&xprt->resend, task, NULL, NULL);
+		rpc_sleep_on(&xprt->resend, task, NULL);
 	else
-		rpc_sleep_on(&xprt->sending, task, NULL, NULL);
+		rpc_sleep_on(&xprt->sending, task, NULL);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
@@ -453,7 +453,7 @@ void xprt_wait_for_buffer_space(struct rpc_task *task)
 	struct rpc_xprt *xprt = req->rq_xprt;
 
 	task->tk_timeout = req->rq_timeout;
-	rpc_sleep_on(&xprt->pending, task, NULL, NULL);
+	rpc_sleep_on(&xprt->pending, task, NULL);
 }
 EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
 
@@ -652,7 +652,7 @@ void xprt_connect(struct rpc_task *task)
 			task->tk_rqstp->rq_bytes_sent = 0;
 
 		task->tk_timeout = xprt->connect_timeout;
-		rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL);
+		rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
 		xprt->stat.connect_start = jiffies;
 		xprt->ops->connect(task);
 	}
@@ -769,15 +769,17 @@ static void xprt_timer(struct rpc_task *task)
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = req->rq_xprt;
 
+	if (task->tk_status != -ETIMEDOUT)
+		return;
 	dprintk("RPC: %5u xprt_timer\n", task->tk_pid);
 
-	spin_lock(&xprt->transport_lock);
+	spin_lock_bh(&xprt->transport_lock);
 	if (!req->rq_received) {
 		if (xprt->ops->timer)
 			xprt->ops->timer(task);
-		task->tk_status = -ETIMEDOUT;
-	}
-	spin_unlock(&xprt->transport_lock);
+	} else
+		task->tk_status = 0;
+	spin_unlock_bh(&xprt->transport_lock);
 }
 
 /**
@@ -862,7 +864,7 @@ void xprt_transmit(struct rpc_task *task)
 		if (!xprt_connected(xprt))
 			task->tk_status = -ENOTCONN;
 		else if (!req->rq_received)
-			rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer);
+			rpc_sleep_on(&xprt->pending, task, xprt_timer);
 		spin_unlock_bh(&xprt->transport_lock);
 		return;
 	}
@@ -873,7 +875,7 @@ void xprt_transmit(struct rpc_task *task)
 	 */
 	task->tk_status = status;
 	if (status == -ECONNREFUSED)
-		rpc_sleep_on(&xprt->sending, task, NULL, NULL);
+		rpc_sleep_on(&xprt->sending, task, NULL);
 }
 
 static inline void do_xprt_reserve(struct rpc_task *task)
@@ -893,7 +895,7 @@ static inline void do_xprt_reserve(struct rpc_task *task)
 	dprintk("RPC:       waiting for request slot\n");
 	task->tk_status = -EAGAIN;
 	task->tk_timeout = 0;
-	rpc_sleep_on(&xprt->backlog, task, NULL, NULL);
+	rpc_sleep_on(&xprt->backlog, task, NULL);
 }
 
 /**
-- 
cgit v1.2.3


From f6a1cc89309f0ae847a9b6fe418d1c4215e5bc55 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 22 Feb 2008 17:06:55 -0500
Subject: SUNRPC: Add a (empty for the moment) destructor for rpc_wait_queues

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c                |  2 ++
 fs/nfs/nfs4state.c             |  6 +++++-
 include/linux/sunrpc/sched.h   |  1 +
 net/sunrpc/auth_gss/auth_gss.c |  1 +
 net/sunrpc/sched.c             | 21 ++++++++++++++-------
 net/sunrpc/xprt.c              |  5 +++++
 6 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c5c0175898f..06f064d8fbb 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -170,6 +170,8 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 	BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners));
 	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
 		nfs_idmap_delete(clp);
+
+	rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
 #endif
 }
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a2ef02824aa..7775435ea7a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -292,8 +292,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	spin_unlock(&clp->cl_lock);
 	if (sp == new)
 		get_rpccred(cred);
-	else
+	else {
+		rpc_destroy_wait_queue(&new->so_sequence.wait);
 		kfree(new);
+	}
 	return sp;
 }
 
@@ -310,6 +312,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 		return;
 	nfs4_remove_state_owner(clp, sp);
 	spin_unlock(&clp->cl_lock);
+	rpc_destroy_wait_queue(&sp->so_sequence.wait);
 	put_rpccred(cred);
 	kfree(sp);
 }
@@ -529,6 +532,7 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
 	spin_lock(&clp->cl_lock);
 	nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
 	spin_unlock(&clp->cl_lock);
+	rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
 	kfree(lsp);
 }
 
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 503a937bdca..d39729e2b89 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -228,6 +228,7 @@ void		rpc_killall_tasks(struct rpc_clnt *);
 void		rpc_execute(struct rpc_task *);
 void		rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
 void		rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
+void		rpc_destroy_wait_queue(struct rpc_wait_queue *);
 void		rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *,
 					rpc_action action);
 void		rpc_wake_up_queued_task(struct rpc_wait_queue *,
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index dc6391bcda1..ef638496180 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -266,6 +266,7 @@ gss_release_msg(struct gss_upcall_msg *gss_msg)
 	BUG_ON(!list_empty(&gss_msg->list));
 	if (gss_msg->ctx != NULL)
 		gss_put_ctx(gss_msg->ctx);
+	rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue);
 	kfree(gss_msg);
 }
 
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index caf12fd6b6a..86aa897e7b0 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -214,6 +214,11 @@ void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname)
 }
 EXPORT_SYMBOL_GPL(rpc_init_wait_queue);
 
+void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
+{
+}
+EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
+
 static int rpc_wait_bit_killable(void *word)
 {
 	if (fatal_signal_pending(current))
@@ -1020,11 +1025,20 @@ rpc_destroy_mempool(void)
 		kmem_cache_destroy(rpc_task_slabp);
 	if (rpc_buffer_slabp)
 		kmem_cache_destroy(rpc_buffer_slabp);
+	rpc_destroy_wait_queue(&delay_queue);
 }
 
 int
 rpc_init_mempool(void)
 {
+	/*
+	 * The following is not strictly a mempool initialisation,
+	 * but there is no harm in doing it here
+	 */
+	rpc_init_wait_queue(&delay_queue, "delayq");
+	if (!rpciod_start())
+		goto err_nomem;
+
 	rpc_task_slabp = kmem_cache_create("rpc_tasks",
 					     sizeof(struct rpc_task),
 					     0, SLAB_HWCACHE_ALIGN,
@@ -1045,13 +1059,6 @@ rpc_init_mempool(void)
 						      rpc_buffer_slabp);
 	if (!rpc_buffer_mempool)
 		goto err_nomem;
-	if (!rpciod_start())
-		goto err_nomem;
-	/*
-	 * The following is not strictly a mempool initialisation,
-	 * but there is no harm in doing it here
-	 */
-	rpc_init_wait_queue(&delay_queue, "delayq");
 	return 0;
 err_nomem:
 	rpc_destroy_mempool();
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 9bf118c5431..85199c64702 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1052,6 +1052,11 @@ static void xprt_destroy(struct kref *kref)
 	xprt->shutdown = 1;
 	del_timer_sync(&xprt->timer);
 
+	rpc_destroy_wait_queue(&xprt->binding);
+	rpc_destroy_wait_queue(&xprt->pending);
+	rpc_destroy_wait_queue(&xprt->sending);
+	rpc_destroy_wait_queue(&xprt->resend);
+	rpc_destroy_wait_queue(&xprt->backlog);
 	/*
 	 * Tear down transport state and free the rpc_xprt
 	 */
-- 
cgit v1.2.3


From 36df9aae3158ce8fc4ede241169dc94ac910d884 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 18 Jul 2007 16:18:52 -0400
Subject: SUNRPC: Add a timer function to wait queues.

This is designed to replace the timeout timer in the individual rpc_tasks.
By putting the timer function in the wait queue, we will eventually be able
to reduce the total number of timers in use by the RPC subsystem.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h |  9 +++++++++
 net/sunrpc/sched.c           | 46 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index d39729e2b89..7751d3a0549 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -33,6 +33,8 @@ struct rpc_wait_queue;
 struct rpc_wait {
 	struct list_head	list;		/* wait queue links */
 	struct list_head	links;		/* Links to related tasks */
+	struct list_head	timer_list;	/* Timer list */
+	unsigned long		expires;
 };
 
 /*
@@ -191,6 +193,12 @@ struct rpc_task_setup {
 #define RPC_PRIORITY_HIGH	(1)
 #define RPC_NR_PRIORITY		(1 + RPC_PRIORITY_HIGH - RPC_PRIORITY_LOW)
 
+struct rpc_timer {
+	struct timer_list timer;
+	struct list_head list;
+	unsigned long expires;
+};
+
 /*
  * RPC synchronization objects
  */
@@ -203,6 +211,7 @@ struct rpc_wait_queue {
 	unsigned char		count;			/* # task groups remaining serviced so far */
 	unsigned char		nr;			/* # tasks remaining for cookie */
 	unsigned short		qlen;			/* total # tasks waiting in queue */
+	struct rpc_timer	timer_list;
 #ifdef RPC_DEBUG
 	const char *		name;
 #endif
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 86aa897e7b0..29b1c1441f4 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -40,6 +40,7 @@ static mempool_t	*rpc_buffer_mempool __read_mostly;
 
 static void			rpc_async_schedule(struct work_struct *);
 static void			 rpc_release_task(struct rpc_task *task);
+static void __rpc_queue_timer_fn(unsigned long ptr);
 
 /*
  * RPC tasks sit here while waiting for conditions to improve.
@@ -59,8 +60,18 @@ struct workqueue_struct *rpciod_workqueue;
 static void
 __rpc_disable_timer(struct rpc_task *task)
 {
+	if (task->tk_timeout == 0)
+		return;
 	dprintk("RPC: %5u disabling timer\n", task->tk_pid);
 	task->tk_timeout = 0;
+	list_del(&task->u.tk_wait.timer_list);
+}
+
+static void
+rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires)
+{
+	queue->timer_list.expires = expires;
+	mod_timer(&queue->timer_list.timer, expires);
 }
 
 /*
@@ -153,7 +164,6 @@ static void __rpc_remove_wait_queue_priority(struct rpc_task *task)
 		list_move(&t->u.tk_wait.list, &task->u.tk_wait.list);
 		list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links);
 	}
-	list_del(&task->u.tk_wait.list);
 }
 
 /*
@@ -162,10 +172,10 @@ static void __rpc_remove_wait_queue_priority(struct rpc_task *task)
  */
 static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
+	__rpc_disable_timer(task);
 	if (RPC_IS_PRIORITY(queue))
 		__rpc_remove_wait_queue_priority(task);
-	else
-		list_del(&task->u.tk_wait.list);
+	list_del(&task->u.tk_wait.list);
 	queue->qlen--;
 	dprintk("RPC: %5u removed from queue %p \"%s\"\n",
 			task->tk_pid, queue, rpc_qname(queue));
@@ -198,6 +208,9 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c
 		INIT_LIST_HEAD(&queue->tasks[i]);
 	queue->maxpriority = nr_queues - 1;
 	rpc_reset_waitqueue_priority(queue);
+	queue->qlen = 0;
+	setup_timer(&queue->timer_list.timer, __rpc_queue_timer_fn, (unsigned long)queue);
+	INIT_LIST_HEAD(&queue->timer_list.list);
 #ifdef RPC_DEBUG
 	queue->name = qname;
 #endif
@@ -216,6 +229,7 @@ EXPORT_SYMBOL_GPL(rpc_init_wait_queue);
 
 void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
 {
+	del_timer_sync(&queue->timer_list.timer);
 }
 EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
 
@@ -369,7 +383,6 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
 		return;
 	}
 
-	__rpc_disable_timer(task);
 	__rpc_remove_wait_queue(queue, task);
 
 	rpc_make_runnable(task);
@@ -562,6 +575,31 @@ static void rpc_run_timer(unsigned long ptr)
 	smp_mb__after_clear_bit();
 }
 
+static void __rpc_queue_timer_fn(unsigned long ptr)
+{
+	struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr;
+	struct rpc_task *task, *n;
+	unsigned long expires, now, timeo;
+
+	spin_lock(&queue->lock);
+	expires = now = jiffies;
+	list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
+		timeo = task->u.tk_wait.expires;
+		if (time_after_eq(now, timeo)) {
+			list_del_init(&task->u.tk_wait.timer_list);
+			dprintk("RPC: %5u timeout\n", task->tk_pid);
+			task->tk_status = -ETIMEDOUT;
+			rpc_wake_up_task_queue_locked(queue, task);
+			continue;
+		}
+		if (expires == now || time_after(expires, timeo))
+			expires = timeo;
+	}
+	if (!list_empty(&queue->timer_list.list))
+		rpc_set_queue_timer(queue, expires);
+	spin_unlock(&queue->lock);
+}
+
 static void __rpc_atrun(struct rpc_task *task)
 {
 	task->tk_status = 0;
-- 
cgit v1.2.3


From eb276c0e10187702928aeaa133e1d3dbaf3eafc7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 22 Feb 2008 17:27:59 -0500
Subject: SUNRPC: Switch tasks to using the rpc_waitqueue's timer function

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h |  9 +------
 net/sunrpc/sched.c           | 60 ++++++++------------------------------------
 2 files changed, 11 insertions(+), 58 deletions(-)

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 7751d3a0549..0d7be1642dc 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -67,12 +67,6 @@ struct rpc_task {
 	const struct rpc_call_ops *tk_ops;
 	void *			tk_calldata;
 
-	/*
-	 * tk_timer is used for async processing by the RPC scheduling
-	 * primitives. You should not access this directly unless
-	 * you have a pathological interest in kernel oopses.
-	 */
-	struct timer_list	tk_timer;	/* kernel timer */
 	unsigned long		tk_timeout;	/* timeout for rpc_sleep() */
 	unsigned short		tk_flags;	/* misc flags */
 	unsigned long		tk_runstate;	/* Task run status */
@@ -149,8 +143,7 @@ struct rpc_task_setup {
 #define RPC_TASK_RUNNING	0
 #define RPC_TASK_QUEUED		1
 #define RPC_TASK_WAKEUP		2
-#define RPC_TASK_HAS_TIMER	3
-#define RPC_TASK_ACTIVE		4
+#define RPC_TASK_ACTIVE		3
 
 #define RPC_IS_RUNNING(t)	test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
 #define rpc_set_running(t)	set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 29b1c1441f4..043eef4c15a 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -58,13 +58,15 @@ struct workqueue_struct *rpciod_workqueue;
  * rpc_run_timer().
  */
 static void
-__rpc_disable_timer(struct rpc_task *task)
+__rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
 	if (task->tk_timeout == 0)
 		return;
 	dprintk("RPC: %5u disabling timer\n", task->tk_pid);
 	task->tk_timeout = 0;
 	list_del(&task->u.tk_wait.timer_list);
+	if (list_empty(&queue->timer_list.list))
+		del_timer(&queue->timer_list.timer);
 }
 
 static void
@@ -78,7 +80,7 @@ rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires)
  * Set up a timer for the current task.
  */
 static void
-__rpc_add_timer(struct rpc_task *task)
+__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
 	if (!task->tk_timeout)
 		return;
@@ -86,23 +88,10 @@ __rpc_add_timer(struct rpc_task *task)
 	dprintk("RPC: %5u setting alarm for %lu ms\n",
 			task->tk_pid, task->tk_timeout * 1000 / HZ);
 
-	set_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate);
-	mod_timer(&task->tk_timer, jiffies + task->tk_timeout);
-}
-
-/*
- * Delete any timer for the current task. Because we use del_timer_sync(),
- * this function should never be called while holding queue->lock.
- */
-static void
-rpc_delete_timer(struct rpc_task *task)
-{
-	if (RPC_IS_QUEUED(task))
-		return;
-	if (test_and_clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate)) {
-		del_singleshot_timer_sync(&task->tk_timer);
-		dprintk("RPC: %5u deleting timer\n", task->tk_pid);
-	}
+	task->u.tk_wait.expires = jiffies + task->tk_timeout;
+	if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires))
+		rpc_set_queue_timer(queue, task->u.tk_wait.expires);
+	list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
 }
 
 /*
@@ -172,7 +161,7 @@ static void __rpc_remove_wait_queue_priority(struct rpc_task *task)
  */
 static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
-	__rpc_disable_timer(task);
+	__rpc_disable_timer(queue, task);
 	if (RPC_IS_PRIORITY(queue))
 		__rpc_remove_wait_queue_priority(task);
 	list_del(&task->u.tk_wait.list);
@@ -344,7 +333,7 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
 
 	BUG_ON(task->tk_callback != NULL);
 	task->tk_callback = action;
-	__rpc_add_timer(task);
+	__rpc_add_timer(q, task);
 }
 
 void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
@@ -555,26 +544,6 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_status);
 
-/*
- * Run a timeout function.
- */
-static void rpc_run_timer(unsigned long ptr)
-{
-	struct rpc_task *task = (struct rpc_task *)ptr;
-	struct rpc_wait_queue *queue = task->tk_waitqueue;
-
-	spin_lock(&queue->lock);
-	if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue) {
-		dprintk("RPC: %5u timeout\n", task->tk_pid);
-		task->tk_status = -ETIMEDOUT;
-		rpc_wake_up_task_queue_locked(queue, task);
-	}
-	spin_unlock(&queue->lock);
-	smp_mb__before_clear_bit();
-	clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate);
-	smp_mb__after_clear_bit();
-}
-
 static void __rpc_queue_timer_fn(unsigned long ptr)
 {
 	struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr;
@@ -586,7 +555,6 @@ static void __rpc_queue_timer_fn(unsigned long ptr)
 	list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
 		timeo = task->u.tk_wait.expires;
 		if (time_after_eq(now, timeo)) {
-			list_del_init(&task->u.tk_wait.timer_list);
 			dprintk("RPC: %5u timeout\n", task->tk_pid);
 			task->tk_status = -ETIMEDOUT;
 			rpc_wake_up_task_queue_locked(queue, task);
@@ -666,10 +634,6 @@ static void __rpc_execute(struct rpc_task *task)
 	BUG_ON(RPC_IS_QUEUED(task));
 
 	for (;;) {
-		/*
-		 * Garbage collection of pending timers...
-		 */
-		rpc_delete_timer(task);
 
 		/*
 		 * Execute any pending callback.
@@ -838,7 +802,6 @@ EXPORT_SYMBOL_GPL(rpc_free);
 static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data)
 {
 	memset(task, 0, sizeof(*task));
-	setup_timer(&task->tk_timer, rpc_run_timer, (unsigned long)task);
 	atomic_set(&task->tk_count, 1);
 	task->tk_flags  = task_setup_data->flags;
 	task->tk_ops = task_setup_data->callback_ops;
@@ -971,9 +934,6 @@ static void rpc_release_task(struct rpc_task *task)
 	}
 	BUG_ON (RPC_IS_QUEUED(task));
 
-	/* Synchronously delete any running timer */
-	rpc_delete_timer(task);
-
 #ifdef RPC_DEBUG
 	task->tk_magic = 0;
 #endif
-- 
cgit v1.2.3


From f5fb7b06e4e4ab18326f067f4317b2016ce18af2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 25 Feb 2008 21:40:50 -0800
Subject: SUNRPC: Eliminate the now-redundant rpc_start_wakeup()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h | 12 +-----------
 net/sunrpc/sched.c           |  8 ++------
 2 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 0d7be1642dc..bf69fce84ed 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -142,8 +142,7 @@ struct rpc_task_setup {
 
 #define RPC_TASK_RUNNING	0
 #define RPC_TASK_QUEUED		1
-#define RPC_TASK_WAKEUP		2
-#define RPC_TASK_ACTIVE		3
+#define RPC_TASK_ACTIVE		2
 
 #define RPC_IS_RUNNING(t)	test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
 #define rpc_set_running(t)	set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
@@ -165,15 +164,6 @@ struct rpc_task_setup {
 		smp_mb__after_clear_bit(); \
 	} while (0)
 
-#define rpc_start_wakeup(t) \
-	(test_and_set_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate) == 0)
-#define rpc_finish_wakeup(t) \
-	do { \
-		smp_mb__before_clear_bit(); \
-		clear_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate); \
-		smp_mb__after_clear_bit(); \
-	} while (0)
-
 #define RPC_IS_ACTIVATED(t)	test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate)
 
 /*
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 043eef4c15a..88a686a8e43 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -384,12 +384,8 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
  */
 static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
-	if (!RPC_IS_QUEUED(task) || task->tk_waitqueue != queue)
-		return;
-	if (rpc_start_wakeup(task)) {
-			__rpc_do_wake_up_task(queue, task);
-		rpc_finish_wakeup(task);
-	}
+	if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue)
+		__rpc_do_wake_up_task(queue, task);
 }
 
 /*
-- 
cgit v1.2.3


From ff2d7db848f8db7cade39e55f78f86d77e0de01a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 25 Feb 2008 21:40:51 -0800
Subject: SUNRPC: Ensure that we read all available tcp data

Don't stop until we run out of data, or we hit an error.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 30e7ac243a9..8bd3b0f73ac 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1073,6 +1073,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
 {
 	struct rpc_xprt *xprt;
 	read_descriptor_t rd_desc;
+	int read;
 
 	dprintk("RPC:       xs_tcp_data_ready...\n");
 
@@ -1084,8 +1085,10 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
 
 	/* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
 	rd_desc.arg.data = xprt;
-	rd_desc.count = 65536;
-	tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+	do {
+		rd_desc.count = 65536;
+		read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+	} while (read > 0);
 out:
 	read_unlock(&sk->sk_callback_lock);
 }
-- 
cgit v1.2.3


From 5e4424af9a1f062c6451681dff24a26e27741cc6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 25 Feb 2008 21:53:49 -0800
Subject: SUNRPC: Remove now-redundant RCU-safe rpc_task free path

Now that we've tightened up the locking rules for RPC queue wakeups, we can
remove the RCU-safe kfree calls...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/read.c                |  8 +-------
 fs/nfs/write.c               | 16 ++--------------
 include/linux/sunrpc/sched.h |  2 --
 net/sunrpc/sched.c           | 37 ++++++++++++-------------------------
 4 files changed, 15 insertions(+), 48 deletions(-)

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 87546cd277d..be9e8270f4d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -58,19 +58,13 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	return p;
 }
 
-static void nfs_readdata_rcu_free(struct rcu_head *head)
+static void nfs_readdata_free(struct nfs_read_data *p)
 {
-	struct nfs_read_data *p = container_of(head, struct nfs_read_data, task.u.tk_rcu);
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
 	mempool_free(p, nfs_rdata_mempool);
 }
 
-static void nfs_readdata_free(struct nfs_read_data *rdata)
-{
-	call_rcu_bh(&rdata->task.u.tk_rcu, nfs_readdata_rcu_free);
-}
-
 void nfs_readdata_release(void *data)
 {
 	struct nfs_read_data *rdata = data;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 84495ef9072..30513029c00 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -58,19 +58,13 @@ struct nfs_write_data *nfs_commit_alloc(void)
 	return p;
 }
 
-static void nfs_commit_rcu_free(struct rcu_head *head)
+void nfs_commit_free(struct nfs_write_data *p)
 {
-	struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu);
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
 	mempool_free(p, nfs_commit_mempool);
 }
 
-void nfs_commit_free(struct nfs_write_data *wdata)
-{
-	call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free);
-}
-
 struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 {
 	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
@@ -92,19 +86,13 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 	return p;
 }
 
-static void nfs_writedata_rcu_free(struct rcu_head *head)
+static void nfs_writedata_free(struct nfs_write_data *p)
 {
-	struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu);
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
 	mempool_free(p, nfs_wdata_mempool);
 }
 
-static void nfs_writedata_free(struct nfs_write_data *wdata)
-{
-	call_rcu_bh(&wdata->task.u.tk_rcu, nfs_writedata_rcu_free);
-}
-
 void nfs_writedata_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index bf69fce84ed..d1a5c8c1a0f 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -11,7 +11,6 @@
 
 #include <linux/timer.h>
 #include <linux/sunrpc/types.h>
-#include <linux/rcupdate.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
@@ -77,7 +76,6 @@ struct rpc_task {
 	union {
 		struct work_struct	tk_work;	/* Async task work queue */
 		struct rpc_wait		tk_wait;	/* RPC wait */
-		struct rcu_head		tk_rcu;		/* for task deletion */
 	} u;
 
 	unsigned short		tk_timeouts;	/* maj timeouts */
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 88a686a8e43..cae219c8cae 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -393,11 +393,9 @@ static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct r
  */
 void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
-	rcu_read_lock_bh();
-	spin_lock(&queue->lock);
+	spin_lock_bh(&queue->lock);
 	rpc_wake_up_task_queue_locked(queue, task);
-	spin_unlock(&queue->lock);
-	rcu_read_unlock_bh();
+	spin_unlock_bh(&queue->lock);
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
 
@@ -470,16 +468,14 @@ struct rpc_task * rpc_wake_up_next(struct rpc_wait_queue *queue)
 
 	dprintk("RPC:       wake_up_next(%p \"%s\")\n",
 			queue, rpc_qname(queue));
-	rcu_read_lock_bh();
-	spin_lock(&queue->lock);
+	spin_lock_bh(&queue->lock);
 	if (RPC_IS_PRIORITY(queue))
 		task = __rpc_wake_up_next_priority(queue);
 	else {
 		task_for_first(task, &queue->tasks[0])
 			rpc_wake_up_task_queue_locked(queue, task);
 	}
-	spin_unlock(&queue->lock);
-	rcu_read_unlock_bh();
+	spin_unlock_bh(&queue->lock);
 
 	return task;
 }
@@ -496,8 +492,7 @@ void rpc_wake_up(struct rpc_wait_queue *queue)
 	struct rpc_task *task, *next;
 	struct list_head *head;
 
-	rcu_read_lock_bh();
-	spin_lock(&queue->lock);
+	spin_lock_bh(&queue->lock);
 	head = &queue->tasks[queue->maxpriority];
 	for (;;) {
 		list_for_each_entry_safe(task, next, head, u.tk_wait.list)
@@ -506,8 +501,7 @@ void rpc_wake_up(struct rpc_wait_queue *queue)
 			break;
 		head--;
 	}
-	spin_unlock(&queue->lock);
-	rcu_read_unlock_bh();
+	spin_unlock_bh(&queue->lock);
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up);
 
@@ -523,8 +517,7 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
 	struct rpc_task *task, *next;
 	struct list_head *head;
 
-	rcu_read_lock_bh();
-	spin_lock(&queue->lock);
+	spin_lock_bh(&queue->lock);
 	head = &queue->tasks[queue->maxpriority];
 	for (;;) {
 		list_for_each_entry_safe(task, next, head, u.tk_wait.list) {
@@ -535,8 +528,7 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
 			break;
 		head--;
 	}
-	spin_unlock(&queue->lock);
-	rcu_read_unlock_bh();
+	spin_unlock_bh(&queue->lock);
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_status);
 
@@ -848,13 +840,6 @@ rpc_alloc_task(void)
 	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
 }
 
-static void rpc_free_task_rcu(struct rcu_head *rcu)
-{
-	struct rpc_task *task = container_of(rcu, struct rpc_task, u.tk_rcu);
-	dprintk("RPC: %5u freeing task\n", task->tk_pid);
-	mempool_free(task, rpc_task_mempool);
-}
-
 /*
  * Create a new task for the specified client.
  */
@@ -883,8 +868,10 @@ static void rpc_free_task(struct rpc_task *task)
 	const struct rpc_call_ops *tk_ops = task->tk_ops;
 	void *calldata = task->tk_calldata;
 
-	if (task->tk_flags & RPC_TASK_DYNAMIC)
-		call_rcu_bh(&task->u.tk_rcu, rpc_free_task_rcu);
+	if (task->tk_flags & RPC_TASK_DYNAMIC) {
+		dprintk("RPC: %5u freeing task\n", task->tk_pid);
+		mempool_free(task, rpc_task_mempool);
+	}
 	rpc_release_calldata(tk_ops, calldata);
 }
 
-- 
cgit v1.2.3


From 25337fdc85951dfeac944f16cb565904c619077a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Mar 2008 14:40:14 -0400
Subject: SUNRPC: Fix a bug in rpcauth_lookup_credcache()

The hash bucket is for some reason always being set to zero.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h | 4 ++--
 net/sunrpc/auth.c           | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 7a69ca3beba..84d5f3a05b1 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -59,8 +59,8 @@ struct rpc_cred {
 /*
  * Client authentication handle
  */
-#define RPC_CREDCACHE_NR	8
-#define RPC_CREDCACHE_MASK	(RPC_CREDCACHE_NR - 1)
+#define RPC_CREDCACHE_HASHBITS	4
+#define RPC_CREDCACHE_NR	(1 << RPC_CREDCACHE_HASHBITS)
 struct rpc_cred_cache {
 	struct hlist_head	hashtable[RPC_CREDCACHE_NR];
 	spinlock_t		lock;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index eca941ce298..b38f6ee2f5e 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
+#include <linux/hash.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/spinlock.h>
 
@@ -280,7 +281,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 	struct hlist_node *pos;
 	struct rpc_cred	*cred = NULL,
 			*entry, *new;
-	int		nr = 0;
+	unsigned int nr;
+
+	nr = hash_long(acred->uid, RPC_CREDCACHE_HASHBITS);
 
 	if (!(flags & RPCAUTH_LOOKUP_ROOTCREDS))
 		nr = acred->uid & RPC_CREDCACHE_MASK;
-- 
cgit v1.2.3


From af093835774931de898a9baf7b4041fa0d100f77 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Mar 2008 12:12:16 -0400
Subject: SUNRPC: Fix RPCAUTH_LOOKUP_ROOTCREDS

The current RPCAUTH_LOOKUP_ROOTCREDS flag only works for AUTH_SYS
authentication, and then only as a special case in the code. This patch
removes the auth_sys special casing, and replaces it with generic code.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h |  4 ++--
 net/sunrpc/auth.c           | 35 ++++++++++++++++------------
 net/sunrpc/auth_unix.c      | 56 +++++++++++++++++++--------------------------
 net/sunrpc/sched.c          |  4 +++-
 4 files changed, 49 insertions(+), 50 deletions(-)

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 84d5f3a05b1..012566a6257 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -89,7 +89,6 @@ struct rpc_auth {
 
 /* Flags for rpcauth_lookupcred() */
 #define RPCAUTH_LOOKUP_NEW		0x01	/* Accept an uninitialised cred */
-#define RPCAUTH_LOOKUP_ROOTCREDS	0x02	/* This really ought to go! */
 
 /*
  * Client authentication ops
@@ -136,7 +135,8 @@ void			rpcauth_release(struct rpc_auth *);
 struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int);
 void			rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
-struct rpc_cred *	rpcauth_bindcred(struct rpc_task *);
+void			rpcauth_bindcred(struct rpc_task *);
+void			rpcauth_bind_root_cred(struct rpc_task *);
 void			rpcauth_holdcred(struct rpc_task *);
 void			put_rpccred(struct rpc_cred *);
 void			rpcauth_unbindcred(struct rpc_task *);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index b38f6ee2f5e..b0f2b2ee7cc 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -285,9 +285,6 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 
 	nr = hash_long(acred->uid, RPC_CREDCACHE_HASHBITS);
 
-	if (!(flags & RPCAUTH_LOOKUP_ROOTCREDS))
-		nr = acred->uid & RPC_CREDCACHE_MASK;
-
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) {
 		if (!entry->cr_ops->crmatch(acred, entry, flags))
@@ -378,30 +375,38 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 }
 EXPORT_SYMBOL_GPL(rpcauth_init_cred);
 
-struct rpc_cred *
-rpcauth_bindcred(struct rpc_task *task)
+void
+rpcauth_bind_root_cred(struct rpc_task *task)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
 	struct auth_cred acred = {
-		.uid = current->fsuid,
-		.gid = current->fsgid,
-		.group_info = current->group_info,
+		.uid = 0,
+		.gid = 0,
 	};
 	struct rpc_cred *ret;
-	int flags = 0;
 
 	dprintk("RPC: %5u looking up %s cred\n",
 		task->tk_pid, task->tk_client->cl_auth->au_ops->au_name);
-	get_group_info(acred.group_info);
-	if (task->tk_flags & RPC_TASK_ROOTCREDS)
-		flags |= RPCAUTH_LOOKUP_ROOTCREDS;
-	ret = auth->au_ops->lookup_cred(auth, &acred, flags);
+	ret = auth->au_ops->lookup_cred(auth, &acred, 0);
+	if (!IS_ERR(ret))
+		task->tk_msg.rpc_cred = ret;
+	else
+		task->tk_status = PTR_ERR(ret);
+}
+
+void
+rpcauth_bindcred(struct rpc_task *task)
+{
+	struct rpc_auth *auth = task->tk_client->cl_auth;
+	struct rpc_cred *ret;
+
+	dprintk("RPC: %5u looking up %s cred\n",
+		task->tk_pid, auth->au_ops->au_name);
+	ret = rpcauth_lookupcred(auth, 0);
 	if (!IS_ERR(ret))
 		task->tk_msg.rpc_cred = ret;
 	else
 		task->tk_status = PTR_ERR(ret);
-	put_group_info(acred.group_info);
-	return ret;
 }
 
 void
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 5ed91e5bcee..b763710d3db 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -60,7 +60,8 @@ static struct rpc_cred *
 unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
 	struct unx_cred	*cred;
-	int		i;
+	unsigned int groups = 0;
+	unsigned int i;
 
 	dprintk("RPC:       allocating UNIX cred for uid %d gid %d\n",
 			acred->uid, acred->gid);
@@ -70,21 +71,17 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 
 	rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
 	cred->uc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
-	if (flags & RPCAUTH_LOOKUP_ROOTCREDS) {
-		cred->uc_uid = 0;
-		cred->uc_gid = 0;
-		cred->uc_gids[0] = NOGROUP;
-	} else {
-		int groups = acred->group_info->ngroups;
-		if (groups > NFS_NGROUPS)
-			groups = NFS_NGROUPS;
-
-		cred->uc_gid = acred->gid;
-		for (i = 0; i < groups; i++)
-			cred->uc_gids[i] = GROUP_AT(acred->group_info, i);
-		if (i < NFS_NGROUPS)
-		  cred->uc_gids[i] = NOGROUP;
-	}
+
+	if (acred->group_info != NULL)
+		groups = acred->group_info->ngroups;
+	if (groups > NFS_NGROUPS)
+		groups = NFS_NGROUPS;
+
+	cred->uc_gid = acred->gid;
+	for (i = 0; i < groups; i++)
+		cred->uc_gids[i] = GROUP_AT(acred->group_info, i);
+	if (i < NFS_NGROUPS)
+		cred->uc_gids[i] = NOGROUP;
 
 	return &cred->uc_base;
 }
@@ -118,26 +115,21 @@ static int
 unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
 {
 	struct unx_cred	*cred = container_of(rcred, struct unx_cred, uc_base);
-	int		i;
+	unsigned int groups = 0;
+	unsigned int i;
 
-	if (!(flags & RPCAUTH_LOOKUP_ROOTCREDS)) {
-		int groups;
 
-		if (cred->uc_uid != acred->uid
-		 || cred->uc_gid != acred->gid)
-			return 0;
+	if (cred->uc_uid != acred->uid || cred->uc_gid != acred->gid)
+		return 0;
 
+	if (acred->group_info != NULL)
 		groups = acred->group_info->ngroups;
-		if (groups > NFS_NGROUPS)
-			groups = NFS_NGROUPS;
-		for (i = 0; i < groups ; i++)
-			if (cred->uc_gids[i] != GROUP_AT(acred->group_info, i))
-				return 0;
-		return 1;
-	}
-	return (cred->uc_uid == 0
-	     && cred->uc_gid == 0
-	     && cred->uc_gids[0] == (gid_t) NOGROUP);
+	if (groups > NFS_NGROUPS)
+		groups = NFS_NGROUPS;
+	for (i = 0; i < groups ; i++)
+		if (cred->uc_gids[i] != GROUP_AT(acred->group_info, i))
+			return 0;
+	return 1;
 }
 
 /*
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index cae219c8cae..7db956f6e01 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -821,8 +821,10 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 		/* Bind the user cred */
 		if (task->tk_msg.rpc_cred != NULL)
 			rpcauth_holdcred(task);
-		else
+		else if (!(task_setup_data->flags & RPC_TASK_ROOTCREDS))
 			rpcauth_bindcred(task);
+		else
+			rpcauth_bind_root_cred(task);
 		if (task->tk_action == NULL)
 			rpc_call_start(task);
 	}
-- 
cgit v1.2.3


From 4ccda2cdd8d156b6f49440653d5d6997e0facf97 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Mar 2008 16:20:55 -0400
Subject: SUNRPC: Clean up rpcauth_bindcred()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h |  4 +---
 net/sunrpc/auth.c           | 28 ++++++++++++++++++----------
 net/sunrpc/sched.c          | 11 ++++-------
 3 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 012566a6257..348546c7826 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -135,9 +135,7 @@ void			rpcauth_release(struct rpc_auth *);
 struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int);
 void			rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
-void			rpcauth_bindcred(struct rpc_task *);
-void			rpcauth_bind_root_cred(struct rpc_task *);
-void			rpcauth_holdcred(struct rpc_task *);
+void			rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int);
 void			put_rpccred(struct rpc_cred *);
 void			rpcauth_unbindcred(struct rpc_task *);
 __be32 *		rpcauth_marshcred(struct rpc_task *, __be32 *);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index b0f2b2ee7cc..1cdb163c4f8 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -375,7 +375,15 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 }
 EXPORT_SYMBOL_GPL(rpcauth_init_cred);
 
-void
+static void
+rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred)
+{
+	task->tk_msg.rpc_cred = get_rpccred(cred);
+	dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid,
+			cred->cr_auth->au_ops->au_name, cred);
+}
+
+static void
 rpcauth_bind_root_cred(struct rpc_task *task)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
@@ -394,8 +402,8 @@ rpcauth_bind_root_cred(struct rpc_task *task)
 		task->tk_status = PTR_ERR(ret);
 }
 
-void
-rpcauth_bindcred(struct rpc_task *task)
+static void
+rpcauth_bind_new_cred(struct rpc_task *task)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
 	struct rpc_cred *ret;
@@ -410,14 +418,14 @@ rpcauth_bindcred(struct rpc_task *task)
 }
 
 void
-rpcauth_holdcred(struct rpc_task *task)
+rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
-	if (cred != NULL) {
-		get_rpccred(cred);
-		dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid,
-				cred->cr_auth->au_ops->au_name, cred);
-	}
+	if (cred != NULL)
+		rpcauth_generic_bind_cred(task, cred);
+	else if (flags & RPC_TASK_ROOTCREDS)
+		rpcauth_bind_root_cred(task);
+	else
+		rpcauth_bind_new_cred(task);
 }
 
 void
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 7db956f6e01..6eab9bf94ba 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -817,14 +817,11 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 		task->tk_action = rpc_prepare_task;
 
 	if (task_setup_data->rpc_message != NULL) {
-		memcpy(&task->tk_msg, task_setup_data->rpc_message, sizeof(task->tk_msg));
+		task->tk_msg.rpc_proc = task_setup_data->rpc_message->rpc_proc;
+		task->tk_msg.rpc_argp = task_setup_data->rpc_message->rpc_argp;
+		task->tk_msg.rpc_resp = task_setup_data->rpc_message->rpc_resp;
 		/* Bind the user cred */
-		if (task->tk_msg.rpc_cred != NULL)
-			rpcauth_holdcred(task);
-		else if (!(task_setup_data->flags & RPC_TASK_ROOTCREDS))
-			rpcauth_bindcred(task);
-		else
-			rpcauth_bind_root_cred(task);
+		rpcauth_bindcred(task, task_setup_data->rpc_message->rpc_cred, task_setup_data->flags);
 		if (task->tk_action == NULL)
 			rpc_call_start(task);
 	}
-- 
cgit v1.2.3


From 9a559efd4199c9812d339e23cc1b6055366b224f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Mar 2008 12:24:49 -0400
Subject: SUNRPC: Add a generic RPC credential

Add an rpc credential that is not tied to any particular auth mechanism,
but that can be cached by NFS, and later used to look up a cred for
whichever auth mechanism that turns out to be valid when the RPC call is
being made.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h |   3 +
 net/sunrpc/Makefile         |   2 +-
 net/sunrpc/auth.c           |   1 +
 net/sunrpc/auth_generic.c   | 142 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 147 insertions(+), 1 deletion(-)
 create mode 100644 net/sunrpc/auth_generic.c

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 348546c7826..70644ed6799 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -125,9 +125,12 @@ extern const struct rpc_authops	authunix_ops;
 extern const struct rpc_authops	authnull_ops;
 
 void __init		rpc_init_authunix(void);
+void __init		rpc_init_generic_auth(void);
 void __init		rpcauth_init_module(void);
 void __exit		rpcauth_remove_module(void);
+void __exit		rpc_destroy_generic_auth(void);
 
+struct rpc_cred *	rpc_lookup_cred(void);
 int			rpcauth_register(const struct rpc_authops *);
 int			rpcauth_unregister(const struct rpc_authops *);
 struct rpc_auth *	rpcauth_create(rpc_authflavor_t, struct rpc_clnt *);
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 92e1dbe5094..5369aa369b3 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
 
 sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
-	    auth.o auth_null.o auth_unix.o \
+	    auth.o auth_null.o auth_unix.o auth_generic.o \
 	    svc.o svcsock.o svcauth.o svcauth_unix.o \
 	    rpcb_clnt.o timer.o xdr.o \
 	    sunrpc_syms.o cache.o rpc_pipe.o \
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 1cdb163c4f8..012f2a32063 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -566,6 +566,7 @@ static struct shrinker rpc_cred_shrinker = {
 void __init rpcauth_init_module(void)
 {
 	rpc_init_authunix();
+	rpc_init_generic_auth();
 	register_shrinker(&rpc_cred_shrinker);
 }
 
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
new file mode 100644
index 00000000000..6f129b1b20a
--- /dev/null
+++ b/net/sunrpc/auth_generic.c
@@ -0,0 +1,142 @@
+/*
+ * Generic RPC credential
+ *
+ * Copyright (C) 2008, Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/err.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/sched.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_AUTH
+#endif
+
+struct generic_cred {
+	struct rpc_cred gc_base;
+	struct auth_cred acred;
+};
+
+static struct rpc_auth generic_auth;
+static struct rpc_cred_cache generic_cred_cache;
+static const struct rpc_credops generic_credops;
+
+/*
+ * Public call interface
+ */
+struct rpc_cred *rpc_lookup_cred(void)
+{
+	return rpcauth_lookupcred(&generic_auth, 0);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_cred);
+
+/*
+ * Lookup generic creds for current process
+ */
+static struct rpc_cred *
+generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+	return rpcauth_lookup_credcache(&generic_auth, acred, flags);
+}
+
+static struct rpc_cred *
+generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+	struct generic_cred *gcred;
+
+	gcred = kmalloc(sizeof(*gcred), GFP_KERNEL);
+	if (gcred == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	rpcauth_init_cred(&gcred->gc_base, acred, &generic_auth, &generic_credops);
+	gcred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
+
+	gcred->acred.uid = acred->uid;
+	gcred->acred.gid = acred->gid;
+	gcred->acred.group_info = acred->group_info;
+	if (gcred->acred.group_info != NULL)
+		get_group_info(gcred->acred.group_info);
+
+	dprintk("RPC:       allocated generic cred %p for uid %d gid %d\n",
+			gcred, acred->uid, acred->gid);
+	return &gcred->gc_base;
+}
+
+static void
+generic_free_cred(struct rpc_cred *cred)
+{
+	struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base);
+
+	dprintk("RPC:       generic_free_cred %p\n", gcred);
+	if (gcred->acred.group_info != NULL)
+		put_group_info(gcred->acred.group_info);
+	kfree(gcred);
+}
+
+static void
+generic_free_cred_callback(struct rcu_head *head)
+{
+	struct rpc_cred *cred = container_of(head, struct rpc_cred, cr_rcu);
+	generic_free_cred(cred);
+}
+
+static void
+generic_destroy_cred(struct rpc_cred *cred)
+{
+	call_rcu(&cred->cr_rcu, generic_free_cred_callback);
+}
+
+/*
+ * Match credentials against current process creds.
+ */
+static int
+generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
+{
+	struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base);
+
+	if (gcred->acred.uid != acred->uid ||
+	    gcred->acred.gid != acred->gid ||
+	    gcred->acred.group_info != acred->group_info)
+		return 0;
+	return 1;
+}
+
+void __init rpc_init_generic_auth(void)
+{
+	spin_lock_init(&generic_cred_cache.lock);
+}
+
+void __exit rpc_destroy_generic_auth(void)
+{
+	rpcauth_clear_credcache(&generic_cred_cache);
+}
+
+static struct rpc_cred_cache generic_cred_cache = {
+	{{ NULL, },},
+};
+
+static const struct rpc_authops generic_auth_ops = {
+	.owner = THIS_MODULE,
+#ifdef RPC_DEBUG
+	.au_name = "Generic",
+#endif
+	.lookup_cred = generic_lookup_cred,
+	.crcreate = generic_create_cred,
+};
+
+static struct rpc_auth generic_auth = {
+	.au_ops = &generic_auth_ops,
+	.au_count = ATOMIC_INIT(0),
+	.au_credcache = &generic_cred_cache,
+};
+
+static const struct rpc_credops generic_credops = {
+	.cr_name = "Generic cred",
+	.crdestroy = generic_destroy_cred,
+	.crmatch = generic_match,
+};
-- 
cgit v1.2.3


From 5c691044ecbca04dd558fca4c754121689fe1b34 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Mar 2008 16:21:07 -0400
Subject: SUNRPC: Add an rpc_credop callback for binding a credential to an
 rpc_task

We need the ability to treat 'generic' creds specially, since they want to
bind instances of the auth cred instead of binding themselves.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h    |  2 ++
 net/sunrpc/auth.c              |  5 +++--
 net/sunrpc/auth_generic.c      | 15 +++++++++++++++
 net/sunrpc/auth_gss/auth_gss.c |  2 ++
 net/sunrpc/auth_null.c         |  1 +
 net/sunrpc/auth_unix.c         |  1 +
 6 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 70644ed6799..e93cd8aa3eb 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -112,6 +112,7 @@ struct rpc_credops {
 	void			(*crdestroy)(struct rpc_cred *);
 
 	int			(*crmatch)(struct auth_cred *, struct rpc_cred *, int);
+	void			(*crbind)(struct rpc_task *, struct rpc_cred *);
 	__be32 *		(*crmarshal)(struct rpc_task *, __be32 *);
 	int			(*crrefresh)(struct rpc_task *);
 	__be32 *		(*crvalidate)(struct rpc_task *, __be32 *);
@@ -139,6 +140,7 @@ struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *
 void			rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
 void			rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int);
+void			rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *);
 void			put_rpccred(struct rpc_cred *);
 void			rpcauth_unbindcred(struct rpc_task *);
 __be32 *		rpcauth_marshcred(struct rpc_task *, __be32 *);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 012f2a32063..d65dd794010 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -375,13 +375,14 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 }
 EXPORT_SYMBOL_GPL(rpcauth_init_cred);
 
-static void
+void
 rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred)
 {
 	task->tk_msg.rpc_cred = get_rpccred(cred);
 	dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid,
 			cred->cr_auth->au_ops->au_name, cred);
 }
+EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred);
 
 static void
 rpcauth_bind_root_cred(struct rpc_task *task)
@@ -421,7 +422,7 @@ void
 rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 {
 	if (cred != NULL)
-		rpcauth_generic_bind_cred(task, cred);
+		cred->cr_ops->crbind(task, cred);
 	else if (flags & RPC_TASK_ROOTCREDS)
 		rpcauth_bind_root_cred(task);
 	else
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 6f129b1b20a..6a3f77c9e4d 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -35,6 +35,20 @@ struct rpc_cred *rpc_lookup_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 
+static void
+generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred)
+{
+	struct rpc_auth *auth = task->tk_client->cl_auth;
+	struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred;
+	struct rpc_cred *ret;
+
+	ret = auth->au_ops->lookup_cred(auth, acred, 0);
+	if (!IS_ERR(ret))
+		task->tk_msg.rpc_cred = ret;
+	else
+		task->tk_status = PTR_ERR(ret);
+}
+
 /*
  * Lookup generic creds for current process
  */
@@ -138,5 +152,6 @@ static struct rpc_auth generic_auth = {
 static const struct rpc_credops generic_credops = {
 	.cr_name = "Generic cred",
 	.crdestroy = generic_destroy_cred,
+	.crbind = generic_bind_cred,
 	.crmatch = generic_match,
 };
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index ef638496180..d34f6dfc751 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1300,6 +1300,7 @@ static const struct rpc_credops gss_credops = {
 	.cr_name	= "AUTH_GSS",
 	.crdestroy	= gss_destroy_cred,
 	.cr_init	= gss_cred_init,
+	.crbind		= rpcauth_generic_bind_cred,
 	.crmatch	= gss_match,
 	.crmarshal	= gss_marshal,
 	.crrefresh	= gss_refresh,
@@ -1311,6 +1312,7 @@ static const struct rpc_credops gss_credops = {
 static const struct rpc_credops gss_nullops = {
 	.cr_name	= "AUTH_GSS",
 	.crdestroy	= gss_destroy_cred,
+	.crbind		= rpcauth_generic_bind_cred,
 	.crmatch	= gss_match,
 	.crmarshal	= gss_marshal,
 	.crrefresh	= gss_refresh_null,
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 537d0e8589d..3c26c18df0d 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -125,6 +125,7 @@ static
 const struct rpc_credops null_credops = {
 	.cr_name	= "AUTH_NULL",
 	.crdestroy	= nul_destroy_cred,
+	.crbind		= rpcauth_generic_bind_cred,
 	.crmatch	= nul_match,
 	.crmarshal	= nul_marshal,
 	.crrefresh	= nul_refresh,
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index b763710d3db..04e936a56fb 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -237,6 +237,7 @@ static
 const struct rpc_credops unix_credops = {
 	.cr_name	= "AUTH_UNIX",
 	.crdestroy	= unx_destroy_cred,
+	.crbind		= rpcauth_generic_bind_cred,
 	.crmatch	= unx_match,
 	.crmarshal	= unx_marshal,
 	.crrefresh	= unx_refresh,
-- 
cgit v1.2.3


From 98a8e3239427051f5d44f2025b398bdcc3918f37 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Mar 2008 12:25:28 -0400
Subject: SUNRPC: Add a helper rpcauth_lookup_generic_cred()

The NFSv4 protocol allows clients to negotiate security protocols on the
fly in the case where an administrator on the server changes the export
settings and/or in the case where we may have a filesystem migration event.

Instead of having the NFS client code cache credentials that are tied to a
particular AUTH method it is therefore preferable to have a generic credential
that can be converted into whatever AUTH is in use by the RPC client when
the read/write/sillyrename/... is put on the wire.

We do this by means of the new "generic" credential, which basically just
caches the minimal information that is needed to look up an RPCSEC_GSS,
AUTH_SYS, or AUTH_NULL credential.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c      | 2 +-
 fs/nfs/inode.c    | 2 +-
 fs/nfs/nfs4proc.c | 8 ++++----
 fs/nfs/unlink.c   | 2 +-
 net/sunrpc/auth.c | 1 -
 5 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6cea7479c5b..d583654a0b3 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1966,7 +1966,7 @@ force_lookup:
 	if (!NFS_PROTO(inode)->access)
 		goto out_notsup;
 
-	cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0);
+	cred = rpc_lookup_cred();
 	if (!IS_ERR(cred)) {
 		res = nfs_do_access(inode, cred, mask);
 		put_rpccred(cred);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c49f6d8b42d..15f787355d2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -613,7 +613,7 @@ int nfs_open(struct inode *inode, struct file *filp)
 	struct nfs_open_context *ctx;
 	struct rpc_cred *cred;
 
-	cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0);
+	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
 	ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bbb0d58ee6a..f38d0573be1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1408,7 +1408,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		BUG_ON(nd->intent.open.flags & O_CREAT);
 	}
 
-	cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0);
+	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return (struct dentry *)cred;
 	parent = dentry->d_parent;
@@ -1443,7 +1443,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
 
-	cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0);
+	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
 	state = nfs4_do_open(dir, &path, openflags, NULL, cred);
@@ -1660,7 +1660,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 
 	nfs_fattr_init(fattr);
 	
-	cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0);
+	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
 
@@ -1896,7 +1896,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	struct rpc_cred *cred;
 	int status = 0;
 
-	cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0);
+	cred = rpc_lookup_cred();
 	if (IS_ERR(cred)) {
 		status = PTR_ERR(cred);
 		goto out;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 75741536342..3adf8b26646 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -234,7 +234,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 	if (data == NULL)
 		goto out;
 
-	data->cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0);
+	data->cred = rpc_lookup_cred();
 	if (IS_ERR(data->cred)) {
 		status = PTR_ERR(data->cred);
 		goto out_free;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index d65dd794010..0632cd0a1ad 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -356,7 +356,6 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
 	put_group_info(acred.group_info);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
 
 void
 rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
-- 
cgit v1.2.3


From 2f42b5d043ee271d1e5d30ecd77186b6c4d4e534 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@citi.umich.edu>
Date: Thu, 13 Mar 2008 15:26:30 +0200
Subject: NFS: fix encode_fsinfo_maxsz

The previous value was not taking into account space for bitmap array size.

Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index db1ed9c46ed..37421dd4805 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -110,7 +110,7 @@ static int nfs4_stat_to_errno(int);
 #define decode_savefh_maxsz     (op_decode_hdr_maxsz)
 #define encode_restorefh_maxsz  (op_encode_hdr_maxsz)
 #define decode_restorefh_maxsz  (op_decode_hdr_maxsz)
-#define encode_fsinfo_maxsz	(op_encode_hdr_maxsz + 2)
+#define encode_fsinfo_maxsz	(encode_getattr_maxsz)
 #define decode_fsinfo_maxsz	(op_decode_hdr_maxsz + 11)
 #define encode_renew_maxsz	(op_encode_hdr_maxsz + 3)
 #define decode_renew_maxsz	(op_decode_hdr_maxsz)
-- 
cgit v1.2.3


From 6d884e8fc8114dc8877218f06a9a9a1d801901e4 Mon Sep 17 00:00:00 2001
From: Fred <iisaman@citi.umich.edu>
Date: Wed, 19 Mar 2008 11:24:38 -0400
Subject: nfs: nfs_redirty_request

Both flush functions have the same error handling routine.  Pull
it out as a function.

Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4cb88df12f8..ce40cadb15d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -282,8 +282,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 	spin_unlock(&inode->i_lock);
 	if (!nfs_pageio_add_request(pgio, req)) {
 		nfs_redirty_request(req);
-		nfs_end_page_writeback(page);
-		nfs_clear_page_tag_locked(req);
 		return pgio->pg_error;
 	}
 	return 0;
@@ -402,7 +400,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 }
 
 static void
-nfs_redirty_request(struct nfs_page *req)
+nfs_mark_request_dirty(struct nfs_page *req)
 {
 	__set_page_dirty_nobuffers(req->wb_page);
 }
@@ -456,7 +454,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
 		return 1;
 	}
 	if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
-		nfs_redirty_request(req);
+		nfs_mark_request_dirty(req);
 		return 1;
 	}
 	return 0;
@@ -847,6 +845,17 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 		rpc_put_task(task);
 }
 
+/* If a nfs_flush_* function fails, it should remove reqs from @head and
+ * call this on each, which will prepare them to be retried on next
+ * writeback using standard nfs.
+ */
+static void nfs_redirty_request(struct nfs_page *req)
+{
+	nfs_mark_request_dirty(req);
+	nfs_end_page_writeback(req->wb_page);
+	nfs_clear_page_tag_locked(req);
+}
+
 /*
  * Generate multiple small requests to write out a single
  * contiguous dirty area on one page.
@@ -902,8 +911,6 @@ out_bad:
 		nfs_writedata_release(data);
 	}
 	nfs_redirty_request(req);
-	nfs_end_page_writeback(req->wb_page);
-	nfs_clear_page_tag_locked(req);
 	return -ENOMEM;
 }
 
@@ -944,8 +951,6 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_redirty_request(req);
-		nfs_end_page_writeback(req->wb_page);
-		nfs_clear_page_tag_locked(req);
 	}
 	return -ENOMEM;
 }
@@ -1298,7 +1303,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 		}
 		/* We have a mismatch. Write the page again */
 		dprintk(" mismatch\n");
-		nfs_redirty_request(req);
+		nfs_mark_request_dirty(req);
 	next:
 		nfs_clear_page_tag_locked(req);
 	}
-- 
cgit v1.2.3


From 4af68bffac444a23f027e18ff244101e63b79227 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@citi.umich.edu>
Date: Wed, 19 Mar 2008 11:54:04 -0400
Subject: nfs: remove duplicate initializations of nfs_read_data field

Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/read.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index ab2f7d233e0..d333f5fedca 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -251,7 +251,6 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
 		data = nfs_readdata_alloc(1);
 		if (!data)
 			goto out_bad;
-		INIT_LIST_HEAD(&data->pages);
 		list_add(&data->pages, &list);
 		requests++;
 		nbytes -= len;
@@ -298,7 +297,6 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
 	if (!data)
 		goto out_bad;
 
-	INIT_LIST_HEAD(&data->pages);
 	pages = data->pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
-- 
cgit v1.2.3


From caa02bd540618e4b447a1f776363ba27c4c79090 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 22 Feb 2008 14:49:59 -0500
Subject: NFS: clean up short packet handling for NFSv2 readdir

Currently, the NFS readdir decoders have a workaround for buggy servers
that send an empty readdir response with the EOF bit unset. If the
server sends a malformed response in some cases, this workaround kicks
in and just returns an empty response rather than returning a proper
error to the caller.

This patch does 3 things:

1) have malformed responses with no entries return error (-EIO)

2) preserve existing workaround for servers that send empty
   responses with the EOF marker unset.

3) Add some comments to clarify the logic in nfs_xdr_readdirres().

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 1f7ea675e0c..86a80b33ec8 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -428,7 +428,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	size_t hdrlen;
 	unsigned int pglen, recvd;
 	u32 len;
-	int status, nr;
+	int status, nr = 0;
 	__be32 *end, *entry, *kaddr;
 
 	if ((status = ntohl(*p++)))
@@ -452,7 +452,12 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	kaddr = p = kmap_atomic(*page, KM_USER0);
 	end = (__be32 *)((char *)p + pglen);
 	entry = p;
-	for (nr = 0; *p++; nr++) {
+
+	/* Make sure the packet actually has a value_follows and EOF entry */
+	if ((entry + 1) > end)
+		goto short_pkt;
+
+	for (; *p++; nr++) {
 		if (p + 2 > end)
 			goto short_pkt;
 		p++; /* fileid */
@@ -467,18 +472,32 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 			goto short_pkt;
 		entry = p;
 	}
-	if (!nr && (entry[0] != 0 || entry[1] == 0))
-		goto short_pkt;
+
+	/*
+	 * Apparently some server sends responses that are a valid size, but
+	 * contain no entries, and have value_follows==0 and EOF==0. For
+	 * those, just set the EOF marker.
+	 */
+	if (!nr && entry[1] == 0) {
+		dprintk("NFS: readdir reply truncated!\n");
+		entry[1] = 1;
+	}
  out:
 	kunmap_atomic(kaddr, KM_USER0);
 	return nr;
  short_pkt:
+	/*
+	 * When we get a short packet there are 2 possibilities. We can
+	 * return an error, or fix up the response to look like a valid
+	 * response and return what we have so far. If there are no
+	 * entries and the packet was short, then return -EIO. If there
+	 * are valid entries in the response, return them and pretend that
+	 * the call was successful, but incomplete. The caller can retry the
+	 * readdir starting at the last cookie.
+	 */
 	entry[0] = entry[1] = 0;
-	/* truncate listing ? */
-	if (!nr) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
+	if (!nr)
+		nr = -errno_NFSERR_IO;
 	goto out;
 err_unmap:
 	nr = -errno_NFSERR_IO;
-- 
cgit v1.2.3


From 643f81115baca3630e544f6874567648b605efae Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 22 Feb 2008 14:50:00 -0500
Subject: NFS: clean up short packet handling for NFSv3 readdir

Currently, the NFS readdir decoders have a workaround for buggy servers
that send an empty readdir response with the EOF bit unset. If the
server sends a malformed response in some cases, this workaround kicks
in and just returns an empty response rather than returning a proper
error to the caller.

This patch does 3 things:

1) have malformed responses with no entries return error (-EIO)

2) preserve existing workaround for servers that send empty
   responses with the EOF marker unset.

3) Add some comments to clarify the logic in nfs3_xdr_readdirres().

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3xdr.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 3917e2fa4e4..fb03048ac65 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -508,7 +508,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 	struct page **page;
 	size_t hdrlen;
 	u32 len, recvd, pglen;
-	int status, nr;
+	int status, nr = 0;
 	__be32 *entry, *end, *kaddr;
 
 	status = ntohl(*p++);
@@ -542,7 +542,12 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 	kaddr = p = kmap_atomic(*page, KM_USER0);
 	end = (__be32 *)((char *)p + pglen);
 	entry = p;
-	for (nr = 0; *p++; nr++) {
+
+	/* Make sure the packet actually has a value_follows and EOF entry */
+	if ((entry + 1) > end)
+		goto short_pkt;
+
+	for (; *p++; nr++) {
 		if (p + 3 > end)
 			goto short_pkt;
 		p += 2;				/* inode # */
@@ -581,18 +586,32 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 			goto short_pkt;
 		entry = p;
 	}
-	if (!nr && (entry[0] != 0 || entry[1] == 0))
-		goto short_pkt;
+
+	/*
+	 * Apparently some server sends responses that are a valid size, but
+	 * contain no entries, and have value_follows==0 and EOF==0. For
+	 * those, just set the EOF marker.
+	 */
+	if (!nr && entry[1] == 0) {
+		dprintk("NFS: readdir reply truncated!\n");
+		entry[1] = 1;
+	}
  out:
 	kunmap_atomic(kaddr, KM_USER0);
 	return nr;
  short_pkt:
+	/*
+	 * When we get a short packet there are 2 possibilities. We can
+	 * return an error, or fix up the response to look like a valid
+	 * response and return what we have so far. If there are no
+	 * entries and the packet was short, then return -EIO. If there
+	 * are valid entries in the response, return them and pretend that
+	 * the call was successful, but incomplete. The caller can retry the
+	 * readdir starting at the last cookie.
+	 */
 	entry[0] = entry[1] = 0;
-	/* truncate listing ? */
-	if (!nr) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
+	if (!nr)
+		nr = -errno_NFSERR_IO;
 	goto out;
 err_unmap:
 	nr = -errno_NFSERR_IO;
-- 
cgit v1.2.3


From 7bda2cdf484a00e52b0ed925e99d4bf4696b2c7a Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 22 Feb 2008 14:50:01 -0500
Subject: NFS: clean up short packet handling for NFSv4 readdir

Currently, the NFS readdir decoders have a workaround for buggy servers
that send an empty readdir response with the EOF bit unset. If the
server sends a malformed response in some cases, this workaround kicks
in and just returns an empty response rather than returning a proper
error to the caller.

This patch does 3 things:

1) have malformed responses with no entries return error (-EIO)

2) preserve existing workaround for servers that send empty
   responses with the EOF marker unset.

3) Add some comments to clarify the logic in decode_readdir().

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 37421dd4805..2b519f6325f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3481,7 +3481,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	size_t		hdrlen;
 	u32		recvd, pglen = rcvbuf->page_len;
 	__be32		*end, *entry, *p, *kaddr;
-	unsigned int	nr;
+	unsigned int	nr = 0;
 	int		status;
 
 	status = decode_op_hdr(xdr, OP_READDIR);
@@ -3505,7 +3505,12 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	kaddr = p = kmap_atomic(page, KM_USER0);
 	end = p + ((pglen + readdir->pgbase) >> 2);
 	entry = p;
-	for (nr = 0; *p++; nr++) {
+
+	/* Make sure the packet actually has a value_follows and EOF entry */
+	if ((entry + 1) > end)
+		goto short_pkt;
+
+	for (; *p++; nr++) {
 		u32 len, attrlen, xlen;
 		if (end - p < 3)
 			goto short_pkt;
@@ -3532,20 +3537,32 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 		p += attrlen;		/* attributes */
 		entry = p;
 	}
-	if (!nr && (entry[0] != 0 || entry[1] == 0))
-		goto short_pkt;
+	/*
+	 * Apparently some server sends responses that are a valid size, but
+	 * contain no entries, and have value_follows==0 and EOF==0. For
+	 * those, just set the EOF marker.
+	 */
+	if (!nr && entry[1] == 0) {
+		dprintk("NFS: readdir reply truncated!\n");
+		entry[1] = 1;
+	}
 out:	
 	kunmap_atomic(kaddr, KM_USER0);
 	return 0;
 short_pkt:
+	/*
+	 * When we get a short packet there are 2 possibilities. We can
+	 * return an error, or fix up the response to look like a valid
+	 * response and return what we have so far. If there are no
+	 * entries and the packet was short, then return -EIO. If there
+	 * are valid entries in the response, return them and pretend that
+	 * the call was successful, but incomplete. The caller can retry the
+	 * readdir starting at the last cookie.
+	 */
 	dprintk("%s: short packet at entry %d\n", __FUNCTION__, nr);
 	entry[0] = entry[1] = 0;
-	/* truncate listing ? */
-	if (!nr) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
-	goto out;
+	if (nr)
+		goto out;
 err_unmap:
 	kunmap_atomic(kaddr, KM_USER0);
 	return -errno_NFSERR_IO;
-- 
cgit v1.2.3


From 2d76743227028a26f884284aade03d1e43f4f249 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:10:08 -0400
Subject: NFS: numeric mount parameters are unsigned

Clean up: use %u instead of %d when displaying NFS mount options.

Nit: Fix reporting of "namlen=" option in nfs_show_mount_stats.  The mount
option is called "namlen" without the "e".

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index fcf4b982c88..53a67c6d4d2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -463,17 +463,17 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	const struct proc_nfs_info *nfs_infop;
 	struct nfs_client *clp = nfss->nfs_client;
 
-	seq_printf(m, ",vers=%d", clp->rpc_ops->version);
-	seq_printf(m, ",rsize=%d", nfss->rsize);
-	seq_printf(m, ",wsize=%d", nfss->wsize);
+	seq_printf(m, ",vers=%u", clp->rpc_ops->version);
+	seq_printf(m, ",rsize=%u", nfss->rsize);
+	seq_printf(m, ",wsize=%u", nfss->wsize);
 	if (nfss->acregmin != 3*HZ || showdefaults)
-		seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
+		seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
 	if (nfss->acregmax != 60*HZ || showdefaults)
-		seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
+		seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ);
 	if (nfss->acdirmin != 30*HZ || showdefaults)
-		seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
+		seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
 	if (nfss->acdirmax != 60*HZ || showdefaults)
-		seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
+		seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
 	for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
 		if (nfss->flags & nfs_infop->flag)
 			seq_puts(m, nfs_infop->str);
@@ -529,10 +529,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 
 	seq_printf(m, "\n\tcaps:\t");
 	seq_printf(m, "caps=0x%x", nfss->caps);
-	seq_printf(m, ",wtmult=%d", nfss->wtmult);
-	seq_printf(m, ",dtsize=%d", nfss->dtsize);
-	seq_printf(m, ",bsize=%d", nfss->bsize);
-	seq_printf(m, ",namelen=%d", nfss->namelen);
+	seq_printf(m, ",wtmult=%u", nfss->wtmult);
+	seq_printf(m, ",dtsize=%u", nfss->dtsize);
+	seq_printf(m, ",bsize=%u", nfss->bsize);
+	seq_printf(m, ",namlen=%u", nfss->namelen);
 
 #ifdef CONFIG_NFS_V4
 	if (nfss->nfs_client->rpc_ops->version == 4) {
@@ -546,9 +546,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 	/*
 	 * Display security flavor in effect for this mount
 	 */
-	seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
+	seq_printf(m, "\n\tsec:\tflavor=%u", auth->au_ops->au_flavor);
 	if (auth->au_flavor)
-		seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
+		seq_printf(m, ",pseudoflavor=%u", auth->au_flavor);
 
 	/*
 	 * Display superblock I/O counters
-- 
cgit v1.2.3


From 78fa701f341564e60461de91cd08ff5f7fb09b31 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:10:15 -0400
Subject: NFS: Fix up data types of fields in nfs_parsed_mount_options

Clean up: make data types of fields in nfs_parsed_mount_options more
consistent with other uses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4c1122a1384..e89688a955b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -46,9 +46,9 @@ struct nfs_parsed_mount_data {
 		struct sockaddr_storage	address;
 		size_t			addrlen;
 		char			*hostname;
-		unsigned int		version;
+		u32			version;
 		unsigned short		port;
-		int			protocol;
+		unsigned short		protocol;
 	} mount_server;
 
 	struct {
@@ -56,7 +56,7 @@ struct nfs_parsed_mount_data {
 		size_t			addrlen;
 		char			*hostname;
 		char			*export_path;
-		int			protocol;
+		unsigned short		protocol;
 	} nfs_server;
 
 	struct security_mnt_opts lsm_opts;
-- 
cgit v1.2.3


From f22d6d79fe227245363a8849ea8c85fe6c6598c3 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:10:22 -0400
Subject: NFS: Save the value of the "port=" mount option

During a remount based on the mount options displayed in /proc/mounts, we
want to preserve the original behavior of the mount request.  Let's save
the original setting of the "port=" mount option in the mount's nfs_server
structure.

This allows us to simplify the default behavior of port setting for NFSv4
mounts: by default, NFSv2/3 mounts first try an RPC bind to determine the
NFS server's port, unless the user specified the "port=" mount option;
Users can force the client to skip the RPC bind by explicitly specifying
"port=<value>".

NFSv4, by contrast, assumes the NFS server port is 2049 and skips the RPC
bind, unless the user specifies "port=".  Users can force an RPC bind for
NFSv4 by explicitly specifying "port=0".

I added a couple of extra comments to clarify this behavior.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           |  4 ++++
 fs/nfs/internal.h         |  1 +
 fs/nfs/super.c            | 37 ++++++-------------------------------
 include/linux/nfs_fs_sb.h |  1 +
 4 files changed, 12 insertions(+), 31 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 06f064d8fbb..874018113d0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -682,6 +682,8 @@ static int nfs_init_server(struct nfs_server *server,
 	if (error < 0)
 		goto error;
 
+	server->port = data->nfs_server.port;
+
 	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
 	if (error < 0)
 		goto error;
@@ -1064,6 +1066,8 @@ static int nfs4_init_server(struct nfs_server *server,
 	server->acdirmin = data->acdirmin * HZ;
 	server->acdirmax = data->acdirmax * HZ;
 
+	server->port = data->nfs_server.port;
+
 	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
 
 error:
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e89688a955b..999ad8ee064 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -56,6 +56,7 @@ struct nfs_parsed_mount_data {
 		size_t			addrlen;
 		char			*hostname;
 		char			*export_path;
+		unsigned short		port;
 		unsigned short		protocol;
 	} nfs_server;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 53a67c6d4d2..3c6f53aa731 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -685,7 +685,6 @@ static int nfs_parse_mount_options(char *raw,
 				   struct nfs_parsed_mount_data *mnt)
 {
 	char *p, *string, *secdata;
-	unsigned short port = 0;
 	int rc;
 
 	if (!raw) {
@@ -800,7 +799,7 @@ static int nfs_parse_mount_options(char *raw,
 				return 0;
 			if (option < 0 || option > 65535)
 				return 0;
-			port = option;
+			mnt->nfs_server.port = option;
 			break;
 		case Opt_rsize:
 			if (match_int(args, &mnt->rsize))
@@ -1050,7 +1049,8 @@ static int nfs_parse_mount_options(char *raw,
 		}
 	}
 
-	nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, port);
+	nfs_set_port((struct sockaddr *)&mnt->nfs_server.address,
+				mnt->nfs_server.port);
 
 	return 1;
 
@@ -1171,7 +1171,9 @@ static int nfs_validate_mount_data(void *options,
 	args->acregmax		= 60;
 	args->acdirmin		= 30;
 	args->acdirmax		= 60;
+	args->mount_server.port	= 0;	/* autobind unless user sets port */
 	args->mount_server.protocol = XPRT_TRANSPORT_UDP;
+	args->nfs_server.port	= 0;	/* autobind unless user sets port */
 	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 
 	switch (data->version) {
@@ -1707,28 +1709,6 @@ static void nfs4_fill_super(struct super_block *sb)
 	nfs_initialise_sb(sb);
 }
 
-/*
- * If the user didn't specify a port, set the port number to
- * the NFS version 4 default port.
- */
-static void nfs4_default_port(struct sockaddr *sap)
-{
-	switch (sap->sa_family) {
-	case AF_INET: {
-		struct sockaddr_in *ap = (struct sockaddr_in *)sap;
-		if (ap->sin_port == 0)
-			ap->sin_port = htons(NFS_PORT);
-		break;
-	}
-	case AF_INET6: {
-		struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
-		if (ap->sin6_port == 0)
-			ap->sin6_port = htons(NFS_PORT);
-		break;
-	}
-	}
-}
-
 /*
  * Validate NFSv4 mount options
  */
@@ -1753,6 +1733,7 @@ static int nfs4_validate_mount_data(void *options,
 	args->acregmax		= 60;
 	args->acdirmin		= 30;
 	args->acdirmax		= 60;
+	args->nfs_server.port	= NFS_PORT; /* 2049 unless user set port= */
 	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 
 	switch (data->version) {
@@ -1769,9 +1750,6 @@ static int nfs4_validate_mount_data(void *options,
 						&args->nfs_server.address))
 			goto out_no_address;
 
-		nfs4_default_port((struct sockaddr *)
-				  &args->nfs_server.address);
-
 		switch (data->auth_flavourlen) {
 		case 0:
 			args->auth_flavors[0] = RPC_AUTH_UNIX;
@@ -1829,9 +1807,6 @@ static int nfs4_validate_mount_data(void *options,
 						&args->nfs_server.address))
 			return -EINVAL;
 
-		nfs4_default_port((struct sockaddr *)
-				  &args->nfs_server.address);
-
 		switch (args->auth_flavor_len) {
 		case 0:
 			args->auth_flavors[0] = RPC_AUTH_UNIX;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 3423c6761bf..670e5c7222d 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -93,6 +93,7 @@ struct nfs_server {
 	unsigned int		wpages;		/* write size (in pages) */
 	unsigned int		wtmult;		/* server disk block size */
 	unsigned int		dtsize;		/* readdir size */
+	unsigned short		port;		/* "port=" setting */
 	unsigned int		bsize;		/* server block size */
 	unsigned int		acregmin;	/* attr cache timeouts */
 	unsigned int		acregmax;
-- 
cgit v1.2.3


From 3f8400d1f1f9d5fb175bdbf6236e564dde454f28 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:10:30 -0400
Subject: NFS: Save the values of the "mount*=" mount options

Save the value of the mountproto= mountport= mountvers= and mountaddr=
options so that these values can be displayed later via
nfs_show_options().

This preserves the intent of the original mount options, should the file
system need to be remounted based on what's displayed in /proc/mounts.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 10 ++++++++++
 include/linux/nfs_fs_sb.h |  7 +++++++
 2 files changed, 17 insertions(+)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 874018113d0..93dfd75aba7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -688,6 +688,16 @@ static int nfs_init_server(struct nfs_server *server,
 	if (error < 0)
 		goto error;
 
+	/* Preserve the values of mount_server-related mount options */
+	if (data->mount_server.addrlen) {
+		memcpy(&server->mountd_address, &data->mount_server.address,
+			data->mount_server.addrlen);
+		server->mountd_addrlen = data->mount_server.addrlen;
+	}
+	server->mountd_version = data->mount_server.version;
+	server->mountd_port = data->mount_server.port;
+	server->mountd_protocol = data->mount_server.protocol;
+
 	server->namelen  = data->namlen;
 	/* Create a client RPC handle for the NFSv3 ACL management interface */
 	nfs_init_server_aclclient(server);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 670e5c7222d..ac7e4fb943e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -118,6 +118,13 @@ struct nfs_server {
 
 	atomic_t active; /* Keep trace of any activity to this server */
 	wait_queue_head_t active_wq;  /* Wait for any activity to stop  */
+
+	/* mountd-related mount options */
+	struct sockaddr_storage	mountd_address;
+	size_t			mountd_addrlen;
+	u32			mountd_version;
+	unsigned short		mountd_port;
+	unsigned short		mountd_protocol;
 };
 
 /* Server capabilities */
-- 
cgit v1.2.3


From 82d101d58a2312297ee79f96d44c1d8c7fe1032d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:10:37 -0400
Subject: NFS: Show most mount options via nfs_show_options()

Display all mount options in /proc/mount which may be needed to reconstruct
a previous mount.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3c6f53aa731..c99ca1f992c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -441,10 +441,52 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
 	return sec_flavours[i].str;
 }
 
+static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
+				    int showdefaults)
+{
+	struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
+
+	switch (sap->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+		seq_printf(m, ",mountaddr=" NIPQUAD_FMT,
+				NIPQUAD(sin->sin_addr.s_addr));
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+		seq_printf(m, ",mountaddr=" NIP6_FMT,
+				NIP6(sin6->sin6_addr));
+		break;
+	}
+	default:
+		if (showdefaults)
+			seq_printf(m, ",mountaddr=unspecified");
+	}
+
+	if (nfss->mountd_version || showdefaults)
+		seq_printf(m, ",mountvers=%u", nfss->mountd_version);
+	if (nfss->mountd_port || showdefaults)
+		seq_printf(m, ",mountport=%u", nfss->mountd_port);
+
+	switch (nfss->mountd_protocol) {
+	case IPPROTO_UDP:
+		seq_printf(m, ",mountproto=udp");
+		break;
+	case IPPROTO_TCP:
+		seq_printf(m, ",mountproto=tcp");
+		break;
+	default:
+		if (showdefaults)
+			seq_printf(m, ",mountproto=auto");
+	}
+}
+
 /*
  * Describe the mount options in force on this server representation
  */
-static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
+static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
+				   int showdefaults)
 {
 	static const struct proc_nfs_info {
 		int flag;
@@ -452,6 +494,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		const char *nostr;
 	} nfs_info[] = {
 		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
+		{ NFS_MOUNT_INTR, ",intr", ",nointr" },
+		{ NFS_MOUNT_POSIX, ",posix", "" },
 		{ NFS_MOUNT_NOCTO, ",nocto", "" },
 		{ NFS_MOUNT_NOAC, ",noac", "" },
 		{ NFS_MOUNT_NONLM, ",nolock", "" },
@@ -462,10 +506,14 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	};
 	const struct proc_nfs_info *nfs_infop;
 	struct nfs_client *clp = nfss->nfs_client;
+	u32 version = clp->rpc_ops->version;
 
-	seq_printf(m, ",vers=%u", clp->rpc_ops->version);
+	seq_printf(m, ",vers=%u", version);
 	seq_printf(m, ",rsize=%u", nfss->rsize);
 	seq_printf(m, ",wsize=%u", nfss->wsize);
+	if (nfss->bsize != 0)
+		seq_printf(m, ",bsize=%u", nfss->bsize);
+	seq_printf(m, ",namlen=%u", nfss->namelen);
 	if (nfss->acregmin != 3*HZ || showdefaults)
 		seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
 	if (nfss->acregmax != 60*HZ || showdefaults)
@@ -482,9 +530,24 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	}
 	seq_printf(m, ",proto=%s",
 		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
+	if (version == 4) {
+		if (nfss->port != NFS_PORT)
+			seq_printf(m, ",port=%u", nfss->port);
+	} else
+		if (nfss->port)
+			seq_printf(m, ",port=%u", nfss->port);
+
 	seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ);
 	seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries);
 	seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
+
+	if (version != 4)
+		nfs_show_mountd_options(m, nfss, showdefaults);
+
+#ifdef CONFIG_NFS_V4
+	if (clp->rpc_ops->version == 4)
+		seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
+#endif
 }
 
 /*
-- 
cgit v1.2.3


From ecfc555a8327ff09b07066d73a98c04115007eec Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:14:56 -0400
Subject: NFS: Always enable NFS direct I/O

Since O_DIRECT is a standard feature that is enabled in most distros,
eliminate the CONFIG_NFS_DIRECTIO build option, and change the
fs/nfs/Makefile to always build in the NFS direct I/O engine.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/Kconfig        | 24 ------------------------
 fs/nfs/Makefile   |  3 +--
 fs/nfs/file.c     |  6 ------
 fs/nfs/internal.h |  5 -----
 4 files changed, 1 insertion(+), 37 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index d7312825592..55c1a0374e1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1637,30 +1637,6 @@ config NFS_V4
 
 	  If unsure, say N.
 
-config NFS_DIRECTIO
-	bool "Allow direct I/O on NFS files"
-	depends on NFS_FS
-	help
-	  This option enables applications to perform uncached I/O on files
-	  in NFS file systems using the O_DIRECT open() flag.  When O_DIRECT
-	  is set for a file, its data is not cached in the system's page
-	  cache.  Data is moved to and from user-level application buffers
-	  directly.  Unlike local disk-based file systems, NFS O_DIRECT has
-	  no alignment restrictions.
-
-	  Unless your program is designed to use O_DIRECT properly, you are
-	  much better off allowing the NFS client to manage data caching for
-	  you.  Misusing O_DIRECT can cause poor server performance or network
-	  storms.  This kernel build option defaults OFF to avoid exposing
-	  system administrators unwittingly to a potentially hazardous
-	  feature.
-
-	  For more details on NFS O_DIRECT, see fs/nfs/direct.c.
-
-	  If unsure, say N.  This reduces the size of the NFS client, and
-	  causes open() to return EINVAL if a file residing in NFS is
-	  opened with the O_DIRECT flag.
-
 config NFSD
 	tristate "NFS server support"
 	depends on INET
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index df0f41e0988..ac6170c594a 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_NFS_FS) += nfs.o
 
 nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
-			   pagelist.o proc.o read.o symlink.o unlink.o \
+			   direct.o pagelist.o proc.o read.o symlink.o unlink.o \
 			   write.o namespace.o mount_clnt.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
@@ -14,5 +14,4 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 			   delegation.o idmap.o \
 			   callback.o callback_xdr.o callback_proc.o \
 			   nfs4namespace.o
-nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ef57a5ae590..10e8b807e7f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -234,10 +234,8 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t result;
 	size_t count = iov_length(iov, nr_segs);
 
-#ifdef CONFIG_NFS_DIRECTIO
 	if (iocb->ki_filp->f_flags & O_DIRECT)
 		return nfs_file_direct_read(iocb, iov, nr_segs, pos);
-#endif
 
 	dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -383,9 +381,7 @@ const struct address_space_operations nfs_file_aops = {
 	.write_end = nfs_write_end,
 	.invalidatepage = nfs_invalidate_page,
 	.releasepage = nfs_release_page,
-#ifdef CONFIG_NFS_DIRECTIO
 	.direct_IO = nfs_direct_IO,
-#endif
 	.launder_page = nfs_launder_page,
 };
 
@@ -443,10 +439,8 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t result;
 	size_t count = iov_length(iov, nr_segs);
 
-#ifdef CONFIG_NFS_DIRECTIO
 	if (iocb->ki_filp->f_flags & O_DIRECT)
 		return nfs_file_direct_write(iocb, iov, nr_segs, pos);
-#endif
 
 	dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 999ad8ee064..04ae867dddb 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -116,13 +116,8 @@ extern void nfs_destroy_readpagecache(void);
 extern int __init nfs_init_writepagecache(void);
 extern void nfs_destroy_writepagecache(void);
 
-#ifdef CONFIG_NFS_DIRECTIO
 extern int __init nfs_init_directcache(void);
 extern void nfs_destroy_directcache(void);
-#else
-#define nfs_init_directcache() (0)
-#define nfs_destroy_directcache() do {} while(0)
-#endif
 
 /* nfs2xdr.c */
 extern int nfs_stat_to_errno(int);
-- 
cgit v1.2.3


From 327a299d8abd0f4580098cbd6c58b0f017417005 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:15:11 -0400
Subject: SUNRPC: Update help Kconfig text

Clean up: refresh the help text for Kconfig items related to the sunrpc
module.  Remove obsolete URLs, and make the language consistent among
the options.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/Kconfig | 45 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 55c1a0374e1..83201856b36 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1757,15 +1757,32 @@ config SUNRPC_XPRT_RDMA
 	tristate
 	depends on SUNRPC && INFINIBAND && EXPERIMENTAL
 	default SUNRPC && INFINIBAND
+	help
+	  This option enables an RPC client transport capability that
+	  allows the NFS client to mount servers via an RDMA-enabled
+	  transport.
+
+	  To compile RPC client RDMA transport support as a module,
+	  choose M here: the module will be called xprtrdma.
+
+	  If unsure, say N.
 
 config SUNRPC_BIND34
 	bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
 	help
-	  Provides kernel support for querying rpcbind servers via versions 3
-	  and 4 of the rpcbind protocol.  The kernel automatically falls back
-	  to version 2 if a remote rpcbind service does not support versions
-	  3 or 4.
+	  RPC requests over IPv6 networks require support for larger
+	  addresses when performing an RPC bind.  Sun added support for
+	  IPv6 addressing by creating two new versions of the rpcbind
+	  protocol (RFC 1833).
+
+	  This option enables support in the kernel RPC client for
+	  querying rpcbind servers via versions 3 and 4 of the rpcbind
+	  protocol.  The kernel automatically falls back to version 2
+	  if a remote rpcbind service does not support versions 3 or 4.
+	  By themselves, these new versions do not provide support for
+	  RPC over IPv6, but the new protocol versions are necessary to
+	  support it.
 
 	  If unsure, say N to get traditional behavior (version 2 rpcbind
 	  requests only).
@@ -1779,12 +1796,13 @@ config RPCSEC_GSS_KRB5
 	select CRYPTO_DES
 	select CRYPTO_CBC
 	help
-	  Provides for secure RPC calls by means of a gss-api
-	  mechanism based on Kerberos V5. This is required for
-	  NFSv4.
+	  Choose Y here to enable Secure RPC using the Kerberos version 5
+	  GSS-API mechanism (RFC 1964).
 
-	  Note: Requires an auxiliary userspace daemon which may be found on
-		http://www.citi.umich.edu/projects/nfsv4/
+	  Secure RPC calls with Kerberos require an auxiliary user-space
+	  daemon which may be found in the Linux nfs-utils package
+	  available from http://linux-nfs.org/.  In addition, user-space
+	  Kerberos support should be installed.
 
 	  If unsure, say N.
 
@@ -1798,11 +1816,12 @@ config RPCSEC_GSS_SPKM3
 	select CRYPTO_CAST5
 	select CRYPTO_CBC
 	help
-	  Provides for secure RPC calls by means of a gss-api
-	  mechanism based on the SPKM3 public-key mechanism.
+	  Choose Y here to enable Secure RPC using the SPKM3 public key
+	  GSS-API mechansim (RFC 2025).
 
-	  Note: Requires an auxiliary userspace daemon which may be found on
-	  	http://www.citi.umich.edu/projects/nfsv4/
+	  Secure RPC calls with SPKM3 require an auxiliary userspace
+	  daemon which may be found in the Linux nfs-utils package
+	  available from http://linux-nfs.org/.
 
 	  If unsure, say N.
 
-- 
cgit v1.2.3


From 1e40316bc44851c5920490df01482ed862e4dbc4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:15:18 -0400
Subject: SUNRPC: Add a default setting for CONFIG_SUNRPC_BIND34

Most distros will want support for rpcbind protocols 3 and 4 to default off
until they have integrated user-space support for the new rpcbind daemon
which supports IPv6 RPC services.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/Kconfig b/fs/Kconfig
index 83201856b36..4115deb567e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1770,6 +1770,7 @@ config SUNRPC_XPRT_RDMA
 config SUNRPC_BIND34
 	bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
+	default n
 	help
 	  RPC requests over IPv6 networks require support for larger
 	  addresses when performing an RPC bind.  Sun added support for
-- 
cgit v1.2.3


From 90d5b18061656993410dfd57ddb88aa5a3f34563 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:18:30 -0400
Subject: NLM: LOCKD fails to load if CONFIG_SYSCTL is not set

Bruce Fields says:
"By the way, we've got another config-related nit here:

	http://bugzilla.linux-nfs.org/show_bug.cgi?id=156

You can build lockd without CONFIG_SYSCTL set, but then the module will
fail to load."

For now, disable the sysctl registration calls in lockd if CONFIG_SYSCTL
is not enabled.  This allows the kernel to build properly if PROC_FS or
SYSCTL is not enabled, but an NFS client is desired.

In the long run, we would like to be able to build the kernel with an
NFS client but without lockd.  This makes sense, for example, if you want
an NFSv4-only NFS client, as NFSv4 doesn't use NLM at all.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svc.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 1ed8bd4de94..38c2f0b1dd7 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -74,7 +74,9 @@ static const unsigned long	nlm_timeout_min = 3;
 static const unsigned long	nlm_timeout_max = 20;
 static const int		nlm_port_min = 0, nlm_port_max = 65535;
 
+#ifdef CONFIG_SYSCTL
 static struct ctl_table_header * nlm_sysctl_table;
+#endif
 
 static unsigned long get_lockd_grace_period(void)
 {
@@ -359,6 +361,8 @@ out:
 }
 EXPORT_SYMBOL(lockd_down);
 
+#ifdef CONFIG_SYSCTL
+
 /*
  * Sysctl parameters (same as module parameters, different interface).
  */
@@ -443,6 +447,8 @@ static ctl_table nlm_sysctl_root[] = {
 	{ .ctl_name = 0 }
 };
 
+#endif	/* CONFIG_SYSCTL */
+
 /*
  * Module (and sysfs) parameters.
  */
@@ -516,15 +522,21 @@ module_param(nsm_use_hostnames, bool, 0644);
 
 static int __init init_nlm(void)
 {
+#ifdef CONFIG_SYSCTL
 	nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root);
 	return nlm_sysctl_table ? 0 : -ENOMEM;
+#else
+	return 0;
+#endif
 }
 
 static void __exit exit_nlm(void)
 {
 	/* FIXME: delete all NLM clients */
 	nlm_shutdown_hosts();
+#ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(nlm_sysctl_table);
+#endif
 }
 
 module_init(init_nlm);
-- 
cgit v1.2.3


From eb18860e1385bfc7f08fcb7ba362e4a5156c8324 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:18:37 -0400
Subject: NLM: NLM protocol version numbers are u32

Clean up: RPC protocol version numbers are u32.  Make sure we use an
appropriate type for NLM version numbers when calling nlm_lookup_host().

Eliminates a harmless mixed sign comparison in nlm_host_lookup().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c             | 20 +++++++++++---------
 include/linux/lockd/lockd.h |  6 ++++--
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index f1ef49fff11..f23750db165 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -42,11 +42,12 @@ static struct nsm_handle *	nsm_find(const struct sockaddr_in *sin,
 /*
  * Common host lookup routine for server & client
  */
-static struct nlm_host *
-nlm_lookup_host(int server, const struct sockaddr_in *sin,
-		int proto, int version, const char *hostname,
-		unsigned int hostname_len,
-		const struct sockaddr_in *ssin)
+static struct nlm_host *nlm_lookup_host(int server,
+					const struct sockaddr_in *sin,
+					int proto, u32 version,
+					const char *hostname,
+					unsigned int hostname_len,
+					const struct sockaddr_in *ssin)
 {
 	struct hlist_head *chain;
 	struct hlist_node *pos;
@@ -55,7 +56,7 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin,
 	int		hash;
 
 	dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
-			", p=%d, v=%d, my role=%s, name=%.*s)\n",
+			", p=%d, v=%u, my role=%s, name=%.*s)\n",
 			NIPQUAD(ssin->sin_addr.s_addr),
 			NIPQUAD(sin->sin_addr.s_addr), proto, version,
 			server? "server" : "client",
@@ -175,9 +176,10 @@ nlm_destroy_host(struct nlm_host *host)
 /*
  * Find an NLM server handle in the cache. If there is none, create it.
  */
-struct nlm_host *
-nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
-			const char *hostname, unsigned int hostname_len)
+struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin,
+				     int proto, u32 version,
+				     const char *hostname,
+				     unsigned int hostname_len)
 {
 	struct sockaddr_in ssin = {0};
 
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 4babb2a129a..d1559501305 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -173,8 +173,10 @@ void		  nlmclnt_next_cookie(struct nlm_cookie *);
 /*
  * Host cache
  */
-struct nlm_host  *nlmclnt_lookup_host(const struct sockaddr_in *, int, int,
-					const char *, unsigned int);
+struct nlm_host  *nlmclnt_lookup_host(const struct sockaddr_in *sin,
+					int proto, u32 version,
+					const char *hostname,
+					unsigned int hostname_len);
 struct nlm_host  *nlmsvc_lookup_host(struct svc_rqst *, const char *,
 					unsigned int);
 struct rpc_clnt * nlm_bind_host(struct nlm_host *);
-- 
cgit v1.2.3


From f34ec991ae0f015f5cdc51ad46c3a317ffae2466 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:18:45 -0400
Subject: lockd: bring a few function declarations up to date

Clean-up: replace  __inline__ and use up-to-date function declaration
conventions.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/lockd/lockd.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index d1559501305..acf39e1e3a3 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -219,8 +219,7 @@ void		  nlmsvc_mark_resources(void);
 void		  nlmsvc_free_host_resources(struct nlm_host *);
 void		  nlmsvc_invalidate_all(void);
 
-static __inline__ struct inode *
-nlmsvc_file_inode(struct nlm_file *file)
+static inline struct inode *nlmsvc_file_inode(struct nlm_file *file)
 {
 	return file->f_file->f_path.dentry->d_inode;
 }
@@ -228,8 +227,8 @@ nlmsvc_file_inode(struct nlm_file *file)
 /*
  * Compare two host addresses (needs modifying for ipv6)
  */
-static __inline__ int
-nlm_cmp_addr(const struct sockaddr_in *sin1, const struct sockaddr_in *sin2)
+static inline int nlm_cmp_addr(const struct sockaddr_in *sin1,
+			       const struct sockaddr_in *sin2)
 {
 	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
 }
@@ -238,8 +237,8 @@ nlm_cmp_addr(const struct sockaddr_in *sin1, const struct sockaddr_in *sin2)
  * Compare two NLM locks.
  * When the second lock is of type F_UNLCK, this acts like a wildcard.
  */
-static __inline__ int
-nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2)
+static inline int nlm_compare_locks(const struct file_lock *fl1,
+				    const struct file_lock *fl2)
 {
 	return	fl1->fl_pid   == fl2->fl_pid
 	     && fl1->fl_owner == fl2->fl_owner
-- 
cgit v1.2.3


From 099bd05f27ff24a3041d54e7ed42d2eb681484b9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:25:32 -0400
Subject: lockd: Ensure NSM strings aren't longer than protocol allows

Introduce a special helper function to check the length of NSM strings
before they are placed on the wire.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 908b23fadd0..84fd84cb67b 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -149,6 +149,15 @@ nsm_create(void)
  * XDR functions for NSM.
  */
 
+static __be32 *xdr_encode_nsm_string(__be32 *p, char *string)
+{
+	size_t len = strlen(string);
+
+	if (len > SM_MAXSTRLEN)
+		len = SM_MAXSTRLEN;
+	return xdr_encode_opaque(p, string, len);
+}
+
 static __be32 *
 xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 {
-- 
cgit v1.2.3


From 496951743100b2d2ccd8b268257e79425e489851 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:25:39 -0400
Subject: lockd: refactor SM_MON mon_name argument encoder

Clean up: introduce a new XDR encoder specifically for the mon_name
argument of SM_MON requests.  This will be updated later to support IPv6
addresses in addition to IPv4 addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 84fd84cb67b..87bd4e4d65d 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -18,6 +18,8 @@
 
 #define NLMDBG_FACILITY		NLMDBG_MONITOR
 
+#define XDR_ADDRBUF_LEN		(20)
+
 static struct rpc_clnt *	nsm_create(void);
 
 static struct rpc_program	nsm_program;
@@ -158,6 +160,29 @@ static __be32 *xdr_encode_nsm_string(__be32 *p, char *string)
 	return xdr_encode_opaque(p, string, len);
 }
 
+/*
+ * "mon_name" specifies the host to be monitored.
+ *
+ * Linux uses a text version of the IP address of the remote
+ * host as the host identifier (the "mon_name" argument).
+ *
+ * Linux statd always looks up the canonical hostname first for
+ * whatever remote hostname it receives, so this works alright.
+ */
+static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp)
+{
+	char	buffer[XDR_ADDRBUF_LEN + 1];
+	char	*name = argp->mon_name;
+
+	if (!nsm_use_hostnames) {
+		snprintf(buffer, XDR_ADDRBUF_LEN,
+			 NIPQUAD_FMT, NIPQUAD(argp->addr));
+		name = buffer;
+	}
+
+	return xdr_encode_nsm_string(p, name);
+}
+
 static __be32 *
 xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 {
-- 
cgit v1.2.3


From 850c95fd07b63704d06909c9a081c51272d32db1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:25:46 -0400
Subject: lockd: refactor SM_MON my_id argument encoder

Clean up: introduce a new XDR encoder specifically for the my_id
argument of SM_MON requests.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 87bd4e4d65d..06216d6715f 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -183,6 +183,25 @@ static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp)
 	return xdr_encode_nsm_string(p, name);
 }
 
+/*
+ * The "my_id" argument specifies the hostname and RPC procedure
+ * to be called when the status manager receives notification
+ * (via the SM_NOTIFY call) that the state of host "mon_name"
+ * has changed.
+ */
+static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp)
+{
+	p = xdr_encode_nsm_string(p, utsname()->nodename);
+	if (!p)
+		return ERR_PTR(-EIO);
+
+	*p++ = htonl(argp->prog);
+	*p++ = htonl(argp->vers);
+	*p++ = htonl(argp->proc);
+
+	return p;
+}
+
 static __be32 *
 xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 {
-- 
cgit v1.2.3


From ea72a7f170e686baf00ceee57b6197bef686889c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:25:53 -0400
Subject: lockd: document use of mon_id argument in SM_MON requests

Clean up: document the argument type that xdr_encode_common() is
marshalling by introducing a new function.  The new function will replace
xdr_encode_common() in just a sec.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 06216d6715f..3935d7b604f 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -202,6 +202,19 @@ static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp)
 	return p;
 }
 
+/*
+ * The "mon_id" argument specifies the non-private arguments
+ * of an SM_MON or SM_UNMON call.
+ */
+static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp)
+{
+	p = xdr_encode_mon_name(p, argp);
+	if (!p)
+		return ERR_PTR(-EIO);
+
+	return xdr_encode_my_id(p, argp);
+}
+
 static __be32 *
 xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 {
-- 
cgit v1.2.3


From 2ca7754d4c96d68e1475690422a202ba5f0443d8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:26:01 -0400
Subject: lockd: Fix up incorrect RPC buffer size calculations.

Switch to using the new mon_id encoder function.

Now that we've refactored the encoding of SM_MON requests, we've
discovered that the pre-computed buffer length maximums are
incorrect!

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c | 39 ++++++++-------------------------------
 1 file changed, 8 insertions(+), 31 deletions(-)

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3935d7b604f..f2507fec7c6 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -149,6 +149,9 @@ nsm_create(void)
 
 /*
  * XDR functions for NSM.
+ *
+ * See http://www.opengroup.org/ for details on the Network
+ * Status Monitor wire protocol.
  */
 
 static __be32 *xdr_encode_nsm_string(__be32 *p, char *string)
@@ -215,37 +218,10 @@ static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp)
 	return xdr_encode_my_id(p, argp);
 }
 
-static __be32 *
-xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
-{
-	char	buffer[20], *name;
-
-	/*
-	 * Use the dotted-quad IP address of the remote host as
-	 * identifier. Linux statd always looks up the canonical
-	 * hostname first for whatever remote hostname it receives,
-	 * so this works alright.
-	 */
-	if (nsm_use_hostnames) {
-		name = argp->mon_name;
-	} else {
-		sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr));
-		name = buffer;
-	}
-	if (!(p = xdr_encode_string(p, name))
-	 || !(p = xdr_encode_string(p, utsname()->nodename)))
-		return ERR_PTR(-EIO);
-	*p++ = htonl(argp->prog);
-	*p++ = htonl(argp->vers);
-	*p++ = htonl(argp->proc);
-
-	return p;
-}
-
 static int
 xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 {
-	p = xdr_encode_common(rqstp, p, argp);
+	p = xdr_encode_mon_id(p, argp);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 
@@ -261,7 +237,7 @@ xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 static int
 xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 {
-	p = xdr_encode_common(rqstp, p, argp);
+	p = xdr_encode_mon_id(p, argp);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 	rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
@@ -286,8 +262,9 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
 }
 
 #define SM_my_name_sz	(1+XDR_QUADLEN(SM_MAXSTRLEN))
-#define SM_my_id_sz	(3+1+SM_my_name_sz)
-#define SM_mon_id_sz	(1+XDR_QUADLEN(20)+SM_my_id_sz)
+#define SM_my_id_sz	(SM_my_name_sz+3)
+#define SM_mon_name_sz	(1+XDR_QUADLEN(SM_MAXSTRLEN))
+#define SM_mon_id_sz	(SM_mon_name_sz+SM_my_id_sz)
 #define SM_mon_sz	(SM_mon_id_sz+4)
 #define SM_monres_sz	2
 #define SM_unmonres_sz	1
-- 
cgit v1.2.3


From 0490a54a00c14212f22c5948c8c13a4553d745bd Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Mar 2008 14:26:08 -0400
Subject: lockd: introduce new function to encode private argument in SM_MON
 requests

Clean up: refactor the encoding of the opaque 16-byte private argument in
xdr_encode_mon().  This will be updated later to support IPv6 addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c                 | 30 ++++++++++++++++++++++++------
 include/linux/lockd/sm_inter.h |  1 +
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index f2507fec7c6..e4d563543b1 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -218,6 +218,24 @@ static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp)
 	return xdr_encode_my_id(p, argp);
 }
 
+/*
+ * The "priv" argument may contain private information required
+ * by the SM_MON call. This information will be supplied in the
+ * SM_NOTIFY call.
+ *
+ * Linux provides the raw IP address of the monitored host,
+ * left in network byte order.
+ */
+static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp)
+{
+	*p++ = argp->addr;
+	*p++ = 0;
+	*p++ = 0;
+	*p++ = 0;
+
+	return p;
+}
+
 static int
 xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 {
@@ -225,11 +243,10 @@ xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 
-	/* Surprise - there may even be room for an IPv6 address now */
-	*p++ = argp->addr;
-	*p++ = 0;
-	*p++ = 0;
-	*p++ = 0;
+	p = xdr_encode_priv(p, argp);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
 	rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
 	return 0;
 }
@@ -265,7 +282,8 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
 #define SM_my_id_sz	(SM_my_name_sz+3)
 #define SM_mon_name_sz	(1+XDR_QUADLEN(SM_MAXSTRLEN))
 #define SM_mon_id_sz	(SM_mon_name_sz+SM_my_id_sz)
-#define SM_mon_sz	(SM_mon_id_sz+4)
+#define SM_priv_sz	(XDR_QUADLEN(SM_PRIV_SIZE))
+#define SM_mon_sz	(SM_mon_id_sz+SM_priv_sz)
 #define SM_monres_sz	2
 #define SM_unmonres_sz	1
 
diff --git a/include/linux/lockd/sm_inter.h b/include/linux/lockd/sm_inter.h
index 22a645828f2..5a5448bdb17 100644
--- a/include/linux/lockd/sm_inter.h
+++ b/include/linux/lockd/sm_inter.h
@@ -19,6 +19,7 @@
 #define SM_NOTIFY	6
 
 #define SM_MAXSTRLEN	1024
+#define SM_PRIV_SIZE	16
 
 /*
  * Arguments for all calls to statd
-- 
cgit v1.2.3


From 24b74bf0c9e08cbda74d3c64af69ad402ed54e04 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 19 Apr 2008 13:15:47 -0400
Subject: SUNRPC: Fix a bug in call_decode()

call_verify() can, under certain circumstances, free the RPC slot. In that
case, our cached pointer 'req = task->tk_rqstp' is invalid. Bug was
introduced in commit 220bcc2afd7011b3e0569fc178331fa983c92c1b (SUNRPC:
Don't call xprt_release in call refresh).

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index ea14314331b..522b06849f8 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1240,10 +1240,13 @@ call_decode(struct rpc_task *task)
 			task->tk_status);
 	return;
 out_retry:
-	req->rq_received = req->rq_private_buf.len = 0;
 	task->tk_status = 0;
-	if (task->tk_client->cl_discrtry)
-		xprt_force_disconnect(task->tk_xprt);
+	/* Note: call_verify() may have freed the RPC slot */
+	if (task->tk_rqstp == req) {
+		req->rq_received = req->rq_private_buf.len = 0;
+		if (task->tk_client->cl_discrtry)
+			xprt_force_disconnect(task->tk_xprt);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From b6ddf64ffe9d59577a9176856bb6fe69a539f573 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 17 Apr 2008 18:52:19 -0400
Subject: SUNRPC: Fix up xprt_write_space()

The rest of the networking layer uses SOCK_ASYNC_NOSPACE to signal whether
or not we have someone waiting for buffer memory. Convert the SUNRPC layer
to use the same idiom.
Remove the unlikely()s in xs_udp_write_space and xs_tcp_write_space. In
fact, the most common case will be that there is nobody waiting for buffer
space.

SOCK_NOSPACE is there to tell the TCP layer whether or not the cwnd was
limited by the application window. Ensure that we follow the same idiom as
the rest of the networking layer here too.

Finally, ensure that we clear SOCK_ASYNC_NOSPACE once we wake up, so that
write_space() doesn't keep waking things up on xprt->pending.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h |  2 +-
 net/sunrpc/xprt.c           |  4 +--
 net/sunrpc/xprtsock.c       | 61 ++++++++++++++++++++++++++++++---------------
 3 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index b3ff9a815e6..8a0629abb86 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -232,7 +232,7 @@ int			xprt_unregister_transport(struct xprt_class *type);
 void			xprt_set_retrans_timeout_def(struct rpc_task *task);
 void			xprt_set_retrans_timeout_rtt(struct rpc_task *task);
 void			xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
-void			xprt_wait_for_buffer_space(struct rpc_task *task);
+void			xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action);
 void			xprt_write_space(struct rpc_xprt *xprt);
 void			xprt_update_rtt(struct rpc_task *task);
 void			xprt_adjust_cwnd(struct rpc_task *task, int result);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 85199c64702..3ba64f9f84b 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -447,13 +447,13 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
  * @task: task to be put to sleep
  *
  */
-void xprt_wait_for_buffer_space(struct rpc_task *task)
+void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action)
 {
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = req->rq_xprt;
 
 	task->tk_timeout = req->rq_timeout;
-	rpc_sleep_on(&xprt->pending, task, NULL);
+	rpc_sleep_on(&xprt->pending, task, action);
 }
 EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8bd3b0f73ac..4c2462e2f2b 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -516,6 +516,14 @@ out:
 	return sent;
 }
 
+static void xs_nospace_callback(struct rpc_task *task)
+{
+	struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
+
+	transport->inet->sk_write_pending--;
+	clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+}
+
 /**
  * xs_nospace - place task on wait queue if transmit was incomplete
  * @task: task to put to sleep
@@ -531,20 +539,27 @@ static void xs_nospace(struct rpc_task *task)
 			task->tk_pid, req->rq_slen - req->rq_bytes_sent,
 			req->rq_slen);
 
-	if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {
-		/* Protect against races with write_space */
-		spin_lock_bh(&xprt->transport_lock);
-
-		/* Don't race with disconnect */
-		if (!xprt_connected(xprt))
-			task->tk_status = -ENOTCONN;
-		else if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
-			xprt_wait_for_buffer_space(task);
+	/* Protect against races with write_space */
+	spin_lock_bh(&xprt->transport_lock);
+
+	/* Don't race with disconnect */
+	if (xprt_connected(xprt)) {
+		if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {
+			/*
+			 * Notify TCP that we're limited by the application
+			 * window size
+			 */
+			set_bit(SOCK_NOSPACE, &transport->sock->flags);
+			transport->inet->sk_write_pending++;
+			/* ...and wait for more buffer space */
+			xprt_wait_for_buffer_space(task, xs_nospace_callback);
+		}
+	} else {
+		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+		task->tk_status = -ENOTCONN;
+	}
 
-		spin_unlock_bh(&xprt->transport_lock);
-	} else
-		/* Keep holding the socket if it is blocked */
-		rpc_delay(task, HZ>>4);
+	spin_unlock_bh(&xprt->transport_lock);
 }
 
 /**
@@ -588,19 +603,20 @@ static int xs_udp_send_request(struct rpc_task *task)
 	}
 
 	switch (status) {
+	case -EAGAIN:
+		xs_nospace(task);
+		break;
 	case -ENETUNREACH:
 	case -EPIPE:
 	case -ECONNREFUSED:
 		/* When the server has died, an ICMP port unreachable message
 		 * prompts ECONNREFUSED. */
-		break;
-	case -EAGAIN:
-		xs_nospace(task);
+		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
 		break;
 	default:
+		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
 		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
 			-status);
-		break;
 	}
 
 	return status;
@@ -695,12 +711,13 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	case -ENOTCONN:
 	case -EPIPE:
 		status = -ENOTCONN;
+		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
 		break;
 	default:
 		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
 			-status);
+		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
 		xs_tcp_shutdown(xprt);
-		break;
 	}
 
 	return status;
@@ -1189,9 +1206,11 @@ static void xs_udp_write_space(struct sock *sk)
 
 		if (unlikely(!(sock = sk->sk_socket)))
 			goto out;
+		clear_bit(SOCK_NOSPACE, &sock->flags);
+
 		if (unlikely(!(xprt = xprt_from_sock(sk))))
 			goto out;
-		if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)))
+		if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0)
 			goto out;
 
 		xprt_write_space(xprt);
@@ -1222,9 +1241,11 @@ static void xs_tcp_write_space(struct sock *sk)
 
 		if (unlikely(!(sock = sk->sk_socket)))
 			goto out;
+		clear_bit(SOCK_NOSPACE, &sock->flags);
+
 		if (unlikely(!(xprt = xprt_from_sock(sk))))
 			goto out;
-		if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)))
+		if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0)
 			goto out;
 
 		xprt_write_space(xprt);
-- 
cgit v1.2.3


From 080a1f148df0615f7a610e4776dd8f3fb706f54f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 19 Apr 2008 14:22:31 -0400
Subject: SUNRPC: Don't attempt to destroy expired RPCSEC_GSS credentials..

..and always destroy using a 'soft' RPC call. Destroying GSS credentials
isn't mandatory; the server can always cope with a few credentials not
getting destroyed in a timely fashion.

This actually fixes a hang situation. Basically, some servers will decide
that the client is crazy if it tries to destroy an RPC context for which
they have sent an RPCSEC_GSS_CREDPROBLEM, and so will refuse to talk to it
for a while.
The regression therefor probably was introduced by commit
0df7fb74fbb709591301871a38aac7735a1d6583.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/auth_gss/auth_gss.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index d34f6dfc751..55948cd5ea5 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -710,7 +710,7 @@ gss_destroying_context(struct rpc_cred *cred)
 	struct rpc_task *task;
 
 	if (gss_cred->gc_ctx == NULL ||
-			gss_cred->gc_ctx->gc_proc == RPC_GSS_PROC_DESTROY)
+	    test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
 		return 0;
 
 	gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY;
@@ -720,7 +720,7 @@ gss_destroying_context(struct rpc_cred *cred)
 	 * by the RPC call or by the put_rpccred() below */
 	get_rpccred(cred);
 
-	task = rpc_call_null(gss_auth->client, cred, RPC_TASK_ASYNC);
+	task = rpc_call_null(gss_auth->client, cred, RPC_TASK_ASYNC|RPC_TASK_SOFT);
 	if (!IS_ERR(task))
 		rpc_put_task(task);
 
-- 
cgit v1.2.3


From 73e3302f60c0e11a0db0b34b903f591139c4f937 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 11 Apr 2008 16:03:54 -0400
Subject: NFS: Fix nfs_wb_page() to always exit with an error or a clean page

It is possible for nfs_wb_page() to sometimes exit with 0 return value, yet
the page is left in a dirty state.
For instance in the case where the server rebooted, and the COMMIT request
failed, then all the previously "clean" pages which were cached by the
server, but were not guaranteed to have been writted out to disk,
have to be redirtied and resent to the server.
The fix is to have nfs_wb_page_priority() check that the page is clean
before it exits...

This fixes a condition that triggers the BUG_ON(PagePrivate(page)) in
nfs_create_request() when we're in the nfs_readpage() path.

Also eliminate a redundant BUG_ON(!PageLocked(page)) while we're at it. It
turns out that clear_page_dirty_for_io() has the exact same test.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ce40cadb15d..997b42aa370 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1493,18 +1493,19 @@ static int nfs_wb_page_priority(struct inode *inode, struct page *page,
 	};
 	int ret;
 
-	BUG_ON(!PageLocked(page));
-	if (clear_page_dirty_for_io(page)) {
-		ret = nfs_writepage_locked(page, &wbc);
+	do {
+		if (clear_page_dirty_for_io(page)) {
+			ret = nfs_writepage_locked(page, &wbc);
+			if (ret < 0)
+				goto out_error;
+		} else if (!PagePrivate(page))
+			break;
+		ret = nfs_sync_mapping_wait(page->mapping, &wbc, how);
 		if (ret < 0)
-			goto out;
-	}
-	if (!PagePrivate(page))
-		return 0;
-	ret = nfs_sync_mapping_wait(page->mapping, &wbc, how);
-	if (ret >= 0)
-		return 0;
-out:
+			goto out_error;
+	} while (PagePrivate(page));
+	return 0;
+out_error:
 	__mark_inode_dirty(inode, I_DIRTY_PAGES);
 	return ret;
 }
-- 
cgit v1.2.3


From fdd1e74c89fe39259a29c494209abad63ff76f82 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 15 Apr 2008 16:33:58 -0400
Subject: NFS: Ensure that the read code cleans up properly when rpc_run_task()
 fails

In the case of readpage() we need to ensure that the pages get unlocked,
and that the error is flagged.

In the case of O_DIRECT, we need to ensure that the pages are all released.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 19 +++++++++++++------
 fs/nfs/read.c   | 53 +++++++++++++++++++++++++++++++++--------------------
 2 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e44200579c8..9d9085b93a3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -229,14 +229,20 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
 
-	if (nfs_readpage_result(task, data) != 0)
-		return;
+	nfs_readpage_result(task, data);
+}
+
+static void nfs_direct_read_release(void *calldata)
+{
+
+	struct nfs_read_data *data = calldata;
+	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+	int status = data->task.tk_status;
 
 	spin_lock(&dreq->lock);
-	if (unlikely(task->tk_status < 0)) {
-		dreq->error = task->tk_status;
+	if (unlikely(status < 0)) {
+		dreq->error = status;
 		spin_unlock(&dreq->lock);
 	} else {
 		dreq->count += data->res.count;
@@ -249,11 +255,12 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
+	nfs_readdata_release(calldata);
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
 	.rpc_call_done = nfs_direct_read_result,
-	.rpc_release = nfs_readdata_release,
+	.rpc_release = nfs_direct_read_release,
 };
 
 /*
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index d333f5fedca..6f9208a549a 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -338,26 +338,25 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
 	return 0;
 }
 
-static int nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data)
+static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data)
 {
 	struct nfs_readargs *argp = &data->args;
 	struct nfs_readres *resp = &data->res;
 
 	if (resp->eof || resp->count == argp->count)
-		return 0;
+		return;
 
 	/* This is a short read! */
 	nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
 	/* Has the server at least made some progress? */
 	if (resp->count == 0)
-		return 0;
+		return;
 
 	/* Yes, so retry the read at the end of the data */
 	argp->offset += resp->count;
 	argp->pgbase += resp->count;
 	argp->count -= resp->count;
 	rpc_restart_call(task);
-	return -EAGAIN;
 }
 
 /*
@@ -366,29 +365,37 @@ static int nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data)
 static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
-	struct nfs_page *req = data->req;
-	struct page *page = req->wb_page;
  
 	if (nfs_readpage_result(task, data) != 0)
 		return;
+	if (task->tk_status < 0)
+		return;
 
-	if (likely(task->tk_status >= 0)) {
-		nfs_readpage_truncate_uninitialised_page(data);
-		if (nfs_readpage_retry(task, data) != 0)
-			return;
-	}
-	if (unlikely(task->tk_status < 0))
+	nfs_readpage_truncate_uninitialised_page(data);
+	nfs_readpage_retry(task, data);
+}
+
+static void nfs_readpage_release_partial(void *calldata)
+{
+	struct nfs_read_data *data = calldata;
+	struct nfs_page *req = data->req;
+	struct page *page = req->wb_page;
+	int status = data->task.tk_status;
+
+	if (status < 0)
 		SetPageError(page);
+
 	if (atomic_dec_and_test(&req->wb_complete)) {
 		if (!PageError(page))
 			SetPageUptodate(page);
 		nfs_readpage_release(req);
 	}
+	nfs_readdata_release(calldata);
 }
 
 static const struct rpc_call_ops nfs_read_partial_ops = {
 	.rpc_call_done = nfs_readpage_result_partial,
-	.rpc_release = nfs_readdata_release,
+	.rpc_release = nfs_readpage_release_partial,
 };
 
 static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
@@ -423,29 +430,35 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
 
 	if (nfs_readpage_result(task, data) != 0)
 		return;
+	if (task->tk_status < 0)
+		return;
 	/*
 	 * Note: nfs_readpage_retry may change the values of
 	 * data->args. In the multi-page case, we therefore need
 	 * to ensure that we call nfs_readpage_set_pages_uptodate()
 	 * first.
 	 */
-	if (likely(task->tk_status >= 0)) {
-		nfs_readpage_truncate_uninitialised_page(data);
-		nfs_readpage_set_pages_uptodate(data);
-		if (nfs_readpage_retry(task, data) != 0)
-			return;
-	}
+	nfs_readpage_truncate_uninitialised_page(data);
+	nfs_readpage_set_pages_uptodate(data);
+	nfs_readpage_retry(task, data);
+}
+
+static void nfs_readpage_release_full(void *calldata)
+{
+	struct nfs_read_data *data = calldata;
+
 	while (!list_empty(&data->pages)) {
 		struct nfs_page *req = nfs_list_entry(data->pages.next);
 
 		nfs_list_remove_request(req);
 		nfs_readpage_release(req);
 	}
+	nfs_readdata_release(calldata);
 }
 
 static const struct rpc_call_ops nfs_read_full_ops = {
 	.rpc_call_done = nfs_readpage_result_full,
-	.rpc_release = nfs_readdata_release,
+	.rpc_release = nfs_readpage_release_full,
 };
 
 /*
-- 
cgit v1.2.3


From c9d8f89d9816c1d16ada492aa547a4d692508c0d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 15 Apr 2008 16:56:39 -0400
Subject: NFS: Ensure that the write code cleans up properly when
 rpc_run_task() fails

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c        | 50 ++++++++++++++++++++-----------------
 fs/nfs/write.c         | 67 ++++++++++++++++++++++++++++++++------------------
 include/linux/nfs_fs.h |  4 +--
 3 files changed, 73 insertions(+), 48 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9d9085b93a3..abf8e0286e3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -508,27 +508,34 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
 
 	/* Call the NFS version-specific code */
-	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
-		return;
-	if (unlikely(task->tk_status < 0)) {
+	NFS_PROTO(data->inode)->commit_done(task, data);
+}
+
+static void nfs_direct_commit_release(void *calldata)
+{
+	struct nfs_write_data *data = calldata;
+	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+	int status = data->task.tk_status;
+
+	if (status < 0) {
 		dprintk("NFS: %5u commit failed with error %d.\n",
-				task->tk_pid, task->tk_status);
+				data->task.tk_pid, status);
 		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 	} else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
-		dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
+		dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
 		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 	}
 
-	dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
+	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
 	nfs_direct_write_complete(dreq, data->inode);
+	nfs_commitdata_release(calldata);
 }
 
 static const struct rpc_call_ops nfs_commit_direct_ops = {
 	.rpc_call_done = nfs_direct_commit_result,
-	.rpc_release = nfs_commit_release,
+	.rpc_release = nfs_direct_commit_release,
 };
 
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
@@ -596,7 +603,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 
 static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
 {
-	dreq->commit_data = nfs_commit_alloc();
+	dreq->commit_data = nfs_commitdata_alloc();
 	if (dreq->commit_data != NULL)
 		dreq->commit_data->req = (struct nfs_page *) dreq;
 }
@@ -617,11 +624,20 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
-	int status = task->tk_status;
 
 	if (nfs_writeback_done(task, data) != 0)
 		return;
+}
+
+/*
+ * NB: Return the value of the first error return code.  Subsequent
+ *     errors after the first one are ignored.
+ */
+static void nfs_direct_write_release(void *calldata)
+{
+	struct nfs_write_data *data = calldata;
+	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+	int status = data->task.tk_status;
 
 	spin_lock(&dreq->lock);
 
@@ -643,23 +659,13 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 				break;
 			case NFS_ODIRECT_DO_COMMIT:
 				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
-					dprintk("NFS: %5u write verify failed\n", task->tk_pid);
+					dprintk("NFS: %5u write verify failed\n", data->task.tk_pid);
 					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 				}
 		}
 	}
 out_unlock:
 	spin_unlock(&dreq->lock);
-}
-
-/*
- * NB: Return the value of the first error return code.  Subsequent
- *     errors after the first one are ignored.
- */
-static void nfs_direct_write_release(void *calldata)
-{
-	struct nfs_write_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
 
 	if (put_dreq(dreq))
 		nfs_direct_write_complete(dreq, data->inode);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 997b42aa370..31681bb5e4c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -48,7 +48,7 @@ static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
 static mempool_t *nfs_commit_mempool;
 
-struct nfs_write_data *nfs_commit_alloc(void)
+struct nfs_write_data *nfs_commitdata_alloc(void)
 {
 	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
 
@@ -973,7 +973,6 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
 	struct nfs_page		*req = data->req;
-	struct page		*page = req->wb_page;
 
 	dprintk("NFS: write (%s/%Ld %d@%Ld)",
 		req->wb_context->path.dentry->d_inode->i_sb->s_id,
@@ -981,13 +980,20 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 		req->wb_bytes,
 		(long long)req_offset(req));
 
-	if (nfs_writeback_done(task, data) != 0)
-		return;
+	nfs_writeback_done(task, data);
+}
 
-	if (task->tk_status < 0) {
+static void nfs_writeback_release_partial(void *calldata)
+{
+	struct nfs_write_data	*data = calldata;
+	struct nfs_page		*req = data->req;
+	struct page		*page = req->wb_page;
+	int status = data->task.tk_status;
+
+	if (status < 0) {
 		nfs_set_pageerror(page);
-		nfs_context_set_write_error(req->wb_context, task->tk_status);
-		dprintk(", error = %d\n", task->tk_status);
+		nfs_context_set_write_error(req->wb_context, status);
+		dprintk(", error = %d\n", status);
 		goto out;
 	}
 
@@ -1012,11 +1018,12 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 out:
 	if (atomic_dec_and_test(&req->wb_complete))
 		nfs_writepage_release(req);
+	nfs_writedata_release(calldata);
 }
 
 static const struct rpc_call_ops nfs_write_partial_ops = {
 	.rpc_call_done = nfs_writeback_done_partial,
-	.rpc_release = nfs_writedata_release,
+	.rpc_release = nfs_writeback_release_partial,
 };
 
 /*
@@ -1029,17 +1036,21 @@ static const struct rpc_call_ops nfs_write_partial_ops = {
 static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
-	struct nfs_page		*req;
-	struct page		*page;
 
-	if (nfs_writeback_done(task, data) != 0)
-		return;
+	nfs_writeback_done(task, data);
+}
+
+static void nfs_writeback_release_full(void *calldata)
+{
+	struct nfs_write_data	*data = calldata;
+	int status = data->task.tk_status;
 
 	/* Update attributes as result of writeback. */
 	while (!list_empty(&data->pages)) {
-		req = nfs_list_entry(data->pages.next);
+		struct nfs_page *req = nfs_list_entry(data->pages.next);
+		struct page *page = req->wb_page;
+
 		nfs_list_remove_request(req);
-		page = req->wb_page;
 
 		dprintk("NFS: write (%s/%Ld %d@%Ld)",
 			req->wb_context->path.dentry->d_inode->i_sb->s_id,
@@ -1047,10 +1058,10 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
 			req->wb_bytes,
 			(long long)req_offset(req));
 
-		if (task->tk_status < 0) {
+		if (status < 0) {
 			nfs_set_pageerror(page);
-			nfs_context_set_write_error(req->wb_context, task->tk_status);
-			dprintk(", error = %d\n", task->tk_status);
+			nfs_context_set_write_error(req->wb_context, status);
+			dprintk(", error = %d\n", status);
 			goto remove_request;
 		}
 
@@ -1070,11 +1081,12 @@ remove_request:
 	next:
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_writedata_release(calldata);
 }
 
 static const struct rpc_call_ops nfs_write_full_ops = {
 	.rpc_call_done = nfs_writeback_done_full,
-	.rpc_release = nfs_writedata_release,
+	.rpc_release = nfs_writeback_release_full,
 };
 
 
@@ -1160,7 +1172,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-void nfs_commit_release(void *data)
+void nfs_commitdata_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
 
@@ -1233,7 +1245,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 	struct nfs_write_data	*data;
 	struct nfs_page         *req;
 
-	data = nfs_commit_alloc();
+	data = nfs_commitdata_alloc();
 
 	if (!data)
 		goto out_bad;
@@ -1261,7 +1273,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 static void nfs_commit_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
-	struct nfs_page		*req;
 
         dprintk("NFS: %5u nfs_commit_done (status %d)\n",
                                 task->tk_pid, task->tk_status);
@@ -1269,6 +1280,13 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 	/* Call the NFS version-specific code */
 	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
 		return;
+}
+
+static void nfs_commit_release(void *calldata)
+{
+	struct nfs_write_data	*data = calldata;
+	struct nfs_page		*req;
+	int status = data->task.tk_status;
 
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
@@ -1283,10 +1301,10 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
-		if (task->tk_status < 0) {
-			nfs_context_set_write_error(req->wb_context, task->tk_status);
+		if (status < 0) {
+			nfs_context_set_write_error(req->wb_context, status);
 			nfs_inode_remove_request(req);
-			dprintk(", error = %d\n", task->tk_status);
+			dprintk(", error = %d\n", status);
 			goto next;
 		}
 
@@ -1307,6 +1325,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 	next:
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commitdata_release(calldata);
 }
 
 static const struct rpc_call_ops nfs_commit_ops = {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f4a0e4c218d..7f0602c8771 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -466,9 +466,9 @@ extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 extern int  nfs_commit_inode(struct inode *, int);
-extern struct nfs_write_data *nfs_commit_alloc(void);
+extern struct nfs_write_data *nfs_commitdata_alloc(void);
 extern void nfs_commit_free(struct nfs_write_data *wdata);
-extern void nfs_commit_release(void *wdata);
+extern void nfs_commitdata_release(void *wdata);
 #else
 static inline int
 nfs_commit_inode(struct inode *inode, int how)
-- 
cgit v1.2.3


From dbae4c73f08b8a7980cc912954ade3d4c1fb6147 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 14 Apr 2008 14:54:53 -0400
Subject: NFS: Ensure that rpc_run_task() errors are propagated back to the
 caller

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 10 ++++++----
 fs/nfs/read.c   | 23 +++++++++++++++--------
 fs/nfs/write.c  | 33 +++++++++++++++++++--------------
 3 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index abf8e0286e3..4757a2b326a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -347,8 +347,9 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		NFS_PROTO(inode)->read_setup(data, &msg);
 
 		task = rpc_run_task(&task_setup_data);
-		if (!IS_ERR(task))
-			rpc_put_task(task);
+		if (IS_ERR(task))
+			break;
+		rpc_put_task(task);
 
 		dprintk("NFS: %5u initiated direct read call "
 			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
@@ -763,8 +764,9 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		NFS_PROTO(inode)->write_setup(data, &msg);
 
 		task = rpc_run_task(&task_setup_data);
-		if (!IS_ERR(task))
-			rpc_put_task(task);
+		if (IS_ERR(task))
+			break;
+		rpc_put_task(task);
 
 		dprintk("NFS: %5u initiated direct write call "
 			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6f9208a549a..16f57e0af99 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -153,7 +153,7 @@ static void nfs_readpage_release(struct nfs_page *req)
 /*
  * Set up the NFS read request struct
  */
-static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 		const struct rpc_call_ops *call_ops,
 		unsigned int count, unsigned int offset)
 {
@@ -202,8 +202,10 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 			(unsigned long long)data->args.offset);
 
 	task = rpc_run_task(&task_setup_data);
-	if (!IS_ERR(task))
-		rpc_put_task(task);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
 }
 
 static void
@@ -240,6 +242,7 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
 	size_t rsize = NFS_SERVER(inode)->rsize, nbytes;
 	unsigned int offset;
 	int requests = 0;
+	int ret = 0;
 	LIST_HEAD(list);
 
 	nfs_list_remove_request(req);
@@ -261,6 +264,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
 	offset = 0;
 	nbytes = count;
 	do {
+		int ret2;
+
 		data = list_entry(list.next, struct nfs_read_data, pages);
 		list_del_init(&data->pages);
 
@@ -268,13 +273,15 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
 
 		if (nbytes < rsize)
 			rsize = nbytes;
-		nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
+		ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
 				  rsize, offset);
+		if (ret == 0)
+			ret = ret2;
 		offset += rsize;
 		nbytes -= rsize;
 	} while (nbytes != 0);
 
-	return 0;
+	return ret;
 
 out_bad:
 	while (!list_empty(&list)) {
@@ -292,6 +299,7 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct nfs_read_data	*data;
+	int ret = -ENOMEM;
 
 	data = nfs_readdata_alloc(npages);
 	if (!data)
@@ -307,11 +315,10 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
 	}
 	req = nfs_list_entry(data->pages.next);
 
-	nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
-	return 0;
+	return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
 out_bad:
 	nfs_async_read_error(head);
-	return -ENOMEM;
+	return ret;
 }
 
 /*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 31681bb5e4c..1ade11d1ba0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -778,7 +778,7 @@ static int flush_task_priority(int how)
 /*
  * Set up the argument/result storage required for the RPC call.
  */
-static void nfs_write_rpcsetup(struct nfs_page *req,
+static int nfs_write_rpcsetup(struct nfs_page *req,
 		struct nfs_write_data *data,
 		const struct rpc_call_ops *call_ops,
 		unsigned int count, unsigned int offset,
@@ -841,8 +841,10 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 		(unsigned long long)data->args.offset);
 
 	task = rpc_run_task(&task_setup_data);
-	if (!IS_ERR(task))
-		rpc_put_task(task);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -868,6 +870,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 	size_t wsize = NFS_SERVER(inode)->wsize, nbytes;
 	unsigned int offset;
 	int requests = 0;
+	int ret = 0;
 	LIST_HEAD(list);
 
 	nfs_list_remove_request(req);
@@ -889,6 +892,8 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 	offset = 0;
 	nbytes = count;
 	do {
+		int ret2;
+
 		data = list_entry(list.next, struct nfs_write_data, pages);
 		list_del_init(&data->pages);
 
@@ -896,13 +901,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 
 		if (nbytes < wsize)
 			wsize = nbytes;
-		nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
+		ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
 				   wsize, offset, how);
+		if (ret == 0)
+			ret = ret2;
 		offset += wsize;
 		nbytes -= wsize;
 	} while (nbytes != 0);
 
-	return 0;
+	return ret;
 
 out_bad:
 	while (!list_empty(&list)) {
@@ -943,9 +950,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 	req = nfs_list_entry(data->pages.next);
 
 	/* Set up the argument struct */
-	nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
-
-	return 0;
+	return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
  out_bad:
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
@@ -1183,7 +1188,7 @@ void nfs_commitdata_release(void *data)
 /*
  * Set up the argument/result storage required for the RPC call.
  */
-static void nfs_commit_rpcsetup(struct list_head *head,
+static int nfs_commit_rpcsetup(struct list_head *head,
 		struct nfs_write_data *data,
 		int how)
 {
@@ -1232,8 +1237,10 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 
 	task = rpc_run_task(&task_setup_data);
-	if (!IS_ERR(task))
-		rpc_put_task(task);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
 }
 
 /*
@@ -1251,9 +1258,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 		goto out_bad;
 
 	/* Set up the argument struct */
-	nfs_commit_rpcsetup(head, data, how);
-
-	return 0;
+	return nfs_commit_rpcsetup(head, data, how);
  out_bad:
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
-- 
cgit v1.2.3


From 35d05778e25ee16dbddb60331be0bc1309efba19 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 5 Apr 2008 15:54:17 -0400
Subject: NFSv4: Remove bogus call to nfs4_drop_state_owner() in
 _nfs4_open_expired()

There should be no need to invalidate a perfectly good state owner just
because of a stale filehandle. Doing so can cause the state recovery code
to break, since nfs4_get_renew_cred() and nfs4_get_setclientid_cred() rely
on finding active state owners.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f38d0573be1..424aa206470 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -982,11 +982,8 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
 	if (IS_ERR(opendata))
 		return PTR_ERR(opendata);
 	ret = nfs4_open_recover(opendata, state);
-	if (ret == -ESTALE) {
-		/* Invalidate the state owner so we don't ever use it again */
-		nfs4_drop_state_owner(state->owner);
+	if (ret == -ESTALE)
 		d_drop(ctx->path.dentry);
-	}
 	nfs4_opendata_put(opendata);
 	return ret;
 }
-- 
cgit v1.2.3


From c1d519312dcdf11532fed9f99a8ecc3547ffd9d6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 7 Apr 2008 13:20:54 -0400
Subject: NFSv4: Only increment the sequence id if the server saw it

It is quite possible that the OPEN, CLOSE, LOCK, LOCKU,... compounds fail
before the actual stateful operation has been executed (for instance in the
PUTFH call). There is no way to tell from the overall status result which
operations were executed from the COMPOUND.

The fix is to move incrementing of the sequence id into the XDR layer,
so that we do it as we process the results from the stateful operation.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 12 ++++++------
 fs/nfs/nfs4xdr.c        | 18 +++++++++++++++++-
 include/linux/nfs_xdr.h | 10 ++++++++--
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 424aa206470..9f2759da74e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -240,6 +240,8 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 {
 	p->o_res.f_attr = &p->f_attr;
 	p->o_res.dir_attr = &p->dir_attr;
+	p->o_res.seqid = p->o_arg.seqid;
+	p->c_res.seqid = p->c_arg.seqid;
 	p->o_res.server = p->o_arg.server;
 	nfs_fattr_init(&p->f_attr);
 	nfs_fattr_init(&p->dir_attr);
@@ -730,7 +732,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
 		renew_lease(data->o_res.server, data->timestamp);
 		data->rpc_done = 1;
 	}
-	nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid);
 }
 
 static void nfs4_open_confirm_release(void *calldata)
@@ -860,7 +861,6 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
 		if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM))
 			nfs_confirm_seqid(&data->owner->so_seqid, 0);
 	}
-	nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid);
 	data->rpc_done = 1;
 }
 
@@ -1226,7 +1226,6 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
         /* hmm. we are done with the inode, and in the process of freeing
 	 * the state_owner. we keep this around to process errors
 	 */
-	nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid);
 	switch (task->tk_status) {
 		case 0:
 			nfs_set_open_stateid(state, &calldata->res.stateid, 0);
@@ -1333,6 +1332,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 		goto out_free_calldata;
 	calldata->arg.bitmask = server->attr_bitmask;
 	calldata->res.fattr = &calldata->fattr;
+	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
 	calldata->path.mnt = mntget(path->mnt);
 	calldata->path.dentry = dget(path->dentry);
@@ -3159,6 +3159,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	p->arg.fh = NFS_FH(inode);
 	p->arg.fl = &p->fl;
 	p->arg.seqid = seqid;
+	p->res.seqid = seqid;
 	p->arg.stateid = &lsp->ls_stateid;
 	p->lsp = lsp;
 	atomic_inc(&lsp->ls_count);
@@ -3184,7 +3185,6 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 
 	if (RPC_ASSASSINATED(task))
 		return;
-	nfs_increment_lock_seqid(task->tk_status, calldata->arg.seqid);
 	switch (task->tk_status) {
 		case 0:
 			memcpy(calldata->lsp->ls_stateid.data,
@@ -3322,6 +3322,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	p->arg.lock_stateid = &lsp->ls_stateid;
 	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
 	p->arg.lock_owner.id = lsp->ls_id.id;
+	p->res.lock_seqid = p->arg.lock_seqid;
 	p->lsp = lsp;
 	atomic_inc(&lsp->ls_count);
 	p->ctx = get_nfs_open_context(ctx);
@@ -3348,6 +3349,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 			return;
 		data->arg.open_stateid = &state->stateid;
 		data->arg.new_lock_owner = 1;
+		data->res.open_seqid = data->arg.open_seqid;
 	} else
 		data->arg.new_lock_owner = 0;
 	data->timestamp = jiffies;
@@ -3365,7 +3367,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 	if (RPC_ASSASSINATED(task))
 		goto out;
 	if (data->arg.new_lock_owner != 0) {
-		nfs_increment_open_seqid(data->rpc_status, data->arg.open_seqid);
 		if (data->rpc_status == 0)
 			nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
 		else
@@ -3377,7 +3378,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 		data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
 		renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp);
 	}
-	nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid);
 out:
 	dprintk("%s: done, ret = %d!\n", __FUNCTION__, data->rpc_status);
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2b519f6325f..b8b2e391d18 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3005,6 +3005,8 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 	int status;
 
 	status = decode_op_hdr(xdr, OP_CLOSE);
+	if (status != -EIO)
+		nfs_increment_open_seqid(status, res->seqid);
 	if (status)
 		return status;
 	READ_BUF(NFS4_STATEID_SIZE);
@@ -3296,11 +3298,17 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCK);
+	if (status == -EIO)
+		goto out;
 	if (status == 0) {
 		READ_BUF(NFS4_STATEID_SIZE);
 		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
 	} else if (status == -NFS4ERR_DENIED)
-		return decode_lock_denied(xdr, NULL);
+		status = decode_lock_denied(xdr, NULL);
+	if (res->open_seqid != NULL)
+		nfs_increment_open_seqid(status, res->open_seqid);
+	nfs_increment_lock_seqid(status, res->lock_seqid);
+out:
 	return status;
 }
 
@@ -3319,6 +3327,8 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCKU);
+	if (status != -EIO)
+		nfs_increment_lock_seqid(status, res->seqid);
 	if (status == 0) {
 		READ_BUF(NFS4_STATEID_SIZE);
 		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
@@ -3384,6 +3394,8 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
         int status;
 
         status = decode_op_hdr(xdr, OP_OPEN);
+	if (status != -EIO)
+		nfs_increment_open_seqid(status, res->seqid);
         if (status)
                 return status;
         READ_BUF(NFS4_STATEID_SIZE);
@@ -3416,6 +3428,8 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre
 	int status;
 
         status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
+	if (status != -EIO)
+		nfs_increment_open_seqid(status, res->seqid);
         if (status)
                 return status;
         READ_BUF(NFS4_STATEID_SIZE);
@@ -3429,6 +3443,8 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re
 	int status;
 
 	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
+	if (status != -EIO)
+		nfs_increment_open_seqid(status, res->seqid);
 	if (status)
 		return status;
 	READ_BUF(NFS4_STATEID_SIZE);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f301d0b8bab..24263bb8e0b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -140,6 +140,7 @@ struct nfs_openres {
 	__u32                   rflags;
 	struct nfs_fattr *      f_attr;
 	struct nfs_fattr *      dir_attr;
+	struct nfs_seqid *	seqid;
 	const struct nfs_server *server;
 	int			delegation_type;
 	nfs4_stateid		delegation;
@@ -159,6 +160,7 @@ struct nfs_open_confirmargs {
 
 struct nfs_open_confirmres {
 	nfs4_stateid            stateid;
+	struct nfs_seqid *	seqid;
 };
 
 /*
@@ -175,6 +177,7 @@ struct nfs_closeargs {
 struct nfs_closeres {
 	nfs4_stateid            stateid;
 	struct nfs_fattr *	fattr;
+	struct nfs_seqid *	seqid;
 	const struct nfs_server *server;
 };
 /*
@@ -199,7 +202,9 @@ struct nfs_lock_args {
 };
 
 struct nfs_lock_res {
-	nfs4_stateid			stateid;
+	nfs4_stateid		stateid;
+	struct nfs_seqid *	lock_seqid;
+	struct nfs_seqid *	open_seqid;
 };
 
 struct nfs_locku_args {
@@ -210,7 +215,8 @@ struct nfs_locku_args {
 };
 
 struct nfs_locku_res {
-	nfs4_stateid			stateid;
+	nfs4_stateid		stateid;
+	struct nfs_seqid *	seqid;
 };
 
 struct nfs_lockt_args {
-- 
cgit v1.2.3


From 1e799b673c6b82b336ab13c48b5651d511ca3000 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 21 Mar 2008 16:19:41 -0400
Subject: SUNRPC: Fix read ordering problems with req->rq_private_buf.len

We want to ensure that req->rq_private_buf.len is updated before
req->rq_received, so that call_decode() doesn't use an old value for
req->rq_rcv_buf.len.

In 'call_decode()' itself, instead of using task->tk_status (which is set
using req->rq_received) must use the actual value of
req->rq_private_buf.len when deciding whether or not the received RPC reply
is too short.

Finally ensure that we set req->rq_rcv_buf.len to zero when retrying a
request. A typo meant that we were resetting req->rq_private_buf.len in
call_decode(), and then clobbering that value with the old rq_rcv_buf.len
again in xprt_transmit().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 26 +++++++++++++-------------
 net/sunrpc/xprt.c |  3 ++-
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 522b06849f8..3ae56046451 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1199,18 +1199,6 @@ call_decode(struct rpc_task *task)
 		task->tk_flags &= ~RPC_CALL_MAJORSEEN;
 	}
 
-	if (task->tk_status < 12) {
-		if (!RPC_IS_SOFT(task)) {
-			task->tk_action = call_bind;
-			clnt->cl_stats->rpcretrans++;
-			goto out_retry;
-		}
-		dprintk("RPC:       %s: too small RPC reply size (%d bytes)\n",
-				clnt->cl_protname, task->tk_status);
-		task->tk_action = call_timeout;
-		goto out_retry;
-	}
-
 	/*
 	 * Ensure that we see all writes made by xprt_complete_rqst()
 	 * before it changed req->rq_received.
@@ -1222,6 +1210,18 @@ call_decode(struct rpc_task *task)
 	WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf,
 				sizeof(req->rq_rcv_buf)) != 0);
 
+	if (req->rq_rcv_buf.len < 12) {
+		if (!RPC_IS_SOFT(task)) {
+			task->tk_action = call_bind;
+			clnt->cl_stats->rpcretrans++;
+			goto out_retry;
+		}
+		dprintk("RPC:       %s: too small RPC reply size (%d bytes)\n",
+				clnt->cl_protname, task->tk_status);
+		task->tk_action = call_timeout;
+		goto out_retry;
+	}
+
 	/* Verify the RPC header */
 	p = call_verify(task);
 	if (IS_ERR(p)) {
@@ -1243,7 +1243,7 @@ out_retry:
 	task->tk_status = 0;
 	/* Note: call_verify() may have freed the RPC slot */
 	if (task->tk_rqstp == req) {
-		req->rq_received = req->rq_private_buf.len = 0;
+		req->rq_received = req->rq_rcv_buf.len = 0;
 		if (task->tk_client->cl_discrtry)
 			xprt_force_disconnect(task->tk_xprt);
 	}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 3ba64f9f84b..5110a4ea7fd 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -757,9 +757,10 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
 	task->tk_rtt = (long)jiffies - req->rq_xtime;
 
 	list_del_init(&req->rq_list);
+	req->rq_private_buf.len = copied;
 	/* Ensure all writes are done before we update req->rq_received */
 	smp_wmb();
-	req->rq_received = req->rq_private_buf.len = copied;
+	req->rq_received = copied;
 	rpc_wake_up_queued_task(&xprt->pending, task);
 }
 EXPORT_SYMBOL_GPL(xprt_complete_rqst);
-- 
cgit v1.2.3


From 4a9af59fee0701d9db99bc148d87b8852d6d6dd8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 1 Apr 2008 18:57:06 -0400
Subject: NLM/lockd: Ensure we don't corrupt fl->fl_flags in nlmclnt_unlock()

Also fix up nlmclnt_lock() so that it doesn't pass modified versions of
fl->fl_flags to nlmclnt_cancel() and other helpers.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index b6b74a60e1e..4e1c0123b45 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -493,6 +493,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 	fl->fl_flags |= FL_ACCESS;
 	status = do_vfs_lock(fl);
+	fl->fl_flags = fl_flags;
 	if (status < 0)
 		goto out;
 
@@ -530,10 +531,11 @@ again:
 			goto again;
 		}
 		/* Ensure the resulting lock will get added to granted list */
-		fl->fl_flags = fl_flags | FL_SLEEP;
+		fl->fl_flags |= FL_SLEEP;
 		if (do_vfs_lock(fl) < 0)
 			printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__);
 		up_read(&host->h_rwsem);
+		fl->fl_flags = fl_flags;
 	}
 	status = nlm_stat_to_errno(resp->status);
 out_unblock:
@@ -543,7 +545,6 @@ out_unblock:
 		nlmclnt_cancel(host, req->a_args.block, fl);
 out:
 	nlm_release_call(req);
-	fl->fl_flags = fl_flags;
 	return status;
 }
 
@@ -598,7 +599,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 {
 	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
-	int status = 0;
+	int status;
+	unsigned char fl_flags = fl->fl_flags;
 
 	/*
 	 * Note: the server is supposed to either grant us the unlock
@@ -607,11 +609,13 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	 */
 	fl->fl_flags |= FL_EXISTS;
 	down_read(&host->h_rwsem);
-	if (do_vfs_lock(fl) == -ENOENT) {
-		up_read(&host->h_rwsem);
+	status = do_vfs_lock(fl);
+	up_read(&host->h_rwsem);
+	fl->fl_flags = fl_flags;
+	if (status == -ENOENT) {
+		status = 0;
 		goto out;
 	}
-	up_read(&host->h_rwsem);
 
 	if (req->a_flags & RPC_TASK_ASYNC)
 		return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
-- 
cgit v1.2.3


From 536ff0f809b0f4d56e1c41e66768d330668e0a55 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 4 Apr 2008 15:08:02 -0400
Subject: NFSv4: Ensure we don't corrupt fl->fl_flags in nfs4_proc_unlck

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9f2759da74e..a1069325b87 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3263,6 +3263,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	struct nfs4_lock_state *lsp;
 	struct rpc_task *task;
 	int status = 0;
+	unsigned char fl_flags = request->fl_flags;
 
 	status = nfs4_set_lock_state(state, request);
 	/* Unlock _before_ we do the RPC call */
@@ -3286,6 +3287,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	status = nfs4_wait_for_completion_rpc_task(task);
 	rpc_put_task(task);
 out:
+	request->fl_flags = fl_flags;
 	return status;
 }
 
-- 
cgit v1.2.3


From 5e7f37a76fa5b604949020b7317962262812b2dd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 1 Apr 2008 18:58:49 -0400
Subject: NLM/lockd: Add a reference counter to struct nlm_rqst

When we replace the existing synchronous RPC calls with asynchronous calls,
the reference count will be needed in order to allow us to examine the
result of the RPC call.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c         | 3 +++
 include/linux/lockd/lockd.h | 1 +
 2 files changed, 4 insertions(+)

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 4e1c0123b45..749eb5328cb 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -221,6 +221,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
 	for(;;) {
 		call = kzalloc(sizeof(*call), GFP_KERNEL);
 		if (call != NULL) {
+			atomic_set(&call->a_count, 1);
 			locks_init_lock(&call->a_args.lock.fl);
 			locks_init_lock(&call->a_res.lock.fl);
 			call->a_host = host;
@@ -237,6 +238,8 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
 
 void nlm_release_call(struct nlm_rqst *call)
 {
+	if (!atomic_dec_and_test(&call->a_count))
+		return;
 	nlm_release_host(call->a_host);
 	nlmclnt_release_lockargs(call);
 	kfree(call);
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index acf39e1e3a3..94649a8da01 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -91,6 +91,7 @@ struct nlm_wait;
  */
 #define NLMCLNT_OHSIZE		((__NEW_UTS_LEN) + 10u)
 struct nlm_rqst {
+	atomic_t		a_count;
 	unsigned int		a_flags;	/* initial RPC task flags */
 	struct nlm_host *	a_host;		/* host handle */
 	struct nlm_args		a_args;		/* arguments */
-- 
cgit v1.2.3


From dc9d8d048168ff61c458bec06b28996cb90b182a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Mar 2008 16:04:36 -0400
Subject: NLM/lockd: convert __nlm_async_call to use rpc_run_task()

Peter Staubach comments:

> In the course of investigating testing failures in the locking phase of
> the Connectathon testsuite, I discovered a couple of things.  One was
> that one of the tests in the locking tests was racy when it didn't seem
> to need to be and two, that the NFS client asynchronously releases locks
> when a process is exiting.
...
> The Single UNIX Specification Version 3 specifies that:  "All locks
> associated with a file for a given process shall be removed when a file
> descriptor for that file is closed by that process or the process holding
> that file descriptor terminates.".
>
> This does not specify whether those locks must be released prior to the
> completion of the exit processing for the process or not.  However,
> general assumptions seem to be that those locks will be released.  This
> leads to more deterministic behavior under normal circumstances.

The following patch converts the NFSv2/v3 locking code to use the same
mechanism as NFSv4 for sending asynchronous RPC calls and then waiting for
them to complete. This ensures that the UNLOCK and CANCEL RPC calls will
complete even if the user interrupts the call, yet satisfies the
above request for synchronous behaviour on process exit.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 54 insertions(+), 10 deletions(-)

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 749eb5328cb..a34b709006a 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -346,10 +346,16 @@ in_grace_period:
 /*
  * Generic NLM call, async version.
  */
-static int __nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops)
+static struct rpc_task *__nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops)
 {
 	struct nlm_host	*host = req->a_host;
 	struct rpc_clnt	*clnt;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_message = msg,
+		.callback_ops = tk_ops,
+		.callback_data = req,
+		.flags = RPC_TASK_ASYNC,
+	};
 
 	dprintk("lockd: call procedure %d on %s (async)\n",
 			(int)proc, host->h_name);
@@ -359,21 +365,36 @@ static int __nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *
 	if (clnt == NULL)
 		goto out_err;
 	msg->rpc_proc = &clnt->cl_procinfo[proc];
+	task_setup_data.rpc_client = clnt;
 
         /* bootstrap and kick off the async RPC call */
-        return rpc_call_async(clnt, msg, RPC_TASK_ASYNC, tk_ops, req);
+	return rpc_run_task(&task_setup_data);
 out_err:
 	tk_ops->rpc_release(req);
-	return -ENOLCK;
+	return ERR_PTR(-ENOLCK);
+}
+
+static int nlm_do_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops)
+{
+	struct rpc_task *task;
+
+	task = __nlm_async_call(req, proc, msg, tk_ops);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
 }
 
+/*
+ * NLM asynchronous call.
+ */
 int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
 {
 	struct rpc_message msg = {
 		.rpc_argp	= &req->a_args,
 		.rpc_resp	= &req->a_res,
 	};
-	return __nlm_async_call(req, proc, &msg, tk_ops);
+	return nlm_do_async_call(req, proc, &msg, tk_ops);
 }
 
 int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
@@ -381,7 +402,32 @@ int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *t
 	struct rpc_message msg = {
 		.rpc_argp	= &req->a_res,
 	};
-	return __nlm_async_call(req, proc, &msg, tk_ops);
+	return nlm_do_async_call(req, proc, &msg, tk_ops);
+}
+
+/*
+ * NLM client asynchronous call.
+ *
+ * Note that although the calls are asynchronous, and are therefore
+ *      guaranteed to complete, we still always attempt to wait for
+ *      completion in order to be able to correctly track the lock
+ *      state.
+ */
+static int nlmclnt_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+{
+	struct rpc_message msg = {
+		.rpc_argp	= &req->a_args,
+		.rpc_resp	= &req->a_res,
+	};
+	struct rpc_task *task;
+	int err;
+
+	task = __nlm_async_call(req, proc, &msg, tk_ops);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	err = rpc_wait_for_completion_task(task);
+	rpc_put_task(task);
+	return err;
 }
 
 /*
@@ -620,10 +666,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 		goto out;
 	}
 
-	if (req->a_flags & RPC_TASK_ASYNC)
-		return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
-
-	status = nlmclnt_call(req, NLMPROC_UNLOCK);
+	atomic_inc(&req->a_count);
+	status = nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
 	if (status < 0)
 		goto out;
 
@@ -697,7 +741,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
 	nlmclnt_setlockargs(req, fl);
 	req->a_args.block = block;
 
-	status = nlm_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
+	status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
 
 	spin_lock_irqsave(&current->sighand->siglock, flags);
 	current->blocked = oldset;
-- 
cgit v1.2.3


From 8ec7ff74448f65ac963e330795d771ab14ec8408 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Mar 2008 16:04:51 -0400
Subject: NLM: Remove the signal masking in nlmclnt_proc/nlmclnt_cancel

The signal masks have been rendered obsolete by the preceding patch.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c | 42 +-----------------------------------------
 1 file changed, 1 insertion(+), 41 deletions(-)

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index a34b709006a..5f13e0363b2 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -155,8 +155,6 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req)
 int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 {
 	struct nlm_rqst		*call;
-	sigset_t		oldset;
-	unsigned long		flags;
 	int			status;
 
 	nlm_get_host(host);
@@ -168,22 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 	/* Set up the argument struct */
 	nlmclnt_setlockargs(call, fl);
 
-	/* Keep the old signal mask */
-	spin_lock_irqsave(&current->sighand->siglock, flags);
-	oldset = current->blocked;
-
-	/* If we're cleaning up locks because the process is exiting,
-	 * perform the RPC call asynchronously. */
-	if ((IS_SETLK(cmd) || IS_SETLKW(cmd))
-	    && fl->fl_type == F_UNLCK
-	    && (current->flags & PF_EXITING)) {
-		sigfillset(&current->blocked);	/* Mask all signals */
-		recalc_sigpending();
-
-		call->a_flags = RPC_TASK_ASYNC;
-	}
-	spin_unlock_irqrestore(&current->sighand->siglock, flags);
-
 	if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
 		if (fl->fl_type != F_UNLCK) {
 			call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -198,11 +180,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 	fl->fl_ops->fl_release_private(fl);
 	fl->fl_ops = NULL;
 
-	spin_lock_irqsave(&current->sighand->siglock, flags);
-	current->blocked = oldset;
-	recalc_sigpending();
-	spin_unlock_irqrestore(&current->sighand->siglock, flags);
-
 	dprintk("lockd: clnt proc returns %d\n", status);
 	return status;
 }
@@ -722,16 +699,6 @@ static const struct rpc_call_ops nlmclnt_unlock_ops = {
 static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl)
 {
 	struct nlm_rqst	*req;
-	unsigned long	flags;
-	sigset_t	oldset;
-	int		status;
-
-	/* Block all signals while setting up call */
-	spin_lock_irqsave(&current->sighand->siglock, flags);
-	oldset = current->blocked;
-	sigfillset(&current->blocked);
-	recalc_sigpending();
-	spin_unlock_irqrestore(&current->sighand->siglock, flags);
 
 	req = nlm_alloc_call(nlm_get_host(host));
 	if (!req)
@@ -741,14 +708,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
 	nlmclnt_setlockargs(req, fl);
 	req->a_args.block = block;
 
-	status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
-
-	spin_lock_irqsave(&current->sighand->siglock, flags);
-	current->blocked = oldset;
-	recalc_sigpending();
-	spin_unlock_irqrestore(&current->sighand->siglock, flags);
-
-	return status;
+	return nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
 }
 
 static void nlmclnt_cancel_callback(struct rpc_task *task, void *data)
-- 
cgit v1.2.3


From 6b4b3a752b3464f2fd9fe2837fb19270c23c1d6b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 2 Apr 2008 14:40:53 -0400
Subject: NLM/lockd: Ensure that nlmclnt_cancel() returns results of the CANCEL
 call

Currently, it returns success as long as the RPC call was sent. We'd like
to know if the CANCEL operation succeeded on the server.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 5f13e0363b2..ea1a6940af2 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -699,6 +699,10 @@ static const struct rpc_call_ops nlmclnt_unlock_ops = {
 static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl)
 {
 	struct nlm_rqst	*req;
+	int status;
+
+	dprintk("lockd: blocking lock attempt was interrupted by a signal.\n"
+		"       Attempting to cancel lock.\n");
 
 	req = nlm_alloc_call(nlm_get_host(host));
 	if (!req)
@@ -708,7 +712,12 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
 	nlmclnt_setlockargs(req, fl);
 	req->a_args.block = block;
 
-	return nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
+	atomic_inc(&req->a_count);
+	status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
+	if (status == 0 && req->a_res.status == nlm_lck_denied)
+		status = -ENOLCK;
+	nlm_release_call(req);
+	return status;
 }
 
 static void nlmclnt_cancel_callback(struct rpc_task *task, void *data)
-- 
cgit v1.2.3


From 5f50c0c6d644d6c8180d9079c13c5d9de3adeb34 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 1 Apr 2008 20:26:22 -0400
Subject: NLM/lockd: Fix a race when cancelling a blocking lock

We shouldn't remove the lock from the list of blocked locks until the
CANCEL call has completed since we may be racing with a GRANTED callback.

Also ensure that we send an UNLOCK if the CANCEL request failed. Normally
that should only happen if the process gets hit with a fatal signal.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c | 43 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 9 deletions(-)

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index ea1a6940af2..37d1aa20a60 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -510,6 +510,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	struct nlm_res	*resp = &req->a_res;
 	struct nlm_wait *block = NULL;
 	unsigned char fl_flags = fl->fl_flags;
+	unsigned char fl_type;
 	int status = -ENOLCK;
 
 	if (nsm_monitor(host) < 0) {
@@ -525,13 +526,16 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 
 	block = nlmclnt_prepare_block(host, fl);
 again:
+	/*
+	 * Initialise resp->status to a valid non-zero value,
+	 * since 0 == nlm_lck_granted
+	 */
+	resp->status = nlm_lck_blocked;
 	for(;;) {
 		/* Reboot protection */
 		fl->fl_u.nfs_fl.state = host->h_state;
 		status = nlmclnt_call(req, NLMPROC_LOCK);
 		if (status < 0)
-			goto out_unblock;
-		if (!req->a_args.block)
 			break;
 		/* Did a reclaimer thread notify us of a server reboot? */
 		if (resp->status ==  nlm_lck_denied_grace_period)
@@ -540,15 +544,22 @@ again:
 			break;
 		/* Wait on an NLM blocking lock */
 		status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT);
-		/* if we were interrupted. Send a CANCEL request to the server
-		 * and exit
-		 */
 		if (status < 0)
-			goto out_unblock;
+			break;
 		if (resp->status != nlm_lck_blocked)
 			break;
 	}
 
+	/* if we were interrupted while blocking, then cancel the lock request
+	 * and exit
+	 */
+	if (resp->status == nlm_lck_blocked) {
+		if (!req->a_args.block)
+			goto out_unlock;
+		if (nlmclnt_cancel(host, req->a_args.block, fl) == 0)
+			goto out_unblock;
+	}
+
 	if (resp->status == nlm_granted) {
 		down_read(&host->h_rwsem);
 		/* Check whether or not the server has rebooted */
@@ -562,16 +573,30 @@ again:
 			printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__);
 		up_read(&host->h_rwsem);
 		fl->fl_flags = fl_flags;
+		status = 0;
 	}
+	if (status < 0)
+		goto out_unlock;
 	status = nlm_stat_to_errno(resp->status);
 out_unblock:
 	nlmclnt_finish_block(block);
-	/* Cancel the blocked request if it is still pending */
-	if (resp->status == nlm_lck_blocked)
-		nlmclnt_cancel(host, req->a_args.block, fl);
 out:
 	nlm_release_call(req);
 	return status;
+out_unlock:
+	/* Fatal error: ensure that we remove the lock altogether */
+	dprintk("lockd: lock attempt ended in fatal error.\n"
+		"       Attempting to unlock.\n");
+	nlmclnt_finish_block(block);
+	fl_type = fl->fl_type;
+	fl->fl_type = F_UNLCK;
+	down_read(&host->h_rwsem);
+	do_vfs_lock(fl);
+	up_read(&host->h_rwsem);
+	fl->fl_type = fl_type;
+	fl->fl_flags = fl_flags;
+	nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
+	return status;
 }
 
 /*
-- 
cgit v1.2.3


From c4d7c402b788b73dc24f1e54a57f89d3dc5eb7bc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 1 Apr 2008 20:26:52 -0400
Subject: NFS: Remove the buggy lock-if-signalled case from do_setlk()

Both NLM and NFSv4 should be able to clean up adequately in the case where
the user interrupts the RPC call...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 10e8b807e7f..742cb745cb4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -566,17 +566,9 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
 
 	lock_kernel();
 	/* Use local locking if mounted with "-onolock" */
-	if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) {
+	if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
-		/* If we were signalled we still need to ensure that
-		 * we clean up any state on the server. We therefore
-		 * record the lock call as having succeeded in order to
-		 * ensure that locks_remove_posix() cleans it out when
-		 * the process exits.
-		 */
-		if (status == -EINTR || status == -ERESTARTSYS)
-			do_vfs_lock(filp, fl);
-	} else
+	else
 		status = do_vfs_lock(filp, fl);
 	unlock_kernel();
 	if (status < 0)
-- 
cgit v1.2.3


From d11d10cc05c94a32632d6928d15a1034200dd9a5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 2 Apr 2008 14:44:05 -0400
Subject: NLM/lockd: Ensure client locking calls use correct credentials

Now that we've added the 'generic' credentials (that are independent of the
rpc_client) to the nfs_open_context, we can use those in the NLM client to
ensure that the lock/unlock requests are authenticated to whoever
originally opened the file.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 37d1aa20a60..40b16f23e49 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -247,7 +247,7 @@ static int nlm_wait_on_grace(wait_queue_head_t *queue)
  * Generic NLM call
  */
 static int
-nlmclnt_call(struct nlm_rqst *req, u32 proc)
+nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc)
 {
 	struct nlm_host	*host = req->a_host;
 	struct rpc_clnt	*clnt;
@@ -256,6 +256,7 @@ nlmclnt_call(struct nlm_rqst *req, u32 proc)
 	struct rpc_message msg = {
 		.rpc_argp	= argp,
 		.rpc_resp	= resp,
+		.rpc_cred	= cred,
 	};
 	int		status;
 
@@ -390,11 +391,12 @@ int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *t
  *      completion in order to be able to correctly track the lock
  *      state.
  */
-static int nlmclnt_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+static int nlmclnt_async_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
 {
 	struct rpc_message msg = {
 		.rpc_argp	= &req->a_args,
 		.rpc_resp	= &req->a_res,
+		.rpc_cred	= cred,
 	};
 	struct rpc_task *task;
 	int err;
@@ -415,7 +417,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
 {
 	int	status;
 
-	status = nlmclnt_call(req, NLMPROC_TEST);
+	status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST);
 	if (status < 0)
 		goto out;
 
@@ -506,6 +508,7 @@ static int do_vfs_lock(struct file_lock *fl)
 static int
 nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 {
+	struct rpc_cred *cred = nfs_file_cred(fl->fl_file);
 	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
 	struct nlm_wait *block = NULL;
@@ -534,7 +537,7 @@ again:
 	for(;;) {
 		/* Reboot protection */
 		fl->fl_u.nfs_fl.state = host->h_state;
-		status = nlmclnt_call(req, NLMPROC_LOCK);
+		status = nlmclnt_call(cred, req, NLMPROC_LOCK);
 		if (status < 0)
 			break;
 		/* Did a reclaimer thread notify us of a server reboot? */
@@ -595,7 +598,7 @@ out_unlock:
 	up_read(&host->h_rwsem);
 	fl->fl_type = fl_type;
 	fl->fl_flags = fl_flags;
-	nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
+	nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
 	return status;
 }
 
@@ -619,8 +622,8 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
 	nlmclnt_setlockargs(req, fl);
 	req->a_args.reclaim = 1;
 
-	if ((status = nlmclnt_call(req, NLMPROC_LOCK)) >= 0
-	 && req->a_res.status == nlm_granted)
+	status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK);
+	if (status >= 0 && req->a_res.status == nlm_granted)
 		return 0;
 
 	printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d "
@@ -669,7 +672,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 
 	atomic_inc(&req->a_count);
-	status = nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
+	status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
+			NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
 	if (status < 0)
 		goto out;
 
@@ -738,7 +742,8 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
 	req->a_args.block = block;
 
 	atomic_inc(&req->a_count);
-	status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
+	status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
+			NLMPROC_CANCEL, &nlmclnt_cancel_ops);
 	if (status == 0 && req->a_res.status == nlm_lck_denied)
 		status = -ENOLCK;
 	nlm_release_call(req);
-- 
cgit v1.2.3


From 856dff3d3875bdc8b88e4a65779873af76776a69 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Mon, 31 Mar 2008 17:39:06 +0300
Subject: nfs: return negative error value from nfs{,4}_stat_to_errno

All use sites for nfs{,4}_stat_to_errno negate their return value.
It's more efficient to return a negative error from the stat_to_errno convertors
rather than negating its return value everywhere. This also produces slightly
smaller code.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c | 76 ++++++++++++++++++++++++++---------------------------
 fs/nfs/nfs3xdr.c | 34 ++++++++++++------------
 fs/nfs/nfs4xdr.c | 80 ++++++++++++++++++++++++++++----------------------------
 3 files changed, 95 insertions(+), 95 deletions(-)

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 86a80b33ec8..28bab67d151 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -267,7 +267,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 	int status;
 
 	if ((status = ntohl(*p++)))
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 	p = xdr_decode_fattr(p, res->fattr);
 
 	count = ntohl(*p++);
@@ -432,7 +432,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	__be32 *end, *entry, *kaddr;
 
 	if ((status = ntohl(*p++)))
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
@@ -537,7 +537,7 @@ nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
 	int	status;
 
 	if ((status = ntohl(*p++)) != 0)
-		status = -nfs_stat_to_errno(status);
+		status = nfs_stat_to_errno(status);
 	return status;
 }
 
@@ -551,7 +551,7 @@ nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 	int	status;
 
 	if ((status = ntohl(*p++)))
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 	xdr_decode_fattr(p, fattr);
 	return 0;
 }
@@ -566,7 +566,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
 	int	status;
 
 	if ((status = ntohl(*p++)))
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 	p = xdr_decode_fhandle(p, res->fh);
 	xdr_decode_fattr(p, res->fattr);
 	return 0;
@@ -604,7 +604,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	int	status;
 
 	if ((status = ntohl(*p++)))
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 	/* Convert length of symlink */
 	len = ntohl(*p++);
 	if (len >= rcvbuf->page_len) {
@@ -653,7 +653,7 @@ nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
 	int	status;
 
 	if ((status = ntohl(*p++)))
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 
 	res->tsize  = ntohl(*p++);
 	res->bsize  = ntohl(*p++);
@@ -672,39 +672,39 @@ static struct {
 	int errno;
 } nfs_errtbl[] = {
 	{ NFS_OK,		0		},
-	{ NFSERR_PERM,		EPERM		},
-	{ NFSERR_NOENT,		ENOENT		},
-	{ NFSERR_IO,		errno_NFSERR_IO	},
-	{ NFSERR_NXIO,		ENXIO		},
-/*	{ NFSERR_EAGAIN,	EAGAIN		}, */
-	{ NFSERR_ACCES,		EACCES		},
-	{ NFSERR_EXIST,		EEXIST		},
-	{ NFSERR_XDEV,		EXDEV		},
-	{ NFSERR_NODEV,		ENODEV		},
-	{ NFSERR_NOTDIR,	ENOTDIR		},
-	{ NFSERR_ISDIR,		EISDIR		},
-	{ NFSERR_INVAL,		EINVAL		},
-	{ NFSERR_FBIG,		EFBIG		},
-	{ NFSERR_NOSPC,		ENOSPC		},
-	{ NFSERR_ROFS,		EROFS		},
-	{ NFSERR_MLINK,		EMLINK		},
-	{ NFSERR_NAMETOOLONG,	ENAMETOOLONG	},
-	{ NFSERR_NOTEMPTY,	ENOTEMPTY	},
-	{ NFSERR_DQUOT,		EDQUOT		},
-	{ NFSERR_STALE,		ESTALE		},
-	{ NFSERR_REMOTE,	EREMOTE		},
+	{ NFSERR_PERM,		-EPERM		},
+	{ NFSERR_NOENT,		-ENOENT		},
+	{ NFSERR_IO,		-errno_NFSERR_IO},
+	{ NFSERR_NXIO,		-ENXIO		},
+/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
+	{ NFSERR_ACCES,		-EACCES		},
+	{ NFSERR_EXIST,		-EEXIST		},
+	{ NFSERR_XDEV,		-EXDEV		},
+	{ NFSERR_NODEV,		-ENODEV		},
+	{ NFSERR_NOTDIR,	-ENOTDIR	},
+	{ NFSERR_ISDIR,		-EISDIR		},
+	{ NFSERR_INVAL,		-EINVAL		},
+	{ NFSERR_FBIG,		-EFBIG		},
+	{ NFSERR_NOSPC,		-ENOSPC		},
+	{ NFSERR_ROFS,		-EROFS		},
+	{ NFSERR_MLINK,		-EMLINK		},
+	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFSERR_DQUOT,		-EDQUOT		},
+	{ NFSERR_STALE,		-ESTALE		},
+	{ NFSERR_REMOTE,	-EREMOTE	},
 #ifdef EWFLUSH
-	{ NFSERR_WFLUSH,	EWFLUSH		},
+	{ NFSERR_WFLUSH,	-EWFLUSH	},
 #endif
-	{ NFSERR_BADHANDLE,	EBADHANDLE	},
-	{ NFSERR_NOT_SYNC,	ENOTSYNC	},
-	{ NFSERR_BAD_COOKIE,	EBADCOOKIE	},
-	{ NFSERR_NOTSUPP,	ENOTSUPP	},
-	{ NFSERR_TOOSMALL,	ETOOSMALL	},
-	{ NFSERR_SERVERFAULT,	ESERVERFAULT	},
-	{ NFSERR_BADTYPE,	EBADTYPE	},
-	{ NFSERR_JUKEBOX,	EJUKEBOX	},
-	{ -1,			EIO		}
+	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
+	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFSERR_SERVERFAULT,	-ESERVERFAULT	},
+	{ NFSERR_BADTYPE,	-EBADTYPE	},
+	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
+	{ -1,			-EIO		}
 };
 
 /*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index fb03048ac65..11cdddec143 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -515,7 +515,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 	/* Decode post_op_attrs */
 	p = xdr_decode_post_op_attr(p, res->dir_attr);
 	if (status)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 	/* Decode verifier cookie */
 	if (res->verf) {
 		res->verf[0] = *p++;
@@ -751,7 +751,7 @@ nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 	int	status;
 
 	if ((status = ntohl(*p++)))
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 	xdr_decode_fattr(p, fattr);
 	return 0;
 }
@@ -766,7 +766,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 	int	status;
 
 	if ((status = ntohl(*p++)))
-		status = -nfs_stat_to_errno(status);
+		status = nfs_stat_to_errno(status);
 	xdr_decode_wcc_data(p, fattr);
 	return status;
 }
@@ -786,7 +786,7 @@ nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
 	int	status;
 
 	if ((status = ntohl(*p++))) {
-		status = -nfs_stat_to_errno(status);
+		status = nfs_stat_to_errno(status);
 	} else {
 		if (!(p = xdr_decode_fhandle(p, res->fh)))
 			return -errno_NFSERR_IO;
@@ -806,7 +806,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
 
 	p = xdr_decode_post_op_attr(p, res->fattr);
 	if (status)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 	res->access = ntohl(*p++);
 	return 0;
 }
@@ -843,7 +843,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_post_op_attr(p, fattr);
 
 	if (status != 0)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 
 	/* Convert length of symlink */
 	len = ntohl(*p++);
@@ -891,7 +891,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 	p = xdr_decode_post_op_attr(p, res->fattr);
 
 	if (status != 0)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 
 	/* Decode reply count and EOF flag. NFSv3 is somewhat redundant
 	 * in that it puts the count both in the res struct and in the
@@ -941,7 +941,7 @@ nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
 	p = xdr_decode_wcc_data(p, res->fattr);
 
 	if (status != 0)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 
 	res->count = ntohl(*p++);
 	res->verf->committed = (enum nfs3_stable_how)ntohl(*p++);
@@ -972,7 +972,7 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
 			res->fattr->valid = 0;
 		}
 	} else {
-		status = -nfs_stat_to_errno(status);
+		status = nfs_stat_to_errno(status);
 	}
 	p = xdr_decode_wcc_data(p, res->dir_attr);
 	return status;
@@ -987,7 +987,7 @@ nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res)
 	int	status;
 
 	if ((status = ntohl(*p++)) != 0)
-		status = -nfs_stat_to_errno(status);
+		status = nfs_stat_to_errno(status);
 	p = xdr_decode_wcc_data(p, res->fromattr);
 	p = xdr_decode_wcc_data(p, res->toattr);
 	return status;
@@ -1002,7 +1002,7 @@ nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
 	int	status;
 
 	if ((status = ntohl(*p++)) != 0)
-		status = -nfs_stat_to_errno(status);
+		status = nfs_stat_to_errno(status);
 	p = xdr_decode_post_op_attr(p, res->fattr);
 	p = xdr_decode_wcc_data(p, res->dir_attr);
 	return status;
@@ -1020,7 +1020,7 @@ nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res)
 
 	p = xdr_decode_post_op_attr(p, res->fattr);
 	if (status != 0)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 
 	p = xdr_decode_hyper(p, &res->tbytes);
 	p = xdr_decode_hyper(p, &res->fbytes);
@@ -1045,7 +1045,7 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
 
 	p = xdr_decode_post_op_attr(p, res->fattr);
 	if (status != 0)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 
 	res->rtmax  = ntohl(*p++);
 	res->rtpref = ntohl(*p++);
@@ -1073,7 +1073,7 @@ nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res)
 
 	p = xdr_decode_post_op_attr(p, res->fattr);
 	if (status != 0)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 	res->max_link = ntohl(*p++);
 	res->max_namelen = ntohl(*p++);
 
@@ -1092,7 +1092,7 @@ nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
 	status = ntohl(*p++);
 	p = xdr_decode_wcc_data(p, res->fattr);
 	if (status != 0)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 
 	res->verf->verifier[0] = *p++;
 	res->verf->verifier[1] = *p++;
@@ -1114,7 +1114,7 @@ nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
 	int err, base;
 
 	if (status != 0)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 	p = xdr_decode_post_op_attr(p, res->fattr);
 	res->mask = ntohl(*p++);
 	if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
@@ -1141,7 +1141,7 @@ nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 	int status = ntohl(*p++);
 
 	if (status)
-		return -nfs_stat_to_errno(status);
+		return nfs_stat_to_errno(status);
 	xdr_decode_post_op_attr(p, fattr);
 	return 0;
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b8b2e391d18..809ef0d787e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2241,7 +2241,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	}
 	READ32(nfserr);
 	if (nfserr != NFS_OK)
-		return -nfs4_stat_to_errno(nfserr);
+		return nfs4_stat_to_errno(nfserr);
 	return 0;
 }
 
@@ -3760,7 +3760,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 		READ_BUF(len);
 		return -NFSERR_CLID_INUSE;
 	} else
-		return -nfs4_stat_to_errno(nfserr);
+		return nfs4_stat_to_errno(nfserr);
 
 	return 0;
 }
@@ -4422,7 +4422,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf
 	if (!status)
 		status = decode_fsinfo(&xdr, fsinfo);
 	if (!status)
-		status = -nfs4_stat_to_errno(hdr.status);
+		status = nfs4_stat_to_errno(hdr.status);
 	return status;
 }
 
@@ -4512,7 +4512,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
 	if (!status)
 		status = decode_setclientid(&xdr, clp);
 	if (!status)
-		status = -nfs4_stat_to_errno(hdr.status);
+		status = nfs4_stat_to_errno(hdr.status);
 	return status;
 }
 
@@ -4534,7 +4534,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
 	if (!status)
 		status = decode_fsinfo(&xdr, fsinfo);
 	if (!status)
-		status = -nfs4_stat_to_errno(hdr.status);
+		status = nfs4_stat_to_errno(hdr.status);
 	return status;
 }
 
@@ -4644,42 +4644,42 @@ static struct {
 	int errno;
 } nfs_errtbl[] = {
 	{ NFS4_OK,		0		},
-	{ NFS4ERR_PERM,		EPERM		},
-	{ NFS4ERR_NOENT,	ENOENT		},
-	{ NFS4ERR_IO,		errno_NFSERR_IO	},
-	{ NFS4ERR_NXIO,		ENXIO		},
-	{ NFS4ERR_ACCESS,	EACCES		},
-	{ NFS4ERR_EXIST,	EEXIST		},
-	{ NFS4ERR_XDEV,		EXDEV		},
-	{ NFS4ERR_NOTDIR,	ENOTDIR		},
-	{ NFS4ERR_ISDIR,	EISDIR		},
-	{ NFS4ERR_INVAL,	EINVAL		},
-	{ NFS4ERR_FBIG,		EFBIG		},
-	{ NFS4ERR_NOSPC,	ENOSPC		},
-	{ NFS4ERR_ROFS,		EROFS		},
-	{ NFS4ERR_MLINK,	EMLINK		},
-	{ NFS4ERR_NAMETOOLONG,	ENAMETOOLONG	},
-	{ NFS4ERR_NOTEMPTY,	ENOTEMPTY	},
-	{ NFS4ERR_DQUOT,	EDQUOT		},
-	{ NFS4ERR_STALE,	ESTALE		},
-	{ NFS4ERR_BADHANDLE,	EBADHANDLE	},
-	{ NFS4ERR_BADOWNER,	EINVAL		},
-	{ NFS4ERR_BADNAME,	EINVAL		},
-	{ NFS4ERR_BAD_COOKIE,	EBADCOOKIE	},
-	{ NFS4ERR_NOTSUPP,	ENOTSUPP	},
-	{ NFS4ERR_TOOSMALL,	ETOOSMALL	},
-	{ NFS4ERR_SERVERFAULT,	ESERVERFAULT	},
-	{ NFS4ERR_BADTYPE,	EBADTYPE	},
-	{ NFS4ERR_LOCKED,	EAGAIN		},
-	{ NFS4ERR_RESOURCE,	EREMOTEIO	},
-	{ NFS4ERR_SYMLINK,	ELOOP		},
-	{ NFS4ERR_OP_ILLEGAL,	EOPNOTSUPP	},
-	{ NFS4ERR_DEADLOCK,	EDEADLK		},
-	{ NFS4ERR_WRONGSEC,	EPERM		}, /* FIXME: this needs
+	{ NFS4ERR_PERM,		-EPERM		},
+	{ NFS4ERR_NOENT,	-ENOENT		},
+	{ NFS4ERR_IO,		-errno_NFSERR_IO},
+	{ NFS4ERR_NXIO,		-ENXIO		},
+	{ NFS4ERR_ACCESS,	-EACCES		},
+	{ NFS4ERR_EXIST,	-EEXIST		},
+	{ NFS4ERR_XDEV,		-EXDEV		},
+	{ NFS4ERR_NOTDIR,	-ENOTDIR	},
+	{ NFS4ERR_ISDIR,	-EISDIR		},
+	{ NFS4ERR_INVAL,	-EINVAL		},
+	{ NFS4ERR_FBIG,		-EFBIG		},
+	{ NFS4ERR_NOSPC,	-ENOSPC		},
+	{ NFS4ERR_ROFS,		-EROFS		},
+	{ NFS4ERR_MLINK,	-EMLINK		},
+	{ NFS4ERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFS4ERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFS4ERR_DQUOT,	-EDQUOT		},
+	{ NFS4ERR_STALE,	-ESTALE		},
+	{ NFS4ERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFS4ERR_BADOWNER,	-EINVAL		},
+	{ NFS4ERR_BADNAME,	-EINVAL		},
+	{ NFS4ERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFS4ERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFS4ERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFS4ERR_SERVERFAULT,	-ESERVERFAULT	},
+	{ NFS4ERR_BADTYPE,	-EBADTYPE	},
+	{ NFS4ERR_LOCKED,	-EAGAIN		},
+	{ NFS4ERR_RESOURCE,	-EREMOTEIO	},
+	{ NFS4ERR_SYMLINK,	-ELOOP		},
+	{ NFS4ERR_OP_ILLEGAL,	-EOPNOTSUPP	},
+	{ NFS4ERR_DEADLOCK,	-EDEADLK	},
+	{ NFS4ERR_WRONGSEC,	-EPERM		}, /* FIXME: this needs
 						    * to be handled by a
 						    * middle-layer.
 						    */
-	{ -1,			EIO		}
+	{ -1,			-EIO		}
 };
 
 /*
@@ -4696,14 +4696,14 @@ nfs4_stat_to_errno(int stat)
 	}
 	if (stat <= 10000 || stat > 10100) {
 		/* The server is looney tunes. */
-		return ESERVERFAULT;
+		return -ESERVERFAULT;
 	}
 	/* If we cannot translate the error, the recovery routines should
 	 * handle it.
 	 * Note: remaining NFSv4 error codes have values > 10000, so should
 	 * not conflict with native Linux error codes.
 	 */
-	return stat;
+	return -stat;
 }
 
 #define PROC(proc, argtype, restype)				\
-- 
cgit v1.2.3


From 441092415770ddec648800701895913c4bfd60c1 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@citi.umich.edu>
Date: Wed, 2 Apr 2008 15:21:15 +0300
Subject: nfs: fix printout of multiword bitfields

Benny points out that zero-padding of multiword bitfields is necessary,
and that delimiting each word is nice to avoid endianess confusion.

bhalevy: without zero padding output can be ambiguous. Also,
since the printed array of two 32-bit unsigned integers is not a
64-bit number, delimiting the output with a semicolon makes more sense.

Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 809ef0d787e..5a2d64927b3 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1191,8 +1191,8 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 	WRITE32(attrs[0] & readdir->bitmask[0]);
 	WRITE32(attrs[1] & readdir->bitmask[1]);
-	dprintk("%s: cookie = %Lu, verifier = 0x%x%x, bitmap = 0x%x%x\n",
-			__FUNCTION__,
+	dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
+			__func__,
 			(unsigned long long)readdir->cookie,
 			((u32 *)readdir->verifier.data)[0],
 			((u32 *)readdir->verifier.data)[1],
@@ -2291,7 +2291,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
 		bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
 	} else
 		bitmask[0] = bitmask[1] = 0;
-	dprintk("%s: bitmask=0x%x%x\n", __FUNCTION__, bitmask[0], bitmask[1]);
+	dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]);
 	return 0;
 }
 
@@ -3505,8 +3505,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 		return status;
 	READ_BUF(8);
 	COPYMEM(readdir->verifier.data, 8);
-	dprintk("%s: verifier = 0x%x%x\n",
-			__FUNCTION__,
+	dprintk("%s: verifier = %08x:%08x\n",
+			__func__,
 			((u32 *)readdir->verifier.data)[0],
 			((u32 *)readdir->verifier.data)[1]);
 
-- 
cgit v1.2.3


From 78ea323be6380a9313e87fe241809e912e8ae401 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 7 Apr 2008 20:49:28 -0400
Subject: NFSv4: Don't use cred->cr_ops->cr_name in nfs4_proc_setclientid()

With the recent change to generic creds, we can no longer use
cred->cr_ops->cr_name to distinguish between RPCSEC_GSS principals and
AUTH_SYS/AUTH_NULL identities. Replace it with the rpc_authops->au_name
instead...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c              | 2 +-
 include/linux/sunrpc/auth.h    | 2 --
 net/sunrpc/auth_generic.c      | 2 --
 net/sunrpc/auth_gss/auth_gss.c | 2 --
 net/sunrpc/auth_null.c         | 2 --
 net/sunrpc/auth_unix.c         | 2 --
 6 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a1069325b87..dbc09271af0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2885,7 +2885,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 							RPC_DISPLAY_ADDR),
 				rpc_peeraddr2str(clp->cl_rpcclient,
 							RPC_DISPLAY_PROTO),
-				cred->cr_ops->cr_name,
+				clp->cl_rpcclient->cl_auth->au_ops->au_name,
 				clp->cl_id_uniquifier);
 		setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
 				sizeof(setclientid.sc_netid),
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index e93cd8aa3eb..a19c3af933c 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -96,9 +96,7 @@ struct rpc_auth {
 struct rpc_authops {
 	struct module		*owner;
 	rpc_authflavor_t	au_flavor;	/* flavor (RPC_AUTH_*) */
-#ifdef RPC_DEBUG
 	char *			au_name;
-#endif
 	struct rpc_auth *	(*create)(struct rpc_clnt *, rpc_authflavor_t);
 	void			(*destroy)(struct rpc_auth *);
 
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 6a3f77c9e4d..b6f124c8507 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -136,9 +136,7 @@ static struct rpc_cred_cache generic_cred_cache = {
 
 static const struct rpc_authops generic_auth_ops = {
 	.owner = THIS_MODULE,
-#ifdef RPC_DEBUG
 	.au_name = "Generic",
-#endif
 	.lookup_cred = generic_lookup_cred,
 	.crcreate = generic_create_cred,
 };
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 55948cd5ea5..7567eb95823 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1287,9 +1287,7 @@ out:
 static const struct rpc_authops authgss_ops = {
 	.owner		= THIS_MODULE,
 	.au_flavor	= RPC_AUTH_GSS,
-#ifdef RPC_DEBUG
 	.au_name	= "RPCSEC_GSS",
-#endif
 	.create		= gss_create,
 	.destroy	= gss_destroy,
 	.lookup_cred	= gss_lookup_cred,
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 3c26c18df0d..c70dd7f5258 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -104,9 +104,7 @@ nul_validate(struct rpc_task *task, __be32 *p)
 const struct rpc_authops authnull_ops = {
 	.owner		= THIS_MODULE,
 	.au_flavor	= RPC_AUTH_NULL,
-#ifdef RPC_DEBUG
 	.au_name	= "NULL",
-#endif
 	.create		= nul_create,
 	.destroy	= nul_destroy,
 	.lookup_cred	= nul_lookup_cred,
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 04e936a56fb..44920b90bdc 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -210,9 +210,7 @@ void __init rpc_init_authunix(void)
 const struct rpc_authops authunix_ops = {
 	.owner		= THIS_MODULE,
 	.au_flavor	= RPC_AUTH_UNIX,
-#ifdef RPC_DEBUG
 	.au_name	= "UNIX",
-#endif
 	.create		= unx_create,
 	.destroy	= unx_destroy,
 	.lookup_cred	= unx_lookup_cred,
-- 
cgit v1.2.3


From 7c67db3a8a98045744f06fcd6d8f476d9df0ba5c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 7 Apr 2008 20:50:11 -0400
Subject: NFSv4: Reintroduce machine creds

We need to try to ensure that we always use the same credentials whenever
we re-establish the clientid on the server. If not, the server won't
recognise that we're the same client, and so may not allow us to recover
state.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c                 |  7 +++++++
 include/linux/nfs_fs_sb.h       |  2 ++
 include/linux/sunrpc/auth.h     |  2 ++
 include/linux/sunrpc/auth_gss.h |  1 +
 net/sunrpc/auth_generic.c       | 26 ++++++++++++++++++++++++--
 net/sunrpc/auth_gss/auth_gss.c  | 12 +++++++++++-
 6 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 93dfd75aba7..f2f3b284e6d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -112,6 +112,7 @@ struct nfs_client_initdata {
 static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 {
 	struct nfs_client *clp;
+	struct rpc_cred *cred;
 
 	if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
 		goto error_0;
@@ -150,6 +151,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	clp->cl_boot_time = CURRENT_TIME;
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
 #endif
+	cred = rpc_lookup_machine_cred();
+	if (!IS_ERR(cred))
+		clp->cl_machine_cred = cred;
 
 	return clp;
 
@@ -191,6 +195,9 @@ static void nfs_free_client(struct nfs_client *clp)
 	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
 		nfs_callback_down();
 
+	if (clp->cl_machine_cred != NULL)
+		put_rpccred(clp->cl_machine_cred);
+
 	kfree(clp->cl_hostname);
 	kfree(clp);
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index ac7e4fb943e..c9beacd16c0 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -32,6 +32,8 @@ struct nfs_client {
 	const struct nfs_rpc_ops *rpc_ops;	/* NFS protocol vector */
 	int			cl_proto;	/* Network transport protocol */
 
+	struct rpc_cred		*cl_machine_cred;
+
 #ifdef CONFIG_NFS_V4
 	u64			cl_clientid;	/* constant */
 	nfs4_verifier		cl_confirm;
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index a19c3af933c..3f632182d8e 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -26,6 +26,7 @@ struct auth_cred {
 	uid_t	uid;
 	gid_t	gid;
 	struct group_info *group_info;
+	unsigned char machine_cred : 1;
 };
 
 /*
@@ -130,6 +131,7 @@ void __exit		rpcauth_remove_module(void);
 void __exit		rpc_destroy_generic_auth(void);
 
 struct rpc_cred *	rpc_lookup_cred(void);
+struct rpc_cred *	rpc_lookup_machine_cred(void);
 int			rpcauth_register(const struct rpc_authops *);
 int			rpcauth_unregister(const struct rpc_authops *);
 struct rpc_auth *	rpcauth_create(rpc_authflavor_t, struct rpc_clnt *);
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index 67658e17a37..fec6899bf35 100644
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -84,6 +84,7 @@ struct gss_cred {
 	enum rpc_gss_svc	gc_service;
 	struct gss_cl_ctx	*gc_ctx;
 	struct gss_upcall_msg	*gc_upcall;
+	unsigned char		gc_machine_cred : 1;
 };
 
 #endif /* __KERNEL__ */
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index b6f124c8507..d927d9f5741 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -17,6 +17,9 @@
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
+#define RPC_ANONYMOUS_USERID	((uid_t)-2)
+#define RPC_ANONYMOUS_GROUPID	((gid_t)-2)
+
 struct generic_cred {
 	struct rpc_cred gc_base;
 	struct auth_cred acred;
@@ -35,6 +38,22 @@ struct rpc_cred *rpc_lookup_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 
+/*
+ * Public call interface for looking up machine creds.
+ */
+struct rpc_cred *rpc_lookup_machine_cred(void)
+{
+	struct auth_cred acred = {
+		.uid = RPC_ANONYMOUS_USERID,
+		.gid = RPC_ANONYMOUS_GROUPID,
+		.machine_cred = 1,
+	};
+
+	dprintk("RPC:       looking up machine cred\n");
+	return generic_auth.au_ops->lookup_cred(&generic_auth, &acred, 0);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred);
+
 static void
 generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred)
 {
@@ -75,8 +94,10 @@ generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 	gcred->acred.group_info = acred->group_info;
 	if (gcred->acred.group_info != NULL)
 		get_group_info(gcred->acred.group_info);
+	gcred->acred.machine_cred = acred->machine_cred;
 
-	dprintk("RPC:       allocated generic cred %p for uid %d gid %d\n",
+	dprintk("RPC:       allocated %s cred %p for uid %d gid %d\n",
+			gcred->acred.machine_cred ? "machine" : "generic",
 			gcred, acred->uid, acred->gid);
 	return &gcred->gc_base;
 }
@@ -115,7 +136,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
 
 	if (gcred->acred.uid != acred->uid ||
 	    gcred->acred.gid != acred->gid ||
-	    gcred->acred.group_info != acred->group_info)
+	    gcred->acred.group_info != acred->group_info ||
+	    gcred->acred.machine_cred != acred->machine_cred)
 		return 0;
 	return 1;
 }
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 7567eb95823..46f7ec800af 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -371,9 +371,16 @@ gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid)
 static struct gss_upcall_msg *
 gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cred *cred)
 {
+	struct gss_cred *gss_cred = container_of(cred,
+			struct gss_cred, gc_base);
 	struct gss_upcall_msg *gss_new, *gss_msg;
+	uid_t uid = cred->cr_uid;
 
-	gss_new = gss_alloc_msg(gss_auth, cred->cr_uid);
+	/* Special case: rpc.gssd assumes that uid == 0 implies machine creds */
+	if (gss_cred->gc_machine_cred != 0)
+		uid = 0;
+
+	gss_new = gss_alloc_msg(gss_auth, uid);
 	if (gss_new == NULL)
 		return ERR_PTR(-ENOMEM);
 	gss_msg = gss_add_msg(gss_auth, gss_new);
@@ -818,6 +825,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 	 */
 	cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW;
 	cred->gc_service = gss_auth->service;
+	cred->gc_machine_cred = acred->machine_cred;
 	kref_get(&gss_auth->kref);
 	return &cred->gc_base;
 
@@ -855,6 +863,8 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
 	if (gss_cred->gc_ctx && time_after(jiffies, gss_cred->gc_ctx->gc_expiry))
 		return 0;
 out:
+	if (acred->machine_cred != gss_cred->gc_machine_cred)
+		return 0;
 	return (rc->cr_uid == acred->uid);
 }
 
-- 
cgit v1.2.3


From a2b2bb8822c78806930bbb4d4c5bb3ae69648fd0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 8 Apr 2008 16:02:17 -0400
Subject: NFSv4: Attempt to use machine credentials in SETCLIENTID calls

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 41 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 7775435ea7a..46eb624e4f1 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -71,6 +71,29 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 	return status;
 }
 
+static struct rpc_cred *nfs4_get_machine_cred(struct nfs_client *clp)
+{
+	struct rpc_cred *cred = NULL;
+
+	spin_lock(&clp->cl_lock);
+	if (clp->cl_machine_cred != NULL)
+		cred = get_rpccred(clp->cl_machine_cred);
+	spin_unlock(&clp->cl_lock);
+	return cred;
+}
+
+static void nfs4_clear_machine_cred(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+
+	spin_lock(&clp->cl_lock);
+	cred = clp->cl_machine_cred;
+	clp->cl_machine_cred = NULL;
+	spin_unlock(&clp->cl_lock);
+	if (cred != NULL)
+		put_rpccred(cred);
+}
+
 struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
@@ -91,13 +114,18 @@ static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
+	struct rpc_cred *cred;
 
+	cred = nfs4_get_machine_cred(clp);
+	if (cred != NULL)
+		goto out;
 	pos = rb_first(&clp->cl_state_owners);
 	if (pos != NULL) {
 		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
-		return get_rpccred(sp->so_cred);
+		cred = get_rpccred(sp->so_cred);
 	}
-	return NULL;
+out:
+	return cred;
 }
 
 static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new,
@@ -924,10 +952,10 @@ restart_loop:
 	if (cred != NULL) {
 		/* Yes there are: try to renew the old lease */
 		status = nfs4_proc_renew(clp, cred);
+		put_rpccred(cred);
 		switch (status) {
 			case 0:
 			case -NFS4ERR_CB_PATH_DOWN:
-				put_rpccred(cred);
 				goto out;
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_LEASE_MOVED:
@@ -936,14 +964,19 @@ restart_loop:
 	} else {
 		/* "reboot" to ensure we clear all state on the server */
 		clp->cl_boot_time = CURRENT_TIME;
-		cred = nfs4_get_setclientid_cred(clp);
 	}
 	/* We're going to have to re-establish a clientid */
 	nfs4_state_mark_reclaim(clp);
 	status = -ENOENT;
+	cred = nfs4_get_setclientid_cred(clp);
 	if (cred != NULL) {
 		status = nfs4_init_client(clp, cred);
 		put_rpccred(cred);
+		/* Handle case where the user hasn't set up machine creds */
+		if (status == -EACCES && cred == clp->cl_machine_cred) {
+			nfs4_clear_machine_cred(clp);
+			goto restart_loop;
+		}
 	}
 	if (status)
 		goto out_error;
-- 
cgit v1.2.3


From d2b831416365e8b1f27809b62d5e4260883956cc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 14 Apr 2008 18:13:37 -0400
Subject: SUNRPC: Protect creds against early garbage collection

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/auth.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 0632cd0a1ad..6bfea9ed686 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -220,6 +220,9 @@ rpcauth_destroy_credcache(struct rpc_auth *auth)
 }
 EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache);
 
+
+#define RPC_AUTH_EXPIRY_MORATORIUM (60 * HZ)
+
 /*
  * Remove stale credentials. Avoid sleeping inside the loop.
  */
@@ -228,6 +231,7 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 {
 	spinlock_t *cache_lock;
 	struct rpc_cred *cred;
+	unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM;
 
 	while (!list_empty(&cred_unused)) {
 		cred = list_entry(cred_unused.next, struct rpc_cred, cr_lru);
@@ -235,6 +239,10 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 		number_cred_unused--;
 		if (atomic_read(&cred->cr_count) != 0)
 			continue;
+		/* Enforce a 5 second garbage collection moratorium */
+		if (time_in_range(cred->cr_expire, expired, jiffies) &&
+		    test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0)
+			continue;
 		cache_lock = &cred->cr_auth->au_credcache->lock;
 		spin_lock(cache_lock);
 		if (atomic_read(&cred->cr_count) == 0) {
-- 
cgit v1.2.3


From 06b4b681ababc20596aa947595714710f557131d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 16 Apr 2008 16:51:38 -0400
Subject: SUNRPC: remove XS_SENDMSG_RETRY

The condition for exiting from the loop in xs_tcp_send_request() should be
that we find we're not making progress (i.e. number of bytes sent is 0).

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4c2462e2f2b..4a567a93e6a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -135,12 +135,6 @@ static ctl_table sunrpc_table[] = {
 
 #endif
 
-/*
- * How many times to try sending a request on a socket before waiting
- * for the socket buffer to clear.
- */
-#define XS_SENDMSG_RETRY	(10U)
-
 /*
  * Time out for an RPC UDP socket connect.  UDP socket connects are
  * synchronous, but we set a timeout anyway in case of resource
@@ -666,7 +660,6 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	struct xdr_buf *xdr = &req->rq_snd_buf;
 	int status;
-	unsigned int retry = 0;
 
 	xs_encode_tcp_record_marker(&req->rq_snd_buf);
 
@@ -697,9 +690,10 @@ static int xs_tcp_send_request(struct rpc_task *task)
 			return 0;
 		}
 
+		if (status != 0)
+			continue;
 		status = -EAGAIN;
-		if (retry++ > XS_SENDMSG_RETRY)
-			break;
+		break;
 	}
 
 	switch (status) {
-- 
cgit v1.2.3


From 636ac43318ce6939c1698fb67e714d421314ed71 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 17 Apr 2008 14:00:47 -0400
Subject: SUNRPC: Remove the unused export of xprt_force_disconnect

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprt.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 5110a4ea7fd..a0646a3b4a3 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -605,7 +605,6 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
 	xprt_wake_pending_tasks(xprt, -ENOTCONN);
 	spin_unlock_bh(&xprt->transport_lock);
 }
-EXPORT_SYMBOL_GPL(xprt_force_disconnect);
 
 static void
 xprt_init_autodisconnect(unsigned long data)
-- 
cgit v1.2.3


From 7c1d71cf56feebfb5b98219b9d11dfc3a2feca62 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 17 Apr 2008 16:52:57 -0400
Subject: SUNRPC: Don't disconnect more than once if retransmitting NFSv4
 requests

NFSv4 requires us to ensure that we break the TCP connection before we're
allowed to retransmit a request. However in the case where we're
retransmitting several requests that have been sent on the same
connection, we need to ensure that we don't interfere with the attempt to
reconnect and/or break the connection again once it has been established.

We therefore introduce a 'connection' cookie that is bumped every time a
connection is broken. This allows requests to track if they need to force a
disconnection.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h |  8 ++++++++
 net/sunrpc/clnt.c           |  6 ++++--
 net/sunrpc/xprt.c           | 29 +++++++++++++++++++++++++++++
 net/sunrpc/xprtsock.c       |  2 ++
 4 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 8a0629abb86..4d80a118d53 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -86,6 +86,10 @@ struct rpc_rqst {
 	unsigned long		rq_majortimeo;	/* major timeout alarm */
 	unsigned long		rq_timeout;	/* Current timeout value */
 	unsigned int		rq_retries;	/* # of retries */
+	unsigned int		rq_connect_cookie;
+						/* A cookie used to track the
+						   state of the transport
+						   connection */
 	
 	/*
 	 * Partial send handling
@@ -152,6 +156,9 @@ struct rpc_xprt {
 	unsigned long		connect_timeout,
 				bind_timeout,
 				reestablish_timeout;
+	unsigned int		connect_cookie;	/* A cookie that gets bumped
+						   every time the transport
+						   is reconnected */
 
 	/*
 	 * Disconnection of idle transports
@@ -241,6 +248,7 @@ void			xprt_complete_rqst(struct rpc_task *task, int copied);
 void			xprt_release_rqst_cong(struct rpc_task *task);
 void			xprt_disconnect_done(struct rpc_xprt *xprt);
 void			xprt_force_disconnect(struct rpc_xprt *xprt);
+void			xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
 
 /*
  * Reserved bit positions in xprt->state
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 3ae56046451..8773b4342c9 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1120,7 +1120,8 @@ call_status(struct rpc_task *task)
 	case -ETIMEDOUT:
 		task->tk_action = call_timeout;
 		if (task->tk_client->cl_discrtry)
-			xprt_force_disconnect(task->tk_xprt);
+			xprt_conditional_disconnect(task->tk_xprt,
+					req->rq_connect_cookie);
 		break;
 	case -ECONNREFUSED:
 	case -ENOTCONN:
@@ -1245,7 +1246,8 @@ out_retry:
 	if (task->tk_rqstp == req) {
 		req->rq_received = req->rq_rcv_buf.len = 0;
 		if (task->tk_client->cl_discrtry)
-			xprt_force_disconnect(task->tk_xprt);
+			xprt_conditional_disconnect(task->tk_xprt,
+					req->rq_connect_cookie);
 	}
 }
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a0646a3b4a3..75d748eee0e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -606,6 +606,34 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
 	spin_unlock_bh(&xprt->transport_lock);
 }
 
+/**
+ * xprt_conditional_disconnect - force a transport to disconnect
+ * @xprt: transport to disconnect
+ * @cookie: 'connection cookie'
+ *
+ * This attempts to break the connection if and only if 'cookie' matches
+ * the current transport 'connection cookie'. It ensures that we don't
+ * try to break the connection more than once when we need to retransmit
+ * a batch of RPC requests.
+ *
+ */
+void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
+{
+	/* Don't race with the test_bit() in xprt_clear_locked() */
+	spin_lock_bh(&xprt->transport_lock);
+	if (cookie != xprt->connect_cookie)
+		goto out;
+	if (test_bit(XPRT_CLOSING, &xprt->state) || !xprt_connected(xprt))
+		goto out;
+	set_bit(XPRT_CLOSE_WAIT, &xprt->state);
+	/* Try to schedule an autoclose RPC call */
+	if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
+		queue_work(rpciod_workqueue, &xprt->task_cleanup);
+	xprt_wake_pending_tasks(xprt, -ENOTCONN);
+out:
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
 static void
 xprt_init_autodisconnect(unsigned long data)
 {
@@ -849,6 +877,7 @@ void xprt_transmit(struct rpc_task *task)
 	} else if (!req->rq_bytes_sent)
 		return;
 
+	req->rq_connect_cookie = xprt->connect_cookie;
 	status = xprt->ops->send_request(task);
 	if (status == 0) {
 		dprintk("RPC: %5u xmit complete\n", task->tk_pid);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4a567a93e6a..63d79e347c0 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1142,6 +1142,7 @@ static void xs_tcp_state_change(struct sock *sk)
 		break;
 	case TCP_FIN_WAIT1:
 		/* The client initiated a shutdown of the socket */
+		xprt->connect_cookie++;
 		xprt->reestablish_timeout = 0;
 		set_bit(XPRT_CLOSING, &xprt->state);
 		smp_mb__before_clear_bit();
@@ -1154,6 +1155,7 @@ static void xs_tcp_state_change(struct sock *sk)
 		set_bit(XPRT_CLOSING, &xprt->state);
 		xprt_force_disconnect(xprt);
 	case TCP_SYN_SENT:
+		xprt->connect_cookie++;
 	case TCP_CLOSING:
 		/*
 		 * If the server closed down the connection, make sure that
-- 
cgit v1.2.3


From 7b6962b0a6000df48c8a5fd967d262f77704101b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 17 Apr 2008 16:53:01 -0400
Subject: SUNRPC: Fix a race in gss_refresh_upcall()

If the downcall completes before we get the spin_lock then we currently
fail to refresh the credential.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/auth_gss/auth_gss.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 46f7ec800af..6f1b4e2f5e8 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -117,6 +117,7 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
 	struct gss_cl_ctx *old;
 
 	old = gss_cred->gc_ctx;
+	gss_get_ctx(ctx);
 	rcu_assign_pointer(gss_cred->gc_ctx, ctx);
 	set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
@@ -340,7 +341,7 @@ gss_upcall_callback(struct rpc_task *task)
 
 	spin_lock(&inode->i_lock);
 	if (gss_msg->ctx)
-		gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_get_ctx(gss_msg->ctx));
+		gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_msg->ctx);
 	else
 		task->tk_status = gss_msg->msg.errno;
 	gss_cred->gc_upcall = NULL;
@@ -417,7 +418,11 @@ gss_refresh_upcall(struct rpc_task *task)
 	spin_lock(&inode->i_lock);
 	if (gss_cred->gc_upcall != NULL)
 		rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL);
-	else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) {
+	else if (gss_msg->ctx != NULL) {
+		gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_msg->ctx);
+		gss_cred->gc_upcall = NULL;
+		rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno);
+	} else if (gss_msg->msg.errno >= 0) {
 		task->tk_timeout = 0;
 		gss_cred->gc_upcall = gss_msg;
 		/* gss_upcall_callback will release the reference to gss_upcall_msg */
@@ -462,7 +467,7 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
 		schedule();
 	}
 	if (gss_msg->ctx)
-		gss_cred_set_ctx(cred, gss_get_ctx(gss_msg->ctx));
+		gss_cred_set_ctx(cred, gss_msg->ctx);
 	else
 		err = gss_msg->msg.errno;
 	spin_unlock(&inode->i_lock);
-- 
cgit v1.2.3


From cd019f7517206a74d8cdb64d5c82b1f76be608cc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 17 Apr 2008 17:03:58 -0400
Subject: SUNRPC: Don't change the RPCSEC_GSS context on a credential that is
 in use

When a server rejects our credential with an AUTH_REJECTEDCRED or similar,
we need to refresh the credential and then retry the request.
However, we do want to allow any requests that are in flight to finish
executing, so that we can at least attempt to process the replies that
depend on this instance of the credential.

The solution is to ensure that gss_refresh() looks up an entirely new
RPCSEC_GSS credential instead of attempting to create a context for the
existing invalid credential.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/auth_gss/auth_gss.c | 69 +++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 27 deletions(-)

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 6f1b4e2f5e8..621c07f322c 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -114,28 +114,14 @@ static void
 gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
 {
 	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
-	struct gss_cl_ctx *old;
 
-	old = gss_cred->gc_ctx;
+	if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags))
+		return;
 	gss_get_ctx(ctx);
 	rcu_assign_pointer(gss_cred->gc_ctx, ctx);
 	set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+	smp_mb__before_clear_bit();
 	clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
-	if (old)
-		gss_put_ctx(old);
-}
-
-static int
-gss_cred_is_uptodate_ctx(struct rpc_cred *cred)
-{
-	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
-	int res = 0;
-
-	rcu_read_lock();
-	if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) && gss_cred->gc_ctx)
-		res = 1;
-	rcu_read_unlock();
-	return res;
 }
 
 static const void *
@@ -857,15 +843,12 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
 {
 	struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
 
-	/*
-	 * If the searchflags have set RPCAUTH_LOOKUP_NEW, then
-	 * we don't really care if the credential has expired or not,
-	 * since the caller should be prepared to reinitialise it.
-	 */
-	if ((flags & RPCAUTH_LOOKUP_NEW) && test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
+	if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
 		goto out;
 	/* Don't match with creds that have expired. */
-	if (gss_cred->gc_ctx && time_after(jiffies, gss_cred->gc_ctx->gc_expiry))
+	if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry))
+		return 0;
+	if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags))
 		return 0;
 out:
 	if (acred->machine_cred != gss_cred->gc_machine_cred)
@@ -933,16 +916,48 @@ out_put_ctx:
 	return NULL;
 }
 
+static int gss_renew_cred(struct rpc_task *task)
+{
+	struct rpc_cred *oldcred = task->tk_msg.rpc_cred;
+	struct gss_cred *gss_cred = container_of(oldcred,
+						 struct gss_cred,
+						 gc_base);
+	struct rpc_auth *auth = oldcred->cr_auth;
+	struct auth_cred acred = {
+		.uid = oldcred->cr_uid,
+		.machine_cred = gss_cred->gc_machine_cred,
+	};
+	struct rpc_cred *new;
+
+	new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+	task->tk_msg.rpc_cred = new;
+	put_rpccred(oldcred);
+	return 0;
+}
+
 /*
 * Refresh credentials. XXX - finish
 */
 static int
 gss_refresh(struct rpc_task *task)
 {
+	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	int ret = 0;
+
+	if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
+			!test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) {
+		ret = gss_renew_cred(task);
+		if (ret < 0)
+			goto out;
+		cred = task->tk_msg.rpc_cred;
+	}
 
-	if (!gss_cred_is_uptodate_ctx(task->tk_msg.rpc_cred))
-		return gss_refresh_upcall(task);
-	return 0;
+	if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags))
+		ret = gss_refresh_upcall(task);
+out:
+	return ret;
 }
 
 /* Dummy refresh routine: used only when destroying the context */
-- 
cgit v1.2.3


From 63649bd7080a6a50fabcb1935f4b7c4e64155066 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Thu, 17 Apr 2008 20:42:09 +0400
Subject: NFS - fix potential NULL pointer dereference v2

There is possible NULL pointer dereference if kstr[n]dup failed.
So fix them for safety.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index c99ca1f992c..2215bcd24bd 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1297,6 +1297,8 @@ static int nfs_validate_mount_data(void *options,
 		args->namlen		= data->namlen;
 		args->bsize		= data->bsize;
 		args->auth_flavors[0]	= data->pseudoflavor;
+		if (!args->nfs_server.hostname)
+			goto out_nomem;
 
 		/*
 		 * The legacy version 6 binary mount data from userspace has a
@@ -1343,6 +1345,8 @@ static int nfs_validate_mount_data(void *options,
 		len = c - dev_name;
 		/* N.B. caller will free nfs_server.hostname in all cases */
 		args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
+		if (!args->nfs_server.hostname)
+			goto out_nomem;
 
 		c++;
 		if (strlen(c) > NFS_MAXPATHLEN)
@@ -1386,6 +1390,10 @@ out_v3_not_compiled:
 	return -EPROTONOSUPPORT;
 #endif /* !CONFIG_NFS_V3 */
 
+out_nomem:
+	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
+	return -ENOMEM;
+
 out_no_address:
 	dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");
 	return -EINVAL;
@@ -1892,12 +1900,16 @@ static int nfs4_validate_mount_data(void *options,
 			return -ENAMETOOLONG;
 		/* N.B. caller will free nfs_server.hostname in all cases */
 		args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
+		if (!args->nfs_server.hostname)
+			goto out_nomem;
 
 		c++;			/* step over the ':' */
 		len = strlen(c);
 		if (len > NFS4_MAXPATHLEN)
 			return -ENAMETOOLONG;
 		args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
+		if (!args->nfs_server.export_path)
+			goto out_nomem;
 
 		dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path);
 
@@ -1919,6 +1931,10 @@ out_inval_auth:
 		 data->auth_flavourlen);
 	return -EINVAL;
 
+out_nomem:
+	dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n");
+	return -ENOMEM;
+
 out_no_address:
 	dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
 	return -EINVAL;
-- 
cgit v1.2.3


From daa7da5fd3040e08e3d7878cd0667c1f4cfd338a Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 11 Apr 2008 11:50:58 -0400
Subject: NFS: remove duplicate flags assignment from nfs_validate_mount_data

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2215bcd24bd..88a0ecd84c6 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1275,7 +1275,6 @@ static int nfs_validate_mount_data(void *options,
 		args->flags		= data->flags;
 		args->rsize		= data->rsize;
 		args->wsize		= data->wsize;
-		args->flags		= data->flags;
 		args->timeo		= data->timeo;
 		args->retrans		= data->retrans;
 		args->acregmin		= data->acregmin;
-- 
cgit v1.2.3


From a3dab293539031b0970585b9b355cebbc91ecbd4 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 14 Apr 2008 21:41:32 +0300
Subject: make nfs_automount_list static

nfs_automount_list can now become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/namespace.c     | 2 +-
 include/linux/nfs_fs.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 607f6eb9cdb..af4d0f1e402 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -20,7 +20,7 @@
 
 static void nfs_expire_automounts(struct work_struct *work);
 
-LIST_HEAD(nfs_automount_list);
+static LIST_HEAD(nfs_automount_list);
 static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
 int nfs_mountpoint_expiry_timeout = 500 * HZ;
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 7f0602c8771..27d6a8d98ce 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -430,7 +430,6 @@ extern void nfs_unregister_sysctl(void);
 /*
  * linux/fs/nfs/namespace.c
  */
-extern struct list_head nfs_automount_list;
 extern const struct inode_operations nfs_mountpoint_inode_operations;
 extern const struct inode_operations nfs_referral_inode_operations;
 extern int nfs_mountpoint_expiry_timeout;
-- 
cgit v1.2.3


From b48633bd086d21f4a2a5bea96c7e6c7ba58eb60c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 22 Apr 2008 16:47:55 -0400
Subject: SUNRPC: Invalidate the RPCSEC_GSS session if the server dropped the
 request

RFC 2203 requires the server to drop the request if it believes the
RPCSEC_GSS context is out of sequence. The problem is that we have no way
on the client to know why the server dropped the request. In order to avoid
spinning forever trying to resend the request, the safe approach is
therefore to always invalidate the RPCSEC_GSS context on every major
timeout.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 8773b4342c9..c6efb982057 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1173,6 +1173,11 @@ call_timeout(struct rpc_task *task)
 			clnt->cl_protname, clnt->cl_server);
 	}
 	rpc_force_rebind(clnt);
+	/*
+	 * Did our request time out due to an RPCSEC_GSS out-of-sequence
+	 * event? RFC2203 requires the server to drop all such requests.
+	 */
+	rpcauth_invalcred(task);
 
 retry:
 	clnt->cl_stats->rpcretrans++;
-- 
cgit v1.2.3