From ab0920ce7ebb6d60063c793f227ae198a492251b Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 16 Mar 2006 15:06:37 -0800 Subject: ocfs2: multi node truncate fix Fix ocfs2_truncate_file() so that it forces a truncate_inode_pages() on all interested nodes in all cases of a truncate(), not just allocation change. Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 34e903a6a46..581eb451a41 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -260,6 +260,17 @@ static int ocfs2_truncate_file(struct inode *inode, if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; + /* This forces other nodes to sync and drop their pages. Do + * this even if we have a truncate without allocation change - + * ocfs2 cluster sizes can be much greater than page size, so + * we have to truncate them anyway. */ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + if (le32_to_cpu(fe->i_clusters) == ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", @@ -272,14 +283,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* This forces other nodes to sync and drop their pages */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - ocfs2_data_unlock(inode, 1); - /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking -- cgit v1.2.3 From 1f7bc828e30fe3e23ea0968b9595ad20e2785978 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 10:33:35 -0800 Subject: ocfs2: remove an overly aggressive BUG() in dlmfs Don't BUG() user_dlm_unblock_lock() on the absence of the USER_LOCK_BLOCKED flag - this turns out to be a valid case. Make some of the related BUG() statements print more useful information. Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index c3764f4744e..bac4615965f 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -268,13 +268,26 @@ static void user_dlm_unblock_lock(void *opaque) spin_lock(&lockres->l_lock); - BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); - BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED)); + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); - /* notice that we don't clear USER_LOCK_BLOCKED here. That's - * for user_ast to do. */ + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. 
*/ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n", + lockres->l_name, lockres->l_flags); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(0, "lock is in teardown so we do nothing\n"); spin_unlock(&lockres->l_lock); -- cgit v1.2.3 From cc6eb725955efb026007e1d7da8fe5383981afd2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 10:34:21 -0800 Subject: ocfs2: catch an invalid ast case in dlmfs Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index bac4615965f..d0f1027a385 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -139,6 +139,10 @@ static void user_ast(void *opaque) return; } + mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, + "Lockres %s, requested ivmode. flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= -- cgit v1.2.3 From f43e6918c0e3906fd4483316f6a1a07bba615908 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 18:24:12 -0800 Subject: ocfs2: Handle the DLM_CANCELGRANT case in user_unlock_ast() Remove the code which attempted to catch it via dlmunlock() return status - this never happens there. Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index d0f1027a385..808ec0527c7 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -233,23 +233,38 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); - if (status != DLM_NORMAL) + if (status != DLM_NORMAL && status != DLM_CANCELGRANT) mlog(ML_ERROR, "Dlm returns status %d\n", status); spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) lockres->l_level = LKM_IVMODE; - else { + else if (status == DLM_CANCELGRANT) { + mlog(0, "Lock %s, cancel fails, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. */ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n", + lockres->l_name, lockres->l_flags); lockres->l_requested = LKM_IVMODE; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; /* we want the unblock thread to look at it again * now. 
*/ - __user_dlm_queue_lockres(lockres); + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); } lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); @@ -299,7 +314,9 @@ static void user_dlm_unblock_lock(void *opaque) } if (lockres->l_flags & USER_LOCK_BUSY) { - mlog(0, "BUSY flag detected...\n"); + mlog(0, "Cancel lock %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { spin_unlock(&lockres->l_lock); goto drop_ref; @@ -313,14 +330,7 @@ static void user_dlm_unblock_lock(void *opaque) LKM_CANCEL, user_unlock_ast, lockres); - if (status == DLM_CANCELGRANT) { - /* If we got this, then the ast was fired - * before we could cancel. We cleanup our - * state, and restart the function. */ - spin_lock(&lockres->l_lock); - lockres->l_flags &= ~USER_LOCK_IN_CANCEL; - spin_unlock(&lockres->l_lock); - } else if (status != DLM_NORMAL) + if (status != DLM_NORMAL) user_log_dlm_error("dlmunlock", status, lockres); goto drop_ref; } -- cgit v1.2.3 From 2cd9888590c52ac7592e3607d0a3174ccd57ef86 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 16:49:13 -0800 Subject: ocfs2: test and set teardown flag early in user_dlm_destroy_lock() Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index 808ec0527c7..74ca4e5f976 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -237,9 +237,13 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) mlog(ML_ERROR, "Dlm returns status %d\n", status); spin_lock(&lockres->l_lock); - if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. */ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = LKM_IVMODE; - else if (status == DLM_CANCELGRANT) { + } else if (status == DLM_CANCELGRANT) { mlog(0, "Lock %s, cancel fails, flags 0x%x\n", lockres->l_name, lockres->l_flags); /* We tried to cancel a convert request, but it was @@ -608,6 +612,14 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) mlog(0, "asked to destroy %s\n", lockres->l_name); spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(0, "Lock is already torn down\n"); + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); @@ -633,7 +645,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; - lockres->l_flags |= USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); mlog(0, "unlocking lockres %s\n", lockres->l_name); -- cgit v1.2.3 From a9e2ae39170d01937725e1fff2e606baaa71346c Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 24 Mar 2006 14:20:17 -0800 Subject: ocfs2: Better I/O error handling in heartbeat Propagate errors received in o2hb_bio_end_io() back to the heartbeat thread so it can skip re-arming the timer. 
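In condensed form, the mechanism is a new error field in the shared wait context: the bio completion callback records the first failure it sees, and the heartbeat thread checks it once the I/O has been waited on (field names as introduced by the hunks below):

	struct o2hb_bio_wait_ctxt {
		atomic_t		wc_num_reqs;
		struct completion	wc_io_complete;
		int			wc_error;	/* first I/O error seen, else 0 */
	};

	/* in o2hb_bio_end_io(): remember the failure for the waiter */
	if (error)
		wc->wc_error = error;

	/* in the heartbeat thread, after o2hb_wait_on_io(): */
	if (write_wc.wc_error)
		return write_wc.wc_error;	/* do not re-arm the write timeout */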
Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/heartbeat.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bff0f0d0686..21f38accd03 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -153,6 +153,7 @@ struct o2hb_region { struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; + int wc_error; }; static void o2hb_write_timeout(void *arg) @@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, { atomic_set(&wc->wc_num_reqs, num_ios); init_completion(&wc->wc_io_complete); + wc->wc_error = 0; } /* Used in error paths too */ @@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio, { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) + if (error) { mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } if (bio->bi_size) return 1; @@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, bail_and_wait: o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; if (bios) { for(i = 0; i < num_bios; i++) @@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes, return highest; } -static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; - if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) - return; + ret = o2nm_configured_node_map(configured_nodes, + sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + return ret; + } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); - return; + return -EINVAL; } /* No sense in reading the slots of nodes that don't exist @@ -813,7 +823,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_read_slots(reg, highest_node + 1); if (ret < 0) { mlog_errno(ret); - return; + return ret; } /* With an up to date view of the slots, we can check that no @@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); if (ret < 0) { mlog_errno(ret); - return; + return ret; } i = -1; @@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) */ o2hb_wait_on_io(reg, &write_wc); bio_put(write_bio); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + return write_wc.wc_error; + } + o2hb_arm_write_timeout(reg); /* let the person who launched us know when things are steady */ @@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) if (atomic_dec_and_test(®->hr_steady_iterations)) wake_up(&o2hb_steady_queue); } + + return 0; } /* Subtract b from a, storing the result in a. a *must* have a larger @@ -913,7 +934,10 @@ static int o2hb_thread(void *data) * likely to time itself out. 
*/ do_gettimeofday(&before_hb); - o2hb_do_disk_heartbeat(reg); + i = 0; + do { + ret = o2hb_do_disk_heartbeat(reg); + } while (ret && ++i < 2); do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); -- cgit v1.2.3 From c7f21e4f5a3d4e378e4d453b2be209dcfd1bb964 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:01:01 +0200 Subject: [PATCH] splice: mark the io page as accessed We should do that, since we do the LRU manipulation ourselves now. Suggested by Nick Piggin. Signed-off-by: Jens Axboe --- fs/splice.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index bfa42a277bb..b450acdff39 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -501,6 +501,7 @@ find_page: } else if (ret) goto out; + mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { -- cgit v1.2.3 From 9aefe431f5a000884db7ae74ac208de814fe5913 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 10 Apr 2006 09:02:40 +0200 Subject: [PATCH] splice: potential !page dereference We can get to out: with a NULL page, which we probably don't want to be calling page_cache_release() on. Signed-off-by: Dave Jones Signed-off-by: Jens Axboe --- fs/splice.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index b450acdff39..26f5f7ecee5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -445,7 +445,7 @@ find_page: ret = -ENOMEM; page = find_or_create_page(mapping, index, gfp_mask); if (!page) - goto out; + goto out_nomem; /* * If the page is uptodate, it is also locked. If it isn't @@ -508,6 +508,7 @@ out: page_cache_release(page); unlock_page(page); } +out_nomem: buf->ops->unmap(info, buf); return ret; } -- cgit v1.2.3 From c0bd1f650bd06a43435808d44f1e9520ea806206 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:03:32 +0200 Subject: [PATCH] splice: only call wake_up_interruptible() when we really have to __wake_up_common() is pretty heavy in the kernel profiles, this brings it down to a more acceptable level. 
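The hunks below all apply the standard conditional wake-up idiom: the pipe state is updated first, smp_mb() orders that update against the waitqueue_active() check (pairing with the barrier taken on the sleeper's side in prepare_to_wait()/schedule(), so a task that is about to sleep is not missed), and the comparatively expensive wake-up call is made only when a waiter can actually be present:

	if (do_wakeup) {
		smp_mb();	/* order state update vs. waiter check */
		if (waitqueue_active(PIPE_WAIT(*inode)))
			wake_up_interruptible(PIPE_WAIT(*inode));
		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
	}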
Signed-off-by: Jens Axboe --- fs/splice.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 26f5f7ecee5..9f796b1034d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -187,7 +187,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); do_wakeup = 0; @@ -201,7 +203,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, mutex_unlock(PIPE_MUTEX(*inode)); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); } @@ -600,7 +604,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); do_wakeup = 0; } @@ -611,7 +617,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, mutex_unlock(PIPE_MUTEX(*inode)); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } -- cgit v1.2.3 From 16c523ddabcce5d3d817f4a2491d628f84dfaaa1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:03:58 +0200 Subject: [PATCH] splice: cleanup __generic_file_splice_read() The whole shadow/pages logic got overly complex, and this simpler approach is actually faster in testing. Signed-off-by: Jens Axboe --- fs/splice.c | 59 ++++++++++------------------------------------------------- 1 file changed, 10 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 9f796b1034d..8b5efcc906d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -220,10 +220,10 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; - struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS]; + struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index, pidx; - int i, j; + pgoff_t index; + int i; index = in->f_pos >> PAGE_CACHE_SHIFT; offset = in->f_pos & ~PAGE_CACHE_MASK; @@ -237,42 +237,14 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, */ do_page_cache_readahead(mapping, in, index, nr_pages); - /* - * Get as many pages from the page cache as possible.. - * Start IO on the page cache entries we create (we - * can assume that any pre-existing ones we find have - * already had IO started on them). 
- */ - i = find_get_pages(mapping, index, nr_pages, pages); - - /* - * common case - we found all pages and they are contiguous, - * kick them off - */ - if (i && (pages[i - 1]->index == index + i - 1)) - goto splice_them; - - /* - * fill shadow[] with pages at the right locations, so we only - * have to fill holes - */ - memset(shadow, 0, nr_pages * sizeof(struct page *)); - for (j = 0; j < i; j++) - shadow[pages[j]->index - index] = pages[j]; - /* * now fill in the holes */ - for (i = 0, pidx = index; i < nr_pages; pidx++, i++) { - int error; - - if (shadow[i]) - continue; - + for (i = 0; i < nr_pages; i++, index++) { /* * no page there, look one up / create it */ - page = find_or_create_page(mapping, pidx, + page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); if (!page) break; @@ -280,31 +252,20 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, if (PageUptodate(page)) unlock_page(page); else { - error = mapping->a_ops->readpage(in, page); + int error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { page_cache_release(page); break; } } - shadow[i] = page; - } - - if (!i) { - for (i = 0; i < nr_pages; i++) { - if (shadow[i]) - page_cache_release(shadow[i]); - } - return 0; + pages[i] = page; } - memcpy(pages, shadow, i * sizeof(struct page *)); + if (i) + return move_to_pipe(pipe, pages, i, offset, len, flags); - /* - * Now we splice them into the pipe.. - */ -splice_them: - return move_to_pipe(pipe, pages, i, offset, len, flags); + return 0; } /** -- cgit v1.2.3 From 49d0b21be21efc07526d637e0ae935019667e532 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:04:41 +0200 Subject: [PATCH] splice: optimize the splice buffer mapping We don't really need to lock down the pages, just make sure they are uptodate. Signed-off-by: Jens Axboe --- fs/splice.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 8b5efcc906d..50c43a1e092 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -84,26 +84,43 @@ static void *page_cache_pipe_buf_map(struct file *file, struct pipe_buffer *buf) { struct page *page = buf->page; - - lock_page(page); + int err; if (!PageUptodate(page)) { - unlock_page(page); - return ERR_PTR(-EIO); - } + lock_page(page); + + /* + * Page got truncated/unhashed. 
This will cause a 0-byte + * splice, if this is the first page + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } + + /* + * uh oh, read-error from disk + */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } - if (!page->mapping) { + /* + * page is ok afterall, fall through to mapping + */ unlock_page(page); - return ERR_PTR(-ENODATA); } - return kmap(buf->page); + return kmap(page); +error: + unlock_page(page); + return ERR_PTR(err); } static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) { - unlock_page(buf->page); kunmap(buf->page); } @@ -379,7 +396,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, int ret; /* - * after this, page will be locked and unmapped + * make sure the data in this buffer is uptodate */ src = buf->ops->map(file, info, buf); if (IS_ERR(src)) @@ -399,6 +416,9 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if (buf->ops->steal(info, buf)) goto find_page; + /* + * this will also set the page locked + */ page = buf->page; if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; -- cgit v1.2.3 From 0b749ce3802428007a37870eb51ba3c0bdf90857 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:05:04 +0200 Subject: [PATCH] splice: be smarter about calling do_page_cache_readahead() We don't want to call into the read-ahead logic unless we are at the start of a page, _or_ we have multiple pages to read. Signed-off-by: Jens Axboe --- fs/splice.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 50c43a1e092..9bfd6af0cf4 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -250,9 +250,12 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range + * initiate read-ahead on this page range. however, don't call into + * read-ahead if this is a non-zero offset (we are likely doing small + * chunk splice and the page is already there) for a single page. */ - do_page_cache_readahead(mapping, in, index, nr_pages); + if (!offset || nr_pages > 1) + do_page_cache_readahead(mapping, in, index, nr_pages); /* * now fill in the holes -- cgit v1.2.3 From 3a326a2ce88e71d00ac0d133e314a3342a7709f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Apr 2006 15:18:35 +0200 Subject: [PATCH] introduce a "kernel-internal pipe object" abstraction separate out the 'internal pipe object' abstraction, and make it usable to splice. This cleans up and fixes several aspects of the internal splice APIs and the pipe code: - pipes: the allocation and freeing of pipe_inode_info is now more symmetric and more streamlined with existing kernel practices. - splice: small micro-optimization: less pointer dereferencing in splice methods Signed-off-by: Ingo Molnar Update XFS for the ->splice_read/->splice_write changes. 
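With the pipe object passed around directly, a filesystem that is happy with the generic page cache path can point its splice hooks straight at the generic helpers; the wrapper below is a hypothetical illustration (myfs is not part of this series), the real XFS equivalents are updated in the hunks that follow:

	static ssize_t myfs_file_splice_read(struct file *in,
					     struct pipe_inode_info *pipe,
					     size_t len, unsigned int flags)
	{
		return generic_file_splice_read(in, pipe, len, flags);
	}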
Signed-off-by: Jens Axboe --- fs/fifo.c | 12 +++-- fs/pipe.c | 51 +++++++++--------- fs/splice.c | 122 ++++++++++++++++++++++--------------------- fs/xfs/linux-2.6/xfs_file.c | 8 +-- fs/xfs/linux-2.6/xfs_lrw.c | 4 +- fs/xfs/linux-2.6/xfs_lrw.h | 4 +- fs/xfs/linux-2.6/xfs_vnode.h | 4 +- 7 files changed, 106 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index 889f722ee36..b16e2f597d6 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -15,12 +15,13 @@ #include #include -static void wait_for_partner(struct inode* inode, unsigned int* cnt) +static void wait_for_partner(struct inode* inode, unsigned int *cnt) { int cur = *cnt; - while(cur == *cnt) { - pipe_wait(inode); - if(signal_pending(current)) + + while (cur == *cnt) { + pipe_wait(inode->i_pipe); + if (signal_pending(current)) break; } } @@ -37,7 +38,8 @@ static int fifo_open(struct inode *inode, struct file *filp) mutex_lock(PIPE_MUTEX(*inode)); if (!inode->i_pipe) { ret = -ENOMEM; - if(!pipe_new(inode)) + inode->i_pipe = alloc_pipe_info(inode); + if (!inode->i_pipe) goto err_nocleanup; } filp->f_version = 0; diff --git a/fs/pipe.c b/fs/pipe.c index 795df987cd3..705b4869262 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -36,7 +36,7 @@ */ /* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct inode * inode) +void pipe_wait(struct pipe_inode_info *pipe) { DEFINE_WAIT(wait); @@ -44,11 +44,13 @@ void pipe_wait(struct inode * inode) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); - mutex_unlock(PIPE_MUTEX(*inode)); + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); schedule(); - finish_wait(PIPE_WAIT(*inode), &wait); - mutex_lock(PIPE_MUTEX(*inode)); + finish_wait(&pipe->wait, &wait); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); } static int @@ -223,7 +225,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } - pipe_wait(inode); + pipe_wait(inode->i_pipe); } mutex_unlock(PIPE_MUTEX(*inode)); /* Signal writers asynchronously that there is more room. 
*/ @@ -370,7 +372,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; } PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); + pipe_wait(inode->i_pipe); PIPE_WAITING_WRITERS(*inode)--; } out: @@ -675,6 +677,20 @@ static struct file_operations rdwr_pipe_fops = { .fasync = pipe_rdwr_fasync, }; +struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +{ + struct pipe_inode_info *info; + + info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (info) { + init_waitqueue_head(&info->wait); + info->r_counter = info->w_counter = 1; + info->inode = inode; + } + + return info; +} + void free_pipe_info(struct inode *inode) { int i; @@ -691,23 +707,6 @@ void free_pipe_info(struct inode *inode) kfree(info); } -struct inode* pipe_new(struct inode* inode) -{ - struct pipe_inode_info *info; - - info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); - if (!info) - goto fail_page; - inode->i_pipe = info; - - init_waitqueue_head(PIPE_WAIT(*inode)); - PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; - - return inode; -fail_page: - return NULL; -} - static struct vfsmount *pipe_mnt __read_mostly; static int pipefs_delete_dentry(struct dentry *dentry) { @@ -724,8 +723,10 @@ static struct inode * get_pipe_inode(void) if (!inode) goto fail_inode; - if(!pipe_new(inode)) + inode->i_pipe = alloc_pipe_info(inode); + if (!inode->i_pipe) goto fail_iput; + PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; inode->i_fop = &rdwr_pipe_fops; diff --git a/fs/splice.c b/fs/splice.c index 9bfd6af0cf4..ed91a62402e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -136,34 +136,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 
*/ -static ssize_t move_to_pipe(struct inode *inode, struct page **pages, +static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, int nr_pages, unsigned long offset, unsigned long len, unsigned int flags) { - struct pipe_inode_info *info; int ret, do_wakeup, i; ret = 0; do_wakeup = 0; i = 0; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1); - struct pipe_buffer *buf = info->bufs + newbuf; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pages[i++]; unsigned long this_len; @@ -175,7 +174,7 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, buf->offset = offset; buf->len = this_len; buf->ops = &page_cache_pipe_buf_ops; - info->nrbufs = ++bufs; + pipe->nrbufs = ++bufs; do_wakeup = 1; ret += this_len; @@ -205,25 +204,25 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, - POLL_IN); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } while (i < nr_pages) @@ -232,8 +231,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, return ret; } -static int __generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len, unsigned int flags) +static int +__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; @@ -298,7 +298,7 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, * Will read pages from given file and fill them into a pipe. * */ -ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, +ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t spliced; @@ -306,6 +306,7 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, ret = 0; spliced = 0; + while (len) { ret = __generic_file_splice_read(in, pipe, len, flags); @@ -509,11 +510,10 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 
*/ -static ssize_t move_from_pipe(struct inode *inode, struct file *out, +static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, size_t len, unsigned int flags, splice_actor *actor) { - struct pipe_inode_info *info; int ret, do_wakeup, err; struct splice_desc sd; @@ -525,22 +525,22 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, sd.file = out; sd.pos = out->f_pos; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; if (sd.len > sd.total_len) sd.len = sd.total_len; - err = actor(info, buf, &sd); + err = actor(pipe, buf, &sd); if (err) { if (!ret && err != -ENODATA) ret = err; @@ -553,10 +553,10 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, buf->len -= sd.len; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } @@ -568,9 +568,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, if (bufs) continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { if (ret) break; } @@ -589,22 +589,23 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); do_wakeup = 0; } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } mutex_lock(&out->f_mapping->host->i_mutex); @@ -616,7 +617,7 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, /** * generic_file_splice_write - splice data from a pipe to a file - * @inode: pipe inode + * @pipe: pipe info * @out: file to write to * @len: number of bytes to splice * @flags: splice modifier flags @@ -625,11 +626,14 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, * the given pipe inode to the given file. * */ -ssize_t generic_file_splice_write(struct inode *inode, struct file *out, - size_t len, unsigned int flags) +ssize_t +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; - ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file); + ssize_t ret; + + ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); /* * if file or inode is SYNC and we actually wrote some data, sync it @@ -664,10 +668,10 @@ EXPORT_SYMBOL(generic_file_splice_write); * is involved. 
* */ -ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, size_t len, unsigned int flags) { - return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); + return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -675,8 +679,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. */ -static long do_splice_from(struct inode *pipe, struct file *out, size_t len, - unsigned int flags) +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + size_t len, unsigned int flags) { loff_t pos; int ret; @@ -698,8 +702,8 @@ static long do_splice_from(struct inode *pipe, struct file *out, size_t len, /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, struct inode *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { loff_t pos, isize, left; int ret; @@ -732,14 +736,14 @@ static long do_splice_to(struct file *in, struct inode *pipe, size_t len, static long do_splice(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct inode *pipe; + struct pipe_inode_info *pipe; - pipe = in->f_dentry->d_inode; - if (pipe->i_pipe) + pipe = in->f_dentry->d_inode->i_pipe; + if (pipe) return do_splice_from(pipe, out, len, flags); - pipe = out->f_dentry->d_inode; - if (pipe->i_pipe) + pipe = out->f_dentry->d_inode->i_pipe; + if (pipe) return do_splice_to(in, pipe, len, flags); return -EINVAL; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ae4c4754ed3..269721af02f 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -252,7 +252,7 @@ xfs_file_sendfile_invis( STATIC ssize_t xfs_file_splice_read( struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { @@ -266,7 +266,7 @@ xfs_file_splice_read( STATIC ssize_t xfs_file_splice_read_invis( struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { @@ -279,7 +279,7 @@ xfs_file_splice_read_invis( STATIC ssize_t xfs_file_splice_write( - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t len, unsigned int flags) @@ -293,7 +293,7 @@ xfs_file_splice_write( STATIC ssize_t xfs_file_splice_write_invis( - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t len, unsigned int flags) diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 90cd314acba..74a52937f20 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -338,7 +338,7 @@ ssize_t xfs_splice_read( bhv_desc_t *bdp, struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t count, int flags, int ioflags, @@ -380,7 +380,7 @@ xfs_splice_read( ssize_t xfs_splice_write( bhv_desc_t *bdp, - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t count, int flags, diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index eaa5659713f..55c689a86ad 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -94,9 +94,9 @@ extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, - struct inode *, size_t, 
int, int, + struct pipe_inode_info *, size_t, int, int, struct cred *); -extern ssize_t xfs_splice_write(struct bhv_desc *, struct inode *, +extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, struct file *, size_t, int, int, struct cred *); diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 6f1c79a28f8..88b09f18628 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -174,9 +174,9 @@ typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, - struct inode *, size_t, int, int, + struct pipe_inode_info *, size_t, int, int, struct cred *); -typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct inode *, +typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, struct file *, size_t, int, int, struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, -- cgit v1.2.3 From 529565dcb1581c9a1e3f6df1c1763ca3e0f0d512 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Apr 2006 15:18:58 +0200 Subject: [PATCH] splice: add optional input and output offsets add optional input and output offsets to sys_splice(), for seekable file descriptors: asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, size_t len, unsigned int flags); semantics are straightforward: f_pos will be updated with the offset provided by user-space, before the splice transfer is about to begin. Providing a NULL offset pointer means the existing f_pos will be used (and updated in situ). Providing an offset for a pipe results in -ESPIPE. Providing an invalid offset pointer results in -EFAULT. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/splice.c | 54 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index ed91a62402e..a5326127aad 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -680,7 +680,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t __user *off_out, size_t len, + unsigned int flags) { loff_t pos; int ret; @@ -691,7 +692,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (!(out->f_mode & FMODE_WRITE)) return -EBADF; + if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t))) + return -EFAULT; + pos = out->f_pos; + ret = rw_verify_area(WRITE, out, &pos, len); if (unlikely(ret < 0)) return ret; @@ -702,8 +707,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, /* * Attempt to initiate a splice from a file to a pipe. 
*/ -static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +static long do_splice_to(struct file *in, loff_t __user *off_in, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { loff_t pos, isize, left; int ret; @@ -714,7 +720,11 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, if (!(in->f_mode & FMODE_READ)) return -EBADF; + if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + return -EFAULT; + pos = in->f_pos; + ret = rw_verify_area(READ, in, &pos, len); if (unlikely(ret < 0)) return ret; @@ -733,23 +743,39 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, /* * Determine where to splice to/from. */ -static long do_splice(struct file *in, struct file *out, size_t len, - unsigned int flags) +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { struct pipe_inode_info *pipe; + if (off_out && out->f_op->llseek == no_llseek) + return -EINVAL; + if (off_in && in->f_op->llseek == no_llseek) + return -EINVAL; + pipe = in->f_dentry->d_inode->i_pipe; - if (pipe) - return do_splice_from(pipe, out, len, flags); + if (pipe) { + if (off_in) + return -ESPIPE; + + return do_splice_from(pipe, out, off_out, len, flags); + } pipe = out->f_dentry->d_inode->i_pipe; - if (pipe) - return do_splice_to(in, pipe, len, flags); + if (pipe) { + if (off_out) + return -ESPIPE; + + return do_splice_to(in, off_in, pipe, len, flags); + } return -EINVAL; } -asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) +asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + size_t len, unsigned int flags) { long error; struct file *in, *out; @@ -759,13 +785,15 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) return 0; error = -EBADF; - in = fget_light(fdin, &fput_in); + in = fget_light(fd_in, &fput_in); if (in) { if (in->f_mode & FMODE_READ) { - out = fget_light(fdout, &fput_out); + out = fget_light(fd_out, &fput_out); if (out) { if (out->f_mode & FMODE_WRITE) - error = do_splice(in, out, len, flags); + error = do_splice(in, off_in, + out, off_out, + len, flags); fput_light(out, fput_out); } } -- cgit v1.2.3 From cbca692c246874a3cc1b5a9b694add4c39e8bc18 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Thu, 23 Mar 2006 00:36:54 +0100 Subject: [PATCH] Bogus NULL pointer check in fs/configfs/dir.c We check the "group" pointer after we dereference it. This check is bogus, as it cannot be NULL coming in. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 8ed9b06a982..5638c8f9362 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -504,7 +504,7 @@ static int populate_groups(struct config_group *group) int ret = 0; int i; - if (group && group->default_groups) { + if (group->default_groups) { /* FYI, we're faking mkdir here * I'm not sure we need this semaphore, as we're called * from our parent's mkdir. That holds our parent's -- cgit v1.2.3 From 65714b918415e06c92426f6544b2296dae694590 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 26 Mar 2006 14:25:52 +0200 Subject: [PATCH] CONFIGFS_FS must depend on SYSFS This patch fixes the a compile error with CONFIG_SYSFS=n Configfs is creating, as a matter of policy, the /sys/kernel/config mountpoint. 
This means it requires CONFIG_SYSFS. Signed-off-by: Adrian Bunk Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index e207be68d4c..97f31741312 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -861,7 +861,7 @@ config RAMFS config CONFIGFS_FS tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on SYSFS && EXPERIMENTAL help configfs is a ram-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based -- cgit v1.2.3 From de12a7878c11f3b282d640888aa635e0711d0b5e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Apr 2006 17:16:49 -0600 Subject: [PATCH] de_thread: Don't confuse users do_each_thread. Oleg Nesterov spotted two interesting bugs with the current de_thread code. The simplest is a long standing double decrement of __get_cpu_var(process_counts) in __unhash_process. Caused by two processes exiting when only one was created. The other is that since we no longer detach from the thread_group list it is possible for do_each_thread when run under the tasklist_lock to see the same task_struct twice. Once on the task list as a thread_group_leader, and once on the thread list of another thread. The double appearance in do_each_thread can cause a double increment of mm_core_waiters in zap_threads resulting in problems later on in coredump_wait. To remedy those two problems this patch takes the simple approach of changing the old thread group leader into a child thread. The only routine in release_task that cares is __unhash_process, and it can be trivially seen that we handle cleaning up a thread group leader properly. Since de_thread doesn't change the pid of the exiting leader process and instead shares it with the new leader process. I change thread_group_leader to recognize group leadership based on the group_leader field and not based on pids. This should also be slightly cheaper then the existing thread_group_leader macro. I performed a quick audit and I couldn't see any user of thread_group_leader that cared about the difference. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- fs/exec.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 0291a68a362..4d38ad0b70d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -723,7 +723,12 @@ static int de_thread(struct task_struct *tsk) current->parent = current->real_parent = leader->real_parent; leader->parent = leader->real_parent = child_reaper; current->group_leader = current; - leader->group_leader = leader; + leader->group_leader = current; + + /* Reduce leader to a thread */ + detach_pid(leader, PIDTYPE_PGID); + detach_pid(leader, PIDTYPE_SID); + list_del_init(&leader->tasks); add_parent(current); add_parent(leader); -- cgit v1.2.3 From e50bd16fe49689bc5fb54fca5ed8b568dfba65c6 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:10:45 +1000 Subject: [XFS] Fix superblock validation regression for the zero imaxpct case. Thanks to kjamieson for noticing. 
SGI-PV: 951661 SGI-Modid: xfs-linux-melb:xfs-kern:25675a Signed-off-by: Nathan Scott --- fs/xfs/xfs_mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 049fabb7f7e..c0b1c290688 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -270,7 +270,7 @@ xfs_mount_validate_sb( (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 || sbp->sb_imax_pct < 1))) { + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); return XFS_ERROR(EFSCORRUPTED); } -- cgit v1.2.3 From 8272145c05c6d01a34f5114357c5e8093fb66472 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:10:55 +1000 Subject: [XFS] Fix a writepage regression where we accidentally stopped honouring nonblock mode with the new IO path code (since 2.6.16). SGI-PV: 951662 SGI-Modid: xfs-linux-melb:xfs-kern:25676a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 6cbbd165c60..4d191ef39b6 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -870,12 +870,14 @@ xfs_page_state_convert( pgoff_t end_index, last_index, tlast; ssize_t size, len; int flags, err, iomap_valid = 0, uptodate = 1; - int page_dirty, count = 0, trylock_flag = 0; + int page_dirty, count = 0; + int trylock = 0; int all_bh = unmapped; - /* wait for other IO threads? */ - if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)) - trylock_flag |= BMAPI_TRYLOCK; + if (startio) { + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + trylock |= BMAPI_TRYLOCK; + } /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -956,15 +958,13 @@ xfs_page_state_convert( if (buffer_unwritten(bh)) { type = IOMAP_UNWRITTEN; - flags = BMAPI_WRITE|BMAPI_IGNSTATE; + flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { type = IOMAP_DELAY; - flags = BMAPI_ALLOCATE; - if (!startio) - flags |= trylock_flag; + flags = BMAPI_ALLOCATE | trylock; } else { type = IOMAP_NEW; - flags = BMAPI_WRITE|BMAPI_MMAP; + flags = BMAPI_WRITE | BMAPI_MMAP; } if (!iomap_valid) { -- cgit v1.2.3 From 1fc5d959d88a5f77aa7e4435f6c9d0e2d2236704 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 11 Apr 2006 15:11:12 +1000 Subject: [XFS] Fix inode reclaim scalability regression. When a filesystem has millions of inodes cached and has sparse cluster population, removing inodes from the cluster hash consumes excessive amounts of CPU time. Reduce the CPU cost by making removal O(1) via use of a double linked list for the hash chains. 
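With only a forward pointer, xfs_iextract() had to walk ch->ch_list from the head just to find the predecessor of the entry being removed; once each entry also carries chl_prev, unlinking touches only its two neighbours. Schematically, with the names used in the hunks below:

	if (chl->chl_prev)
		chl->chl_prev->chl_next = chl->chl_next;
	else
		ch->ch_list = chl->chl_next;	/* chl was the chain head */
	if (chl->chl_next)
		chl->chl_next->chl_prev = chl->chl_prev;
	kmem_zone_free(xfs_chashlist_zone, chl);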
SGI-PV: 951551 SGI-Modid: xfs-linux-melb:xfs-kern:25683a Signed-off-by: David Chinner Signed-off-by: Nathan Scott --- fs/xfs/xfs_iget.c | 29 ++++++++++++----------------- fs/xfs/xfs_inode.h | 1 + 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index bb33113eef9..b5385432526 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -421,7 +421,10 @@ finish_inode: ip->i_chash = chlnew; chlnew->chl_ip = ip; chlnew->chl_blkno = ip->i_blkno; + if (ch->ch_list) + ch->ch_list->chl_prev = chlnew; chlnew->chl_next = ch->ch_list; + chlnew->chl_prev = NULL; ch->ch_list = chlnew; chlnew = NULL; } @@ -723,23 +726,15 @@ xfs_iextract( ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); ASSERT(ip->i_chash != NULL); chm=NULL; - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { - if (chl->chl_blkno == ip->i_blkno) { - if (chm == NULL) { - /* first item on the list */ - ch->ch_list = chl->chl_next; - } else { - chm->chl_next = chl->chl_next; - } - kmem_zone_free(xfs_chashlist_zone, chl); - break; - } else { - ASSERT(chl->chl_ip != ip); - chm = chl; - } - } - ASSERT_ALWAYS(chl != NULL); - } else { + chl = ip->i_chash; + if (chl->chl_prev) + chl->chl_prev->chl_next = chl->chl_next; + else + ch->ch_list = chl->chl_next; + if (chl->chl_next) + chl->chl_next->chl_prev = chl->chl_prev; + kmem_zone_free(xfs_chashlist_zone, chl); + } else { /* delete one inode from a non-empty list */ iq = ip->i_cnext; iq->i_cprev = ip->i_cprev; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 39ef9c36ea5..3b544db1790 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -189,6 +189,7 @@ typedef struct xfs_ihash { */ typedef struct xfs_chashlist { struct xfs_chashlist *chl_next; + struct xfs_chashlist *chl_prev; struct xfs_inode *chl_ip; xfs_daddr_t chl_blkno; /* starting block number of * the cluster */ -- cgit v1.2.3 From 58829e490ee805f1c8b3009abc90e2a1a7a0d278 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 11 Apr 2006 15:11:20 +1000 Subject: [XFS] Fix an inode use-after-free durin an unpin. When reclaiming inodes that have been unlinked, we may need to execute transactions during reclaim. By the time the transaction has hit the disk, the linux inode and xfs vnode may already have been freed so we can't reference them safely. Use the known xfs inode state to determine if it is safe to reference the vnode and linux inode during the unpin operation. SGI-PV: 946321 SGI-Modid: xfs-linux-melb:xfs-kern:25687a Signed-off-by: David Chinner Signed-off-by: Nathan Scott --- fs/xfs/xfs_inode.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 48146bdc6bd..94b60dd0380 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2732,16 +2732,29 @@ xfs_iunpin( ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) { - vnode_t *vp = XFS_ITOV_NULL(ip); + /* + * If the inode is currently being reclaimed, the + * linux inode _and_ the xfs vnode may have been + * freed so we cannot reference either of them safely. + * Hence we should not try to do anything to them + * if the xfs inode is currently in the reclaim + * path. + * + * However, we still need to issue the unpin wakeup + * call as the inode reclaim may be blocked waiting for + * the inode to become unpinned. 
+ */ + if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { + vnode_t *vp = XFS_ITOV_NULL(ip); - /* make sync come back and flush this inode */ - if (vp) { - struct inode *inode = vn_to_inode(vp); + /* make sync come back and flush this inode */ + if (vp) { + struct inode *inode = vn_to_inode(vp); - if (!(inode->i_state & I_NEW)) - mark_inode_dirty_sync(inode); + if (!(inode->i_state & I_NEW)) + mark_inode_dirty_sync(inode); + } } - wake_up(&ip->i_ipin_wait); } } -- cgit v1.2.3 From 8c0b5113a55c698f3190ec85925815640f1c2049 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:12:45 +1000 Subject: [XFS] Fix utime(2) in the case that no times parameter was passed in. SGI-PV: 949858 SGI-Modid: xfs-linux-melb:xfs-kern:25717a Signed-off-by: Jes Sorensen Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_iops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 149237304fb..2e2e275c786 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,8 +673,7 @@ xfs_vn_setattr( if (ia_valid & ATTR_ATIME) { vattr.va_mask |= XFS_AT_ATIME; vattr.va_atime = attr->ia_atime; - if (ia_valid & ATTR_ATIME_SET) - inode->i_atime = attr->ia_atime; + inode->i_atime = attr->ia_atime; } if (ia_valid & ATTR_MTIME) { vattr.va_mask |= XFS_AT_MTIME; -- cgit v1.2.3 From 019ff2d57b0bbe77d1eca19f5b634e5e7ff2a0b8 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:45:05 +1000 Subject: [XFS] Fix a problem in aligning inode allocations to stripe unit boundaries. SGI-PV: 951862 SGI-Modid: xfs-linux-melb:xfs-kern:25726a Signed-off-by: Nathan Scott --- fs/xfs/xfs_ialloc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 4eeb856183b..deddbd03c16 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -158,9 +158,10 @@ xfs_ialloc_ag_alloc( */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); - if(likely(newino != NULLAGINO)) { - args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + XFS_IALLOC_BLOCKS(args.mp); + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, be32_to_cpu(agi->agi_seqno), args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; @@ -182,8 +183,8 @@ xfs_ialloc_ag_alloc( * Set the alignment for the allocation. * If stripe alignment is turned on then align at stripe unit * boundary. - * If the cluster size is smaller than a filesystem block - * then we're doing I/O for inodes in filesystem block size + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size * pieces, so don't need alignment anyway. */ isaligned = 0; @@ -192,7 +193,7 @@ xfs_ialloc_ag_alloc( args.alignment = args.mp->m_dalign; isaligned = 1; } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= + args.mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) args.alignment = args.mp->m_sb.sb_inoalignmt; @@ -220,7 +221,7 @@ xfs_ialloc_ag_alloc( if ((error = xfs_alloc_vextent(&args))) return error; } - + /* * If stripe alignment is turned on, then try again with cluster * alignment. 
-- cgit v1.2.3 From b92ce55893745e011edae70830b8bc863be881f9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:52:07 +0200 Subject: [PATCH] splice: add direct fd <-> fd splicing support It's more efficient for sendfile() emulation. Basically we cache an internal private pipe and just use that as the intermediate area for pages. Direct splicing is not available from sys_splice(), it is only meant to be used for sendfile() emulation. Additional patch from Ingo Molnar to avoid the PIPE_BUFFERS loop at exit for the normal fast path. Signed-off-by: Jens Axboe --- fs/pipe.c | 10 ++-- fs/splice.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 137 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 705b4869262..036536f072c 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -691,12 +691,10 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) return info; } -void free_pipe_info(struct inode *inode) +void __free_pipe_info(struct pipe_inode_info *info) { int i; - struct pipe_inode_info *info = inode->i_pipe; - inode->i_pipe = NULL; for (i = 0; i < PIPE_BUFFERS; i++) { struct pipe_buffer *buf = info->bufs + i; if (buf->ops) @@ -707,6 +705,12 @@ void free_pipe_info(struct inode *inode) kfree(info); } +void free_pipe_info(struct inode *inode) +{ + __free_pipe_info(inode->i_pipe); + inode->i_pipe = NULL; +} + static struct vfsmount *pipe_mnt __read_mostly; static int pipefs_delete_dentry(struct dentry *dentry) { diff --git a/fs/splice.c b/fs/splice.c index a5326127aad..c47b561edac 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -680,8 +680,7 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t __user *off_out, size_t len, - unsigned int flags) + size_t len, unsigned int flags) { loff_t pos; int ret; @@ -692,9 +691,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (!(out->f_mode & FMODE_WRITE)) return -EBADF; - if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t))) - return -EFAULT; - pos = out->f_pos; ret = rw_verify_area(WRITE, out, &pos, len); @@ -707,9 +703,8 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, loff_t __user *off_in, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { loff_t pos, isize, left; int ret; @@ -720,9 +715,6 @@ static long do_splice_to(struct file *in, loff_t __user *off_in, if (!(in->f_mode & FMODE_READ)) return -EBADF; - if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) - return -EFAULT; - pos = in->f_pos; ret = rw_verify_area(READ, in, &pos, len); @@ -740,6 +732,118 @@ static long do_splice_to(struct file *in, loff_t __user *off_in, return in->f_op->splice_read(in, pipe, len, flags); } +long do_splice_direct(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *pipe; + long ret, bytes; + umode_t i_mode; + int i; + + /* + * We require the input being a regular file, as we don't want to + * randomly drop data for eg socket -> socket splicing. Use the + * piped splicing for that! 
+ */ + i_mode = in->f_dentry->d_inode->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, setup an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (!pipe) { + pipe = alloc_pipe_info(NULL); + if (!pipe) + return -ENOMEM; + + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the move_to_pipe(). So set + * PIPE_READERS appropriately. + */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * do the splice + */ + ret = 0; + bytes = 0; + + while (len) { + size_t read_len, max_read_len; + + /* + * Do at most PIPE_BUFFERS pages worth of transfer: + */ + max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + + ret = do_splice_to(in, pipe, max_read_len, flags); + if (unlikely(ret < 0)) + goto out_release; + + read_len = ret; + + /* + * NOTE: nonblocking mode only applies to the input. We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = do_splice_from(pipe, out, read_len, + flags & ~SPLICE_F_NONBLOCK); + if (unlikely(ret < 0)) + goto out_release; + + bytes += ret; + len -= ret; + + /* + * In nonblocking mode, if we got back a short read then + * that was due to either an IO error or due to the + * pagecache entry not being there. In the IO error case + * the _next_ splice attempt will produce a clean IO error + * return value (not a short read), so in both cases it's + * correct to break out of the loop here: + */ + if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) + break; + } + + pipe->nrbufs = pipe->curbuf = 0; + + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < PIPE_BUFFERS; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + pipe->nrbufs = pipe->curbuf = 0; + + /* + * If we transferred some data, return the number of bytes: + */ + if (bytes > 0) + return bytes; + + return ret; +} + +EXPORT_SYMBOL(do_splice_direct); + /* * Determine where to splice to/from. */ @@ -749,25 +853,33 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *pipe; - if (off_out && out->f_op->llseek == no_llseek) - return -EINVAL; - if (off_in && in->f_op->llseek == no_llseek) - return -EINVAL; - pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { if (off_in) return -ESPIPE; + if (off_out) { + if (out->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&out->f_pos, off_out, + sizeof(loff_t))) + return -EFAULT; + } - return do_splice_from(pipe, out, off_out, len, flags); + return do_splice_from(pipe, out, len, flags); } pipe = out->f_dentry->d_inode->i_pipe; if (pipe) { if (off_out) return -ESPIPE; + if (off_in) { + if (in->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + return -EFAULT; + } - return do_splice_to(in, off_in, pipe, len, flags); + return do_splice_to(in, pipe, len, flags); } return -EINVAL; -- cgit v1.2.3 From 7480a90435673b4c717b6caf1350ec577d5f1adf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:52:47 +0200 Subject: [PATCH] splice: speedup __generic_file_splice_read Using find_get_page() is a lot faster than find_or_create_page(). This gets splice a lot closer to sendfile() for fd -> socket transfers. 
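The win is that the common case becomes a lockless page-cache lookup, with allocation, LRU insertion and readpage paid only on a miss. Roughly, the new per-page shape looks like this (kernel context, condensed from the diff below; the real code also handles failed allocations, truncated pages and AOP_TRUNCATED_PAGE retries):

    page = find_get_page(mapping, index);           /* no lock, no allocation */
    if (!page) {
            page = page_cache_alloc_cold(mapping);  /* miss: fall back to slow path */
            error = add_to_page_cache_lru(page, mapping, index,
                                          mapping_gfp_mask(mapping));
            if (!error)
                    error = mapping->a_ops->readpage(in, page);
    }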
Signed-off-by: Jens Axboe --- fs/splice.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index c47b561edac..e30743c2c06 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -240,7 +240,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, struct page *pages[PIPE_BUFFERS]; struct page *page; pgoff_t index; - int i; + int i, error; index = in->f_pos >> PAGE_CACHE_SHIFT; offset = in->f_pos & ~PAGE_CACHE_MASK; @@ -260,32 +260,84 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, /* * now fill in the holes */ + error = 0; for (i = 0; i < nr_pages; i++, index++) { +find_page: /* - * no page there, look one up / create it + * lookup the page for this index */ - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - if (!page) - break; + page = find_get_page(mapping, index); + if (!page) { + /* + * If in nonblock mode then dont block on + * readpage (we've kicked readahead so there + * will be asynchronous progress): + */ + if (flags & SPLICE_F_NONBLOCK) + break; + + /* + * page didn't exist, allocate one + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_mask(mapping)); + if (unlikely(error)) { + page_cache_release(page); + break; + } - if (PageUptodate(page)) - unlock_page(page); - else { - int error = mapping->a_ops->readpage(in, page); + goto readpage; + } + + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + lock_page(page); + + /* + * page was truncated, stop here. if this isn't the + * first page, we'll just complete what we already + * added + */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + break; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + +readpage: + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { page_cache_release(page); + if (error == AOP_TRUNCATED_PAGE) + goto find_page; break; } } +fill_it: pages[i] = page; } if (i) return move_to_pipe(pipe, pages, i, offset, len, flags); - return 0; + return error; } /** -- cgit v1.2.3 From 9aeedfc4712ed58d9f7ae41596185c72b8dc97e8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:53:10 +0200 Subject: [PATCH] get rid of the PIPE_*() macros get rid of the PIPE_*() macros. Scripted transformation. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/fifo.c | 48 ++++++++++++++--------------- fs/pipe.c | 104 +++++++++++++++++++++++++++++++------------------------------- 2 files changed, 76 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index b16e2f597d6..2c27f56d730 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -28,14 +28,14 @@ static void wait_for_partner(struct inode* inode, unsigned int *cnt) static void wake_up_partner(struct inode* inode) { - wake_up_interruptible(PIPE_WAIT(*inode)); + wake_up_interruptible(&inode->i_pipe->wait); } static int fifo_open(struct inode *inode, struct file *filp) { int ret; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); if (!inode->i_pipe) { ret = -ENOMEM; inode->i_pipe = alloc_pipe_info(inode); @@ -55,18 +55,18 @@ static int fifo_open(struct inode *inode, struct file *filp) * opened, even when there is no process writing the FIFO. 
*/ filp->f_op = &read_fifo_fops; - PIPE_RCOUNTER(*inode)++; - if (PIPE_READERS(*inode)++ == 0) + inode->i_pipe->r_counter++; + if (inode->i_pipe->readers++ == 0) wake_up_partner(inode); - if (!PIPE_WRITERS(*inode)) { + if (!inode->i_pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress POLLHUP until we have * seen a writer */ - filp->f_version = PIPE_WCOUNTER(*inode); + filp->f_version = inode->i_pipe->w_counter; } else { - wait_for_partner(inode, &PIPE_WCOUNTER(*inode)); + wait_for_partner(inode, &inode->i_pipe->w_counter); if(signal_pending(current)) goto err_rd; } @@ -80,16 +80,16 @@ static int fifo_open(struct inode *inode, struct file *filp) * errno=ENXIO when there is no process reading the FIFO. */ ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !PIPE_READERS(*inode)) + if ((filp->f_flags & O_NONBLOCK) && !inode->i_pipe->readers) goto err; filp->f_op = &write_fifo_fops; - PIPE_WCOUNTER(*inode)++; - if (!PIPE_WRITERS(*inode)++) + inode->i_pipe->w_counter++; + if (!inode->i_pipe->writers++) wake_up_partner(inode); - if (!PIPE_READERS(*inode)) { - wait_for_partner(inode, &PIPE_RCOUNTER(*inode)); + if (!inode->i_pipe->readers) { + wait_for_partner(inode, &inode->i_pipe->r_counter); if (signal_pending(current)) goto err_wr; } @@ -104,11 +104,11 @@ static int fifo_open(struct inode *inode, struct file *filp) */ filp->f_op = &rdwr_fifo_fops; - PIPE_READERS(*inode)++; - PIPE_WRITERS(*inode)++; - PIPE_RCOUNTER(*inode)++; - PIPE_WCOUNTER(*inode)++; - if (PIPE_READERS(*inode) == 1 || PIPE_WRITERS(*inode) == 1) + inode->i_pipe->readers++; + inode->i_pipe->writers++; + inode->i_pipe->r_counter++; + inode->i_pipe->w_counter++; + if (inode->i_pipe->readers == 1 || inode->i_pipe->writers == 1) wake_up_partner(inode); break; @@ -118,27 +118,27 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; err_rd: - if (!--PIPE_READERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--inode->i_pipe->readers) + wake_up_interruptible(&inode->i_pipe->wait); ret = -ERESTARTSYS; goto err; err_wr: - if (!--PIPE_WRITERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--inode->i_pipe->writers) + wake_up_interruptible(&inode->i_pipe->wait); ret = -ERESTARTSYS; goto err; err: - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) + if (!inode->i_pipe->readers && !inode->i_pipe->writers) free_pipe_info(inode); err_nocleanup: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/fs/pipe.c b/fs/pipe.c index 036536f072c..0602fc9f7eb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -158,7 +158,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); info = inode->i_pipe; for (;;) { int bufs = info->nrbufs; @@ -202,9 +202,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } if (bufs) /* More to do? */ continue; - if (!PIPE_WRITERS(*inode)) + if (!inode->i_pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!inode->i_pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. 
* But if a writer sleeps in kernel space, then @@ -222,16 +222,16 @@ pipe_readv(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible_sync(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); } pipe_wait(inode->i_pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); @@ -264,10 +264,10 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); info = inode->i_pipe; - if (!PIPE_READERS(*inode)) { + if (!inode->i_pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; @@ -306,7 +306,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + if (!inode->i_pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; @@ -367,19 +367,19 @@ pipe_writev(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible_sync(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; + inode->i_pipe->waiting_writers++; pipe_wait(inode->i_pipe); - PIPE_WAITING_WRITERS(*inode)--; + inode->i_pipe->waiting_writers--; } out: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) file_update_time(filp); @@ -416,7 +416,7 @@ pipe_ioctl(struct inode *pino, struct file *filp, switch (cmd) { case FIONREAD: - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); info = inode->i_pipe; count = 0; buf = info->curbuf; @@ -425,7 +425,7 @@ pipe_ioctl(struct inode *pino, struct file *filp, count += info->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -441,14 +441,14 @@ pipe_poll(struct file *filp, poll_table *wait) struct pipe_inode_info *info = inode->i_pipe; int nrbufs; - poll_wait(filp, PIPE_WAIT(*inode), wait); + poll_wait(filp, &inode->i_pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ nrbufs = info->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; - if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode)) + if (!inode->i_pipe->writers && filp->f_version != inode->i_pipe->w_counter) mask |= POLLHUP; } @@ -458,7 +458,7 @@ pipe_poll(struct file *filp, poll_table *wait) * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). 
*/ - if (!PIPE_READERS(*inode)) + if (!inode->i_pipe->readers) mask |= POLLERR; } @@ -468,17 +468,17 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode) -= decr; - PIPE_WRITERS(*inode) -= decw; - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { + mutex_lock(&inode->i_mutex); + inode->i_pipe->readers -= decr; + inode->i_pipe->writers -= decw; + if (!inode->i_pipe->readers && !inode->i_pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; } @@ -489,9 +489,9 @@ pipe_read_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -506,9 +506,9 @@ pipe_write_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -523,14 +523,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); if (retval >= 0) - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -569,9 +569,9 @@ pipe_read_open(struct inode *inode, struct file *filp) { /* We could have perhaps used atomic_t, but this and friends below are the only places. So it doesn't seem worthwhile. 
*/ - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->readers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -579,9 +579,9 @@ pipe_read_open(struct inode *inode, struct file *filp) static int pipe_write_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -589,12 +589,12 @@ pipe_write_open(struct inode *inode, struct file *filp) static int pipe_rdwr_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); if (filp->f_mode & FMODE_READ) - PIPE_READERS(*inode)++; + inode->i_pipe->readers++; if (filp->f_mode & FMODE_WRITE) - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -731,7 +731,7 @@ static struct inode * get_pipe_inode(void) if (!inode->i_pipe) goto fail_iput; - PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; + inode->i_pipe->readers = inode->i_pipe->writers = 1; inode->i_fop = &rdwr_pipe_fops; /* -- cgit v1.2.3 From 923f4f23940d2361e8d5c4245982163a8e9d1c91 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:53:33 +0200 Subject: [PATCH] pipe.c/fifo.c code cleanups more code cleanups after the macro conversion: - standardize on 'struct pipe_inode_info *pipe' variable names - introduce 'pipe' temporaries to reduce mass inode->i_pipe dereferencing Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/fifo.c | 49 ++++++++++--------- fs/pipe.c | 163 ++++++++++++++++++++++++++++++++------------------------------ 2 files changed, 111 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index 2c27f56d730..49035b174b4 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -33,14 +33,17 @@ static void wake_up_partner(struct inode* inode) static int fifo_open(struct inode *inode, struct file *filp) { + struct pipe_inode_info *pipe; int ret; mutex_lock(&inode->i_mutex); - if (!inode->i_pipe) { + pipe = inode->i_pipe; + if (!pipe) { ret = -ENOMEM; - inode->i_pipe = alloc_pipe_info(inode); - if (!inode->i_pipe) + pipe = alloc_pipe_info(inode); + if (!pipe) goto err_nocleanup; + inode->i_pipe = pipe; } filp->f_version = 0; @@ -55,18 +58,18 @@ static int fifo_open(struct inode *inode, struct file *filp) * opened, even when there is no process writing the FIFO. */ filp->f_op = &read_fifo_fops; - inode->i_pipe->r_counter++; - if (inode->i_pipe->readers++ == 0) + pipe->r_counter++; + if (pipe->readers++ == 0) wake_up_partner(inode); - if (!inode->i_pipe->writers) { + if (!pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress POLLHUP until we have * seen a writer */ - filp->f_version = inode->i_pipe->w_counter; + filp->f_version = pipe->w_counter; } else { - wait_for_partner(inode, &inode->i_pipe->w_counter); + wait_for_partner(inode, &pipe->w_counter); if(signal_pending(current)) goto err_rd; } @@ -80,16 +83,16 @@ static int fifo_open(struct inode *inode, struct file *filp) * errno=ENXIO when there is no process reading the FIFO. 
*/ ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !inode->i_pipe->readers) + if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; filp->f_op = &write_fifo_fops; - inode->i_pipe->w_counter++; - if (!inode->i_pipe->writers++) + pipe->w_counter++; + if (!pipe->writers++) wake_up_partner(inode); - if (!inode->i_pipe->readers) { - wait_for_partner(inode, &inode->i_pipe->r_counter); + if (!pipe->readers) { + wait_for_partner(inode, &pipe->r_counter); if (signal_pending(current)) goto err_wr; } @@ -104,11 +107,11 @@ static int fifo_open(struct inode *inode, struct file *filp) */ filp->f_op = &rdwr_fifo_fops; - inode->i_pipe->readers++; - inode->i_pipe->writers++; - inode->i_pipe->r_counter++; - inode->i_pipe->w_counter++; - if (inode->i_pipe->readers == 1 || inode->i_pipe->writers == 1) + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) wake_up_partner(inode); break; @@ -122,19 +125,19 @@ static int fifo_open(struct inode *inode, struct file *filp) return 0; err_rd: - if (!--inode->i_pipe->readers) - wake_up_interruptible(&inode->i_pipe->wait); + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err_wr: - if (!--inode->i_pipe->writers) - wake_up_interruptible(&inode->i_pipe->wait); + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err: - if (!inode->i_pipe->readers && !inode->i_pipe->writers) + if (!pipe->readers && !pipe->writers) free_pipe_info(inode); err_nocleanup: diff --git a/fs/pipe.c b/fs/pipe.c index 0602fc9f7eb..b941e1951ea 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -93,7 +93,7 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } -static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; @@ -104,8 +104,8 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff * temporary page, let's keep track of it as a one-deep * allocation cache */ - if (page_count(page) == 1 && !info->tmp_page) { - info->tmp_page = page; + if (page_count(page) == 1 && !pipe->tmp_page) { + pipe->tmp_page = page; return; } @@ -115,17 +115,17 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff page_cache_release(page); } -static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) +static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { kunmap(buf->page); } -static int anon_pipe_buf_steal(struct pipe_inode_info *info, +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { buf->flags |= PIPE_BUF_FLAG_STOLEN; @@ -145,7 +145,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int do_wakeup; ssize_t ret; struct iovec *iov = (struct iovec *)_iov; @@ -159,12 +159,12 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; mutex_lock(&inode->i_mutex); - info = inode->i_pipe; + pipe = inode->i_pipe; 
for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; void *addr; size_t chars = buf->len; @@ -173,14 +173,14 @@ pipe_readv(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); if (IS_ERR(addr)) { if (!ret) ret = PTR_ERR(addr); break; } error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); if (unlikely(error)) { if (!ret) ret = -EFAULT; break; @@ -190,10 +190,10 @@ pipe_readv(struct file *filp, const struct iovec *_iov, buf->len -= chars; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } total_len -= chars; @@ -202,9 +202,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } if (bufs) /* More to do? */ continue; - if (!inode->i_pipe->writers) + if (!pipe->writers) break; - if (!inode->i_pipe->waiting_writers) { + if (!pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. * But if a writer sleeps in kernel space, then @@ -222,16 +222,16 @@ pipe_readv(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - pipe_wait(inode->i_pipe); + pipe_wait(pipe); } mutex_unlock(&inode->i_mutex); /* Signal writers asynchronously that there is more room. 
*/ if (do_wakeup) { - wake_up_interruptible(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); @@ -250,7 +250,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; ssize_t ret; int do_wakeup; struct iovec *iov = (struct iovec *)_iov; @@ -265,9 +265,9 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; mutex_lock(&inode->i_mutex); - info = inode->i_pipe; + pipe = inode->i_pipe; - if (!inode->i_pipe->readers) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; @@ -275,23 +275,23 @@ pipe_writev(struct file *filp, const struct iovec *_iov, /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ - if (info->nrbufs && chars != 0) { - int lastbuf = (info->curbuf + info->nrbufs - 1) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + lastbuf; + if (pipe->nrbufs && chars != 0) { + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + lastbuf; struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { void *addr; int error; - addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); if (IS_ERR(addr)) { error = PTR_ERR(addr); goto out; } error = pipe_iov_copy_from_user(offset + addr, iov, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); ret = error; do_wakeup = 1; if (error) @@ -306,16 +306,16 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; - if (!inode->i_pipe->readers) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + newbuf; - struct page *page = info->tmp_page; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pipe->tmp_page; int error; if (!page) { @@ -324,7 +324,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, ret = ret ? : -ENOMEM; break; } - info->tmp_page = page; + pipe->tmp_page = page; } /* Always wakeup, even if the copy fails. 
Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to @@ -349,8 +349,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = chars; - info->nrbufs = ++bufs; - info->tmp_page = NULL; + pipe->nrbufs = ++bufs; + pipe->tmp_page = NULL; total_len -= chars; if (!total_len) @@ -367,19 +367,19 @@ pipe_writev(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - inode->i_pipe->waiting_writers++; - pipe_wait(inode->i_pipe); - inode->i_pipe->waiting_writers--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } out: mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) file_update_time(filp); @@ -411,21 +411,22 @@ pipe_ioctl(struct inode *pino, struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int count, buf, nrbufs; switch (cmd) { case FIONREAD: mutex_lock(&inode->i_mutex); - info = inode->i_pipe; + pipe = inode->i_pipe; count = 0; - buf = info->curbuf; - nrbufs = info->nrbufs; + buf = pipe->curbuf; + nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { - count += info->bufs[buf].len; + count += pipe->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } mutex_unlock(&inode->i_mutex); + return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -438,17 +439,17 @@ pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info = inode->i_pipe; + struct pipe_inode_info *pipe = inode->i_pipe; int nrbufs; - poll_wait(filp, &inode->i_pipe->wait, wait); + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ - nrbufs = info->nrbufs; + nrbufs = pipe->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; - if (!inode->i_pipe->writers && filp->f_version != inode->i_pipe->w_counter) + if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= POLLHUP; } @@ -458,7 +459,7 @@ pipe_poll(struct file *filp, poll_table *wait) * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). 
*/ - if (!inode->i_pipe->readers) + if (!pipe->readers) mask |= POLLERR; } @@ -468,15 +469,18 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { + struct pipe_inode_info *pipe; + mutex_lock(&inode->i_mutex); - inode->i_pipe->readers -= decr; - inode->i_pipe->writers -= decw; - if (!inode->i_pipe->readers && !inode->i_pipe->writers) { + pipe = inode->i_pipe; + pipe->readers -= decr; + pipe->writers -= decw; + if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); - kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } mutex_unlock(&inode->i_mutex); @@ -679,30 +683,30 @@ static struct file_operations rdwr_pipe_fops = { struct pipe_inode_info * alloc_pipe_info(struct inode *inode) { - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; - info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); - if (info) { - init_waitqueue_head(&info->wait); - info->r_counter = info->w_counter = 1; - info->inode = inode; + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (pipe) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; } - return info; + return pipe; } -void __free_pipe_info(struct pipe_inode_info *info) +void __free_pipe_info(struct pipe_inode_info *pipe) { int i; for (i = 0; i < PIPE_BUFFERS; i++) { - struct pipe_buffer *buf = info->bufs + i; + struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) - buf->ops->release(info, buf); + buf->ops->release(pipe, buf); } - if (info->tmp_page) - __free_page(info->tmp_page); - kfree(info); + if (pipe->tmp_page) + __free_page(pipe->tmp_page); + kfree(pipe); } void free_pipe_info(struct inode *inode) @@ -723,15 +727,17 @@ static struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct pipe_inode_info *pipe; if (!inode) goto fail_inode; - inode->i_pipe = alloc_pipe_info(inode); - if (!inode->i_pipe) + pipe = alloc_pipe_info(inode); + if (!pipe) goto fail_iput; + inode->i_pipe = pipe; - inode->i_pipe->readers = inode->i_pipe->writers = 1; + pipe->readers = pipe->writers = 1; inode->i_fop = &rdwr_pipe_fops; /* @@ -746,6 +752,7 @@ static struct inode * get_pipe_inode(void) inode->i_gid = current->fsgid; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_blksize = PAGE_SIZE; + return inode; fail_iput: -- cgit v1.2.3 From 6f767b0425f5902e4817648632230b512e81c963 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:53:56 +0200 Subject: [PATCH] splice: speedups and optimizations - Kill the local variables that cache ->nrbufs, they just take up space. - Only set do_wakeup for a real pipe. This is a big win for direct splicing. - Kill i_mutex lock around ->f_pos update, regular io paths don't do this either. 
Signed-off-by: Jens Axboe --- fs/splice.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index e30743c2c06..36bc262dfbd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -150,8 +150,6 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, mutex_lock(&pipe->inode->i_mutex); for (;;) { - int bufs; - if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) @@ -159,9 +157,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; } - bufs = pipe->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < PIPE_BUFFERS) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pages[i++]; unsigned long this_len; @@ -174,8 +171,9 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, buf->offset = offset; buf->len = this_len; buf->ops = &page_cache_pipe_buf_ops; - pipe->nrbufs = ++bufs; - do_wakeup = 1; + pipe->nrbufs++; + if (pipe->inode) + do_wakeup = 1; ret += this_len; len -= this_len; @@ -184,7 +182,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; if (!len) break; - if (bufs < PIPE_BUFFERS) + if (pipe->nrbufs < PIPE_BUFFERS) continue; break; @@ -581,11 +579,8 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, mutex_lock(&pipe->inode->i_mutex); for (;;) { - int bufs = pipe->nrbufs; - - if (bufs) { - int curbuf = pipe->curbuf; - struct pipe_buffer *buf = pipe->bufs + curbuf; + if (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; @@ -606,10 +601,10 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); - pipe->curbuf = curbuf; - pipe->nrbufs = --bufs; - do_wakeup = 1; + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; } sd.pos += sd.len; @@ -618,7 +613,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, break; } - if (bufs) + if (pipe->nrbufs) continue; if (!pipe->writers) break; @@ -660,9 +655,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_lock(&out->f_mapping->host->i_mutex); out->f_pos = sd.pos; - mutex_unlock(&out->f_mapping->host->i_mutex); return ret; } -- cgit v1.2.3 From 49570e9b29a3d78950b5eba6b73bdcca955f0877 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:56:09 +0200 Subject: [PATCH] splice: unlikely() optimizations Also corrects a few comments. Patch mainly from Ingo, changes by me. 
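For readers not used to the annotation: unlikely() is only a branch-prediction and code-layout hint built on GCC's __builtin_expect(). A minimal standalone sketch of the idea (the kernel's real definition lives in its compiler headers):

    #include <stdio.h>

    /* stand-ins for the kernel hints: tell GCC which way a branch usually goes */
    #define likely(x)       __builtin_expect(!!(x), 1)
    #define unlikely(x)     __builtin_expect(!!(x), 0)

    static int verify(const void *op)
    {
            if (unlikely(!op))      /* rare error path, kept out of the hot path */
                    return -1;
            return 0;
    }

    int main(void)
    {
            printf("%d %d\n", verify(NULL), verify(&verify));
            return 0;
    }

Annotating the EBADF/EINVAL checks in do_splice_to() and do_splice_from() this way keeps the rarely taken error branches out of the straight-line splice path.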
Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/read_write.c | 2 +- fs/splice.c | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 6256ca81a71..5bc0e9234f9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -202,7 +202,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count goto Einval; inode = file->f_dentry->d_inode; - if (inode->i_flock && MANDATORY_LOCK(inode)) { + if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { int retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff --git a/fs/splice.c b/fs/splice.c index 36bc262dfbd..77b026baff7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -346,7 +346,6 @@ fill_it: * @flags: splice modifier flags * * Will read pages from given file and fill them into a pipe. - * */ ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -690,7 +689,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, mutex_lock(&inode->i_mutex); err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); + OSYNC_METADATA|OSYNC_DATA); mutex_unlock(&inode->i_mutex); if (err) @@ -730,10 +729,10 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t pos; int ret; - if (!out->f_op || !out->f_op->splice_write) + if (unlikely(!out->f_op || !out->f_op->splice_write)) return -EINVAL; - if (!(out->f_mode & FMODE_WRITE)) + if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; pos = out->f_pos; @@ -754,10 +753,10 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, loff_t pos, isize, left; int ret; - if (!in->f_op || !in->f_op->splice_read) + if (unlikely(!in->f_op || !in->f_op->splice_read)) return -EINVAL; - if (!(in->f_mode & FMODE_READ)) + if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; pos = in->f_pos; @@ -771,7 +770,7 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, return 0; left = isize - in->f_pos; - if (left < len) + if (unlikely(left < len)) len = left; return in->f_op->splice_read(in, pipe, len, flags); @@ -799,7 +798,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, * 'out' and transfer the wanted data from 'in' to 'out' through that */ pipe = current->splice_pipe; - if (!pipe) { + if (unlikely(!pipe)) { pipe = alloc_pipe_info(NULL); if (!pipe) return -ENOMEM; -- cgit v1.2.3 From c2058e0611c2fb4a00e63db61ef3a638cf760a98 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:56:34 +0200 Subject: [PATCH] splice: add Ingo as addition copyright holder The comment is also somewhat out of date, correct that as well. Signed-off-by: Jens Axboe --- fs/splice.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 77b026baff7..48b4382cc11 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -9,11 +9,12 @@ * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by - * Jens to support splicing to files and fixing the initial implementation - * bugs. + * Jens to support splicing to files, network, direct splicing, etc and + * fixing lots of bugs. 
* - * Copyright (C) 2005 Jens Axboe - * Copyright (C) 2005 Linus Torvalds + * Copyright (C) 2005-2006 Jens Axboe + * Copyright (C) 2005-2006 Linus Torvalds + * Copyright (C) 2006 Ingo Molnar * */ #include -- cgit v1.2.3 From 73d62d83ec3627782ba6f55defc76f3ffbef46ee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:57:21 +0200 Subject: [PATCH] splice: comment styles - capitalize consistently - end sentences in one way or another - update comment text to match the implementation Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/splice.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 48b4382cc11..e50a460239d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -92,7 +92,7 @@ static void *page_cache_pipe_buf_map(struct file *file, /* * Page got truncated/unhashed. This will cause a 0-byte - * splice, if this is the first page + * splice, if this is the first page. */ if (!page->mapping) { err = -ENODATA; @@ -100,7 +100,7 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * uh oh, read-error from disk + * Uh oh, read-error from disk. */ if (!PageUptodate(page)) { err = -EIO; @@ -108,7 +108,7 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * page is ok afterall, fall through to mapping + * Page is ok afterall, fall through to mapping. */ unlock_page(page); } @@ -249,7 +249,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range. however, don't call into + * Initiate read-ahead on this page range. however, don't call into * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ @@ -257,7 +257,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, do_page_cache_readahead(mapping, in, index, nr_pages); /* - * now fill in the holes + * Now fill in the holes: */ error = 0; for (i = 0; i < nr_pages; i++, index++) { @@ -396,10 +396,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, int more; /* - * sub-optimal, but we are limited by the pipe ->map. we don't + * Sub-optimal, but we are limited by the pipe ->map. We don't * need a kmap'ed buffer here, we just want to make sure we * have the page pinned if the pipe page originates from the - * page cache + * page cache. */ ptr = buf->ops->map(file, info, buf); if (IS_ERR(ptr)) @@ -460,7 +460,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, offset = sd->pos & ~PAGE_CACHE_MASK; /* - * reuse buf page, if SPLICE_F_MOVE is set + * Reuse buf page, if SPLICE_F_MOVE is set. */ if (sd->flags & SPLICE_F_MOVE) { /* @@ -501,7 +501,7 @@ find_page: if (!PageUptodate(page)) { /* - * page got invalidated, repeat + * Page got invalidated, repeat. */ if (!page->mapping) { unlock_page(page); @@ -598,6 +598,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, ret += sd.len; buf->offset += sd.len; buf->len -= sd.len; + if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); @@ -681,7 +682,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); /* - * if file or inode is SYNC and we actually wrote some data, sync it + * If file or inode is SYNC and we actually wrote some data, sync it. 
*/ if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) && ret > 0) { @@ -815,7 +816,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, } /* - * do the splice + * Do the splice. */ ret = 0; bytes = 0; -- cgit v1.2.3 From 341b446bc5aa36d1d5b8159c1e66716b5d89024d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:57:45 +0200 Subject: [PATCH] another round of fs/pipe.c cleanups make pipe.c a bit more readable and hackable. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/pipe.c | 76 ++++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index b941e1951ea..e984beb93a0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -44,7 +44,8 @@ void pipe_wait(struct pipe_inode_info *pipe) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); + prepare_to_wait(&pipe->wait, &wait, + TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); schedule(); @@ -93,7 +94,8 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } -static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; @@ -102,25 +104,22 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buff /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep - * allocation cache + * allocation cache. (Otherwise just release our reference to it) */ - if (page_count(page) == 1 && !pipe->tmp_page) { + if (page_count(page) == 1 && !pipe->tmp_page) pipe->tmp_page = page; - return; - } - - /* - * Otherwise just release our reference to it - */ - page_cache_release(page); + else + page_cache_release(page); } -static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, struct pipe_buffer *buf) +static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { kunmap(buf->page); } @@ -182,7 +181,8 @@ pipe_readv(struct file *filp, const struct iovec *_iov, error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); ops->unmap(pipe, buf); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -218,7 +218,8 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { @@ -228,7 +229,8 @@ pipe_readv(struct file *filp, const struct iovec *_iov, pipe_wait(pipe); } mutex_unlock(&inode->i_mutex); - /* Signal writers asynchronously that there is more room. */ + + /* Signal writers asynchronously that there is more room. 
*/ if (do_wakeup) { wake_up_interruptible(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); @@ -242,6 +244,7 @@ static ssize_t pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = count }; + return pipe_readv(filp, &iov, 1, ppos); } @@ -276,10 +279,12 @@ pipe_writev(struct file *filp, const struct iovec *_iov, /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (pipe->nrbufs && chars != 0) { - int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (PIPE_BUFFERS-1); + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & + (PIPE_BUFFERS-1); struct pipe_buffer *buf = pipe->bufs + lastbuf; struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; + if (ops->can_merge && offset + chars <= PAGE_SIZE) { void *addr; int error; @@ -306,9 +311,11 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); - if (!ret) ret = -EPIPE; + if (!ret) + ret = -EPIPE; break; } bufs = pipe->nrbufs; @@ -326,7 +333,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, } pipe->tmp_page = page; } - /* Always wakeup, even if the copy fails. Otherwise + /* Always wake up, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to * syscall merging. * FIXME! Is this really true? @@ -339,7 +346,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, error = pipe_iov_copy_from_user(kmap(page), iov, chars); kunmap(page); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -359,11 +367,13 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (bufs < PIPE_BUFFERS) continue; if (filp->f_flags & O_NONBLOCK) { - if (!ret) ret = -EAGAIN; + if (!ret) + ret = -EAGAIN; break; } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { @@ -391,6 +401,7 @@ pipe_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + return pipe_writev(filp, &iov, 1, ppos); } @@ -401,7 +412,8 @@ bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) } static ssize_t -bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) +bad_pipe_w(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) { return -EBADF; } @@ -475,6 +487,7 @@ pipe_release(struct inode *inode, int decr, int decw) pipe = inode->i_pipe; pipe->readers -= decr; pipe->writers -= decw; + if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { @@ -525,14 +538,15 @@ static int pipe_rdwr_fasync(int fd, struct file *filp, int on) { struct inode *inode = filp->f_dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; int retval; mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if (retval >= 0) - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); mutex_unlock(&inode->i_mutex); @@ -720,6 +734,7 @@ static int pipefs_delete_dentry(struct dentry *dentry) { return 1; } + static struct dentry_operations pipefs_dentry_operations = { .d_delete = pipefs_delete_dentry, }; @@ -757,6 +772,7 @@ static struct inode * get_pipe_inode(void) 
fail_iput: iput(inode); + fail_inode: return NULL; } @@ -769,7 +785,7 @@ int do_pipe(int *fd) struct inode * inode; struct file *f1, *f2; int error; - int i,j; + int i, j; error = -ENFILE; f1 = get_empty_filp(); @@ -802,6 +818,7 @@ int do_pipe(int *fd) dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this); if (!dentry) goto close_f12_inode_i_j; + dentry->d_op = &pipefs_dentry_operations; d_add(dentry, inode); f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt)); @@ -825,6 +842,7 @@ int do_pipe(int *fd) fd_install(j, f2); fd[0] = i; fd[1] = j; + return 0; close_f12_inode_i_j: @@ -849,8 +867,9 @@ no_files: * d_name - pipe: will go nicely and kill the special-casing in procfs. */ -static struct super_block *pipefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static struct super_block * +pipefs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); } @@ -864,6 +883,7 @@ static struct file_system_type pipe_fs_type = { static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); + if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { -- cgit v1.2.3 From 29ff2db55196717e2e67e0f04adc833ee7edd491 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Apr 2006 22:52:46 -0700 Subject: [PATCH] select() warning fixes fs/select.c: In function `core_sys_select': fs/select.c:339: warning: assignment from incompatible pointer type fs/select.c:376: warning: comparison of distinct pointer types lacks a cast By using a void* we can remove lots of casts rather than adding more. Cc: Jes Sorensen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index 071660fa7b0..fce0fd1bb1d 100644 --- a/fs/select.c +++ b/fs/select.c @@ -310,7 +310,7 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout) { fd_set_bits fds; - char *bits; + void *bits; int ret, size, max_fdset; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ @@ -341,12 +341,12 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, bits = kmalloc(6 * size, GFP_KERNEL); if (!bits) goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || -- cgit v1.2.3 From 7b04d7170e9af805cac19f97b28fff10db897893 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 10 Apr 2006 22:53:27 -0700 Subject: [PATCH] Add GFP_NOWAIT Introduce GFP_NOWAIT, as an alias for GFP_ATOMIC & ~__GFP_HIGH. This also changes XFS, which is the only in-tree user of this idiom that I could find. The XFS piece is compile-tested only. 
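A minimal sketch of what the new flag amounts to, going by the description above (illustrative only; the real definition belongs in the gfp header, and the call site shown is the XFS one from the diff below):

    /* atomic allocation, but without dipping into the emergency reserves */
    #define GFP_NOWAIT      (GFP_ATOMIC & ~__GFP_HIGH)

    aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);

The change is purely about readability: the intent is no longer hidden behind a bitmask expression at every call site.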
Signed-off-by: Jeff Dike Acked-by: Nathan Scott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 9fb0312665c..26fed0756f0 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -182,7 +182,7 @@ free_address( { a_list_t *aentry; - aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH); + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); if (likely(aentry)) { spin_lock(&as_lock); aentry->next = as_free_head; -- cgit v1.2.3 From 5246d0503130fa58904c8beb987fcf93b96d8ab6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Apr 2006 22:53:57 -0700 Subject: [PATCH] sync_file_range(): use unsigned for flags Ulrich suggested that the `flags' arg to sync_file_range() become unsigned. Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index 8616006d209..aab5ffe77e9 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -61,7 +61,7 @@ * will be available after a crash. */ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, - int flags) + unsigned int flags) { int ret; struct file *file; @@ -126,7 +126,7 @@ out: * `endbyte' is inclusive */ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, - int flags) + unsigned int flags) { int ret; struct address_space *mapping; -- cgit v1.2.3 From f6422f17d3a480f21917a3895e2a46b968f56a08 Mon Sep 17 00:00:00 2001 From: Herbert Poetzl Date: Mon, 10 Apr 2006 22:54:03 -0700 Subject: [PATCH] vfs: propagate mnt_flags into do_loopback/vfsmount The mnt_flags are propagated into do_loopback(), so that they can be stored with the vfsmount Signed-off-by: Herbert Poetzl Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index bf478addb85..2c5f1f80bdc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -899,11 +899,13 @@ static int do_change_type(struct nameidata *nd, int flag) /* * do loopback mount. */ -static int do_loopback(struct nameidata *nd, char *old_name, int recurse) +static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags) { struct nameidata old_nd; struct vfsmount *mnt = NULL; + int recurse = flags & MS_REC; int err = mount_is_safe(nd); + if (err) return err; if (!old_name || !*old_name) @@ -937,6 +939,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse) spin_unlock(&vfsmount_lock); release_mounts(&umount_list); } + mnt->mnt_flags = mnt_flags; out: up_write(&namespace_sem); @@ -1350,7 +1353,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&nd, dev_name, flags, mnt_flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&nd, flags); else if (flags & MS_MOVE) -- cgit v1.2.3 From 00fbc6dfe7c4487f812829bff79c3121c8fd3bca Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Mon, 10 Apr 2006 22:54:06 -0700 Subject: [PATCH] 9p: handle sget() failure Handle a failing sget() in v9fs_get_sb(). 
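The shape of the fix is the usual IS_ERR()/goto unwind around sget(); a condensed sketch of the error path (kernel context, labels as in the patch below):

    sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
    if (IS_ERR(sb))
            goto out_close_session;         /* tear down the session we opened */
    ...
    out_close_session:
            v9fs_session_close(v9ses);
    out_free_session:
            kfree(v9ses);
            return sb;                      /* sb holds the ERR_PTR in the failure paths */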
Signed-off-by: Christoph Hellwig Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_super.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index b0a0ae509c0..61c599b4a1e 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -127,12 +127,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); - kfree(v9ses); - return ERR_PTR(newfid); + sb = ERR_PTR(newfid); + goto out_free_session; } sb = sget(fs_type, NULL, v9fs_set_super, v9ses); - + if (IS_ERR(sb)) + goto out_close_session; v9fs_fill_super(sb, v9ses, flags); inode = v9fs_get_inode(sb, S_IFDIR | mode); @@ -185,6 +186,12 @@ static struct super_block *v9fs_get_sb(struct file_system_type return sb; +out_close_session: + v9fs_session_close(v9ses); +out_free_session: + kfree(v9ses); + return sb; + put_back_sb: /* deactivate_super calls v9fs_kill_super which will frees the rest */ up_write(&sb->s_umount); -- cgit v1.2.3 From b04eb6aa08ecc3e24df2f78ebc486011ebd74feb Mon Sep 17 00:00:00 2001 From: Mitchell Blank Jr Date: Mon, 10 Apr 2006 22:54:08 -0700 Subject: [PATCH] select: don't overflow if (SELECT_STACK_ALLOC % sizeof(long) != 0) If SELECT_STACK_ALLOC is not a multiple of sizeof(long) then stack_fds[] would be shorter than SELECT_STACK_ALLOC bytes and could overflow later in the function. Fixed by simply rearranging the test later to work on sizeof(stack_fds) Currently SELECT_STACK_ALLOC is 256 so this doesn't happen, but it's nasty to have things like this hidden in the code. What if later someone decides to change SELECT_STACK_ALLOC to 300? Signed-off-by: Mitchell Blank Jr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index fce0fd1bb1d..a8109baa5e4 100644 --- a/fs/select.c +++ b/fs/select.c @@ -311,7 +311,8 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, { fd_set_bits fds; void *bits; - int ret, size, max_fdset; + int ret, max_fdset; + unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -333,14 +334,15 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, * since we used fdset we need to allocate memory in units of * long-words. */ - ret = -ENOMEM; size = FDS_BYTES(n); - if (6*size < SELECT_STACK_ALLOC) - bits = stack_fds; - else + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; + if (!bits) + goto out_nofds; + } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; -- cgit v1.2.3 From 80e8ff634169be3fc2ac48f258cc7638e898cd46 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 10 Apr 2006 22:54:10 -0700 Subject: [PATCH] kdump proc vmcore size oveflow fix A couple of /proc/vmcore data structures overflow with 32bit systems having memory more than 4G. This patch fixes those. 
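The failure mode is plain truncation: on a 32-bit kernel size_t is 32 bits wide, so a byte offset or length past 4G silently wraps. A standalone illustration of the wrap (ordinary userspace C with arbitrary numbers, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t start = 5ULL << 30;		/* an offset 5G into the dump */
	uint32_t narrow = (uint32_t)start;	/* what a 32-bit counter keeps */

	printf("64-bit offset: %llu\n", (unsigned long long)start);
	printf("32-bit offset: %u (wrapped past 4G)\n", narrow);
	return 0;
}

Widening start and nr_bytes to u64, as the hunk below does, keeps the arithmetic exact regardless of word size.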
Signed-off-by: Ken'ichi Ohmichi Signed-off-by: Vivek Goyal Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7efa73d44c9..20d4b2237fc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { ssize_t acc = 0, tmp; - size_t tsz, nr_bytes; - u64 start; + size_t tsz; + u64 start, nr_bytes; struct vmcore *curr_m = NULL; if (buflen == 0 || *fpos >= vmcore_size) -- cgit v1.2.3 From 2395140ee2bffe38b1c8a59318f62882b797f5e6 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Mon, 10 Apr 2006 22:54:12 -0700 Subject: [PATCH] uniform POLLRDHUP handling between epoll and poll/select As reported by Michael Kerrisk, POLLRDHUP handling was not consistent between epoll and poll/select, since in epoll it was unmaskeable. This patch brings uniformity in POLLRDHUP handling. Signed-off-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 242fe1a66ce..1b4491cdd11 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -599,7 +599,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_insert(ep, &epds, tfile, fd); } else @@ -613,7 +613,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_modify(ep, epi, &epds); } else error = -ENOENT; -- cgit v1.2.3 From f5e902817fee1589badca1284f49eecc0ef0c200 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 10 Apr 2006 22:54:16 -0700 Subject: [PATCH] process accounting: take original leader's start_time in non-leader exec The only record we have of the real-time age of a process, regardless of execs it's done, is start_time. When a non-leader thread exec, the original start_time of the process is lost. Things looking at the real-time age of the process are fooled, for example the process accounting record when the process finally dies. This change makes the oldest start_time stick around with the process after a non-leader exec. This way the association between PID and start_time is kept constant, which seems correct to me. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 4d38ad0b70d..3234a0c32d5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -678,6 +678,18 @@ static int de_thread(struct task_struct *tsk) while (leader->exit_state != EXIT_ZOMBIE) yield(); + /* + * The only record we have of the real-time age of a + * process, regardless of execs it's done, is start_time. + * All the past CPU time is accumulated in signal_struct + * from sister threads now dead. But in this non-leader + * exec, nothing survives from the original leader thread, + * whose birth marks the true age of this process now. + * When we take on its identity by switching to its PID, we + * also take its birthdate (always earlier than our own). 
+ */ + current->start_time = leader->start_time; + spin_lock(&leader->proc_lock); spin_lock(¤t->proc_lock); proc_dentry1 = proc_pid_unhash(current); -- cgit v1.2.3 From 68250ba5df4c9d00d3064a0ba9a894035436916b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 10 Apr 2006 22:54:30 -0700 Subject: [PATCH] kdump: enable CONFIG_PROC_VMCORE by default Everybody seems to be using /proc/vmcore as a method to access the kernel crash dump. Hence probably it makes sense to enable CONFIG_PROC_VMCORE by default if CONFIG_CRASH_DUMP is selected. This makes kdump configuration further easier for a user. Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 97f31741312..2524629dc83 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -799,6 +799,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP + default y help Exports the dump image of crashed kernel in ELF format. -- cgit v1.2.3 From 091e881d0e55496d8887b61446ae1c598b0995b6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Apr 2006 22:54:31 -0700 Subject: [PATCH] inotify: check for NULL inode in inotify_d_instantiate The spufs file system creates files in a directory before instantiating the directory itself, which causes a NULL pointer access in inotify_d_instantiate since c32ccd87bfd1414b0aabfcd8dbc7539ad23bcbaa. I'd like to keep this behavior since it means that the user will not have access to files in the directory before I know that I succeed in creating everything in it. This patch adds a simple check for the inode to keep that working. Signed-off-by: Arnd Bergmann Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inotify.c b/fs/inotify.c index 367c487c014..1f50302849c 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -538,7 +538,7 @@ void inotify_d_instantiate(struct dentry *entry, struct inode *inode) WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED); spin_lock(&entry->d_lock); parent = entry->d_parent; - if (inotify_inode_watched(parent->d_inode)) + if (parent->d_inode && inotify_inode_watched(parent->d_inode)) entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; spin_unlock(&entry->d_lock); } -- cgit v1.2.3 From 389ed39b9711bbe5210d5e118e1f1af36ca88b7c Mon Sep 17 00:00:00 2001 From: "Ananiev, Leonid I" Date: Mon, 10 Apr 2006 22:54:38 -0700 Subject: [PATCH] ext3: Fix missed mutex unlock Missed unlock_super()call is added in error condition code path. 
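The bug class is an early return with the superblock lock still held: every exit taken after lock_super() has to pair it with unlock_super(). A generic, standalone sketch of the pattern (a pthread mutex standing in for the superblock lock; names are made up):

#include <errno.h>
#include <pthread.h>

static pthread_mutex_t resize_lock = PTHREAD_MUTEX_INITIALIZER;
static int resize_in_progress;

static int begin_resize(void)
{
	pthread_mutex_lock(&resize_lock);
	if (resize_in_progress) {
		/* the error path must drop the lock before bailing out,
		 * which is conceptually the one-line resize.c fix below */
		pthread_mutex_unlock(&resize_lock);
		return -EBUSY;
	}
	resize_in_progress = 1;
	pthread_mutex_unlock(&resize_lock);
	return 0;
}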
Signed-off-by: Leonid Ananiev Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/resize.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 1041dab6de2..14f5f6ea3e7 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -974,6 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_put; } -- cgit v1.2.3 From d3406ffa4af8af1d7c14cff06e003eb0a557d4ad Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:49 -0700 Subject: [PATCH] fuse: fix oops in fuse_send_readpages() During heavy parallel filesystem activity it was possible to Oops the kernel. The reason is that read_cache_pages() could skip pages which have already been inserted into the cache by another task. Occasionally this may result in zero pages actually being sent, while fuse_send_readpages() relies on at least one page being in the request. So check this corner case and just free the request instead of trying to send it. Reported and tested by Konstantin Isakov. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 975f2697e86..3ac39c0288d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -397,8 +397,12 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, return -EINTR; err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) - fuse_send_readpages(data.req, file, inode); + if (!err) { + if (data.req->num_pages) + fuse_send_readpages(data.req, file, inode); + else + fuse_put_request(fc, data.req); + } return err; } -- cgit v1.2.3 From 7025d9ad10a38dadef8b286e0092731c2d3cdc53 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:50 -0700 Subject: [PATCH] fuse: fix fuse_dev_poll() return value fuse_dev_poll() returned an error value instead of a poll mask. Luckily (or unluckily) -ENODEV does contain the POLLERR bit. There's also a race if filesystem is unmounted between fuse_get_conn() and spin_lock(), in which case this event will be missed by poll(). 
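The rule being restored is that a ->poll() method returns a mask of POLL* bits and never a negative errno; failure is reported as POLLERR. A minimal sketch of a well-formed poll method for a hypothetical character device (not the fuse code itself):

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct demo_dev {
	spinlock_t lock;
	wait_queue_head_t waitq;
	int connected;
	int have_input;
};

static unsigned int demo_poll(struct file *file, poll_table *wait)
{
	struct demo_dev *dev = file->private_data;
	unsigned int mask = POLLOUT | POLLWRNORM;

	if (!dev)
		return POLLERR;			/* a mask, not -ENODEV */

	poll_wait(file, &dev->waitq, wait);

	spin_lock(&dev->lock);
	if (!dev->connected)
		mask = POLLERR;			/* also covers the unmount race */
	else if (dev->have_input)
		mask |= POLLIN | POLLRDNORM;
	spin_unlock(&dev->lock);

	return mask;
}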
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 23d1f52eb1b..b2e8613a26d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -804,17 +804,18 @@ static ssize_t fuse_dev_write(struct file *file, const char __user *buf, static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { - struct fuse_conn *fc = fuse_get_conn(file); unsigned mask = POLLOUT | POLLWRNORM; - + struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return POLLERR; poll_wait(file, &fc->waitq, wait); spin_lock(&fuse_lock); - if (!list_empty(&fc->pending)) - mask |= POLLIN | POLLRDNORM; + if (!fc->connected) + mask = POLLERR; + else if (!list_empty(&fc->pending)) + mask |= POLLIN | POLLRDNORM; spin_unlock(&fuse_lock); return mask; -- cgit v1.2.3 From 385a17bfc3cb035333c8a91eddc78a6e04c4625e Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 10 Apr 2006 22:54:52 -0700 Subject: [PATCH] fuse: add O_ASYNC support to FUSE device This adds asynchronous notification to FUSE - a FUSE server can request O_ASYNC on a /dev/fuse file descriptor and receive SIGIO when there is input available. One subtlety - fuse_dev_fasync, which is called when O_ASYNC is requested, does no locking, unlike the other methods. I think it's unnecessary, as the fuse_conn.fasync list is manipulated only by fasync_helper and kill_fasync, which provide their own locking. It would also be wrong to use the fuse_lock, as it's a spin lock and fasync_helper can sleep. My one concern with this is the fuse_conn going away underneath fuse_dev_fasync - sys_fcntl takes a reference on the file struct, so this seems not to be a problem.
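The moving parts of O_ASYNC support are small: a ->fasync() method that defers to fasync_helper(), and a kill_fasync() call wherever new input arrives. A hedged sketch against a hypothetical device (the actual fuse hunks follow):

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/signal.h>

struct demo_dev {
	struct fasync_struct *fasync;	/* O_ASYNC subscribers */
};

static int demo_fasync(int fd, struct file *file, int on)
{
	struct demo_dev *dev = file->private_data;

	if (!dev)
		return -ENODEV;
	/* fasync_helper()/kill_fasync() do their own locking */
	return fasync_helper(fd, file, on, &dev->fasync);
}

static void demo_new_input(struct demo_dev *dev)
{
	/* ...queue the work and wake blocking readers, then: */
	kill_fasync(&dev->fasync, SIGIO, POLL_IN);
}

The release path also needs fasync_helper(-1, file, 0, ...) to drop the entry, and a userspace server arms this with fcntl(F_SETOWN) followed by setting O_ASYNC via F_SETFL.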
Signed-off-by: Jeff Dike Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 17 ++++++++++++++++- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b2e8613a26d..438770f8867 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -317,6 +317,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } /* @@ -901,6 +902,7 @@ void fuse_abort_conn(struct fuse_conn *fc) end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); wake_up_all(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } spin_unlock(&fuse_lock); } @@ -917,12 +919,24 @@ static int fuse_dev_release(struct inode *inode, struct file *file) end_requests(fc, &fc->processing); } spin_unlock(&fuse_lock); - if (fc) + if (fc) { + fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); + } return 0; } +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -ENODEV; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fc->fasync); +} + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .llseek = no_llseek, @@ -932,6 +946,7 @@ const struct file_operations fuse_dev_operations = { .writev = fuse_dev_writev, .poll = fuse_dev_poll, .release = fuse_dev_release, + .fasync = fuse_dev_fasync, }; static struct miscdevice fuse_miscdevice = { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a16a04fcf41..e5cb46b7843 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -318,6 +318,9 @@ struct fuse_conn { /** kobject */ struct kobject kobj; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 879e6fba948..78700cbb9cd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -216,6 +216,7 @@ static void fuse_put_super(struct super_block *sb) spin_unlock(&fuse_lock); up_write(&fc->sbput_sem); /* Flush all readers on this fs */ + kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); kobject_del(&fc->kobj); kobject_put(&fc->kobj); -- cgit v1.2.3 From e5ac1d1e70a8c19a65a959d73650203df7a2e168 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 10 Apr 2006 22:54:53 -0700 Subject: [PATCH] fuse: add O_NONBLOCK support to FUSE device I don't like duplicating the connected and list_empty tests in fuse_dev_readv, but this seemed cleaner than adding the f_flags test to request_wait. 
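The change follows the standard non-blocking pattern for a read path that can sleep: before waiting, fail with -EAGAIN if the caller set O_NONBLOCK and nothing is queued yet. A sketch against a hypothetical device (not the literal fuse_dev_readv code):

#include <linux/fs.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct demo_dev {
	spinlock_t lock;
	struct list_head pending;
	wait_queue_head_t waitq;
	int connected;
};

static ssize_t demo_read(struct file *file, char __user *buf,
			 size_t count, loff_t *ppos)
{
	struct demo_dev *dev = file->private_data;

	spin_lock(&dev->lock);
	if ((file->f_flags & O_NONBLOCK) && dev->connected &&
	    list_empty(&dev->pending)) {
		spin_unlock(&dev->lock);
		return -EAGAIN;		/* don't sleep; caller can poll() */
	}
	/* ...otherwise wait for a request and copy it out to buf... */
	spin_unlock(&dev->lock);
	return 0;
}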
Signed-off-by: Jeff Dike Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 438770f8867..75c6e9166c3 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -619,6 +619,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, err = -EPERM; if (!fc) goto err_unlock; + + err = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) && fc->connected && + list_empty(&fc->pending)) + goto err_unlock; + request_wait(fc); err = -ENODEV; if (!fc->connected) -- cgit v1.2.3 From 0720b315976447cba3f0c3e211223b8cb82b0f93 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:55 -0700 Subject: [PATCH] fuse: simplify locking This is in preparation for removing the global spinlock in favor of a per-mount one. The only critical part is the interaction between fuse_dev_release() and fuse_fill_super(): fuse_dev_release() must see the assignment to file->private_data, otherwise it will leak the reference to fuse_conn. This is ensured by the fput() operation, which will synchronize the assignment with other CPU's that may do a final fput() soon after this. Also redundant locking is removed from fuse_fill_super(), where exclusion is already ensured by the BKL held for this function by the VFS. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 31 ++++++++++------------------ fs/fuse/inode.c | 64 ++++++++++++++++++++------------------------------------- 2 files changed, 33 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 75c6e9166c3..c510533c684 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,13 +23,11 @@ static kmem_cache_t *fuse_req_cachep; static struct fuse_conn *fuse_get_conn(struct file *file) { - struct fuse_conn *fc; - spin_lock(&fuse_lock); - fc = file->private_data; - if (fc && !fc->connected) - fc = NULL; - spin_unlock(&fuse_lock); - return fc; + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. 
+ */ + return file->private_data; } static void fuse_request_init(struct fuse_req *req) @@ -607,19 +605,16 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *off) { int err; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_in *in; struct fuse_copy_state cs; unsigned reqsize; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; restart: spin_lock(&fuse_lock); - fc = file->private_data; - err = -EPERM; - if (!fc) - goto err_unlock; - err = -EAGAIN; if ((file->f_flags & O_NONBLOCK) && fc->connected && list_empty(&fc->pending)) @@ -915,17 +910,13 @@ void fuse_abort_conn(struct fuse_conn *fc) static int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc; - - spin_lock(&fuse_lock); - fc = file->private_data; + struct fuse_conn *fc = fuse_get_conn(file); if (fc) { + spin_lock(&fuse_lock); fc->connected = 0; end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); - } - spin_unlock(&fuse_lock); - if (fc) { + spin_unlock(&fuse_lock); fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 78700cbb9cd..620579a6910 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -414,37 +414,6 @@ static struct fuse_conn *new_conn(void) return fc; } -static struct fuse_conn *get_conn(struct file *file, struct super_block *sb) -{ - struct fuse_conn *fc; - int err; - - err = -EINVAL; - if (file->f_op != &fuse_dev_operations) - goto out_err; - - err = -ENOMEM; - fc = new_conn(); - if (!fc) - goto out_err; - - spin_lock(&fuse_lock); - err = -EINVAL; - if (file->private_data) - goto out_unlock; - - kobject_get(&fc->kobj); - file->private_data = fc; - spin_unlock(&fuse_lock); - return fc; - - out_unlock: - spin_unlock(&fuse_lock); - kobject_put(&fc->kobj); - out_err: - return ERR_PTR(err); -} - static struct inode *get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -526,12 +495,9 @@ static void fuse_send_init(struct fuse_conn *fc) static unsigned long long conn_id(void) { + /* BKL is held for ->get_sb() */ static unsigned long long ctr = 1; - unsigned long long val; - spin_lock(&fuse_lock); - val = ctr++; - spin_unlock(&fuse_lock); - return val; + return ctr++; } static int fuse_fill_super(struct super_block *sb, void *data, int silent) @@ -556,10 +522,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) return -EINVAL; - fc = get_conn(file, sb); - fput(file); - if (IS_ERR(fc)) - return PTR_ERR(fc); + if (file->f_op != &fuse_dev_operations) + return -EINVAL; + + /* Setting file->private_data can't race with other mount() + instances, since BKL is held for ->get_sb() */ + if (file->private_data) + return -EINVAL; + + fc = new_conn(); + if (!fc) + return -ENOMEM; fc->flags = d.flags; fc->user_id = d.user_id; @@ -589,10 +562,16 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_put_root; sb->s_root = root_dentry; - spin_lock(&fuse_lock); fc->mounted = 1; fc->connected = 1; - spin_unlock(&fuse_lock); + kobject_get(&fc->kobj); + file->private_data = fc; + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); fuse_send_init(fc); @@ -601,6 +580,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err_put_root: dput(root_dentry); err: + fput(file); kobject_put(&fc->kobj); return err; } -- cgit 
v1.2.3 From d713311464bcca73c990d1a1b5c9467eae87f5b4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:55 -0700 Subject: [PATCH] fuse: use a per-mount spinlock Remove the global spinlock in favor of a per-mount one. This patch is basically find & replace. The difficult part has already been done by the previous patch. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 122 ++++++++++++++++++++++++++++--------------------------- fs/fuse/fuse_i.h | 24 +++-------- fs/fuse/inode.c | 12 +++--- 3 files changed, 74 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c510533c684..63d2cf43b5e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -94,11 +94,11 @@ static struct fuse_req *do_get_request(struct fuse_conn *fc) { struct fuse_req *req; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); BUG_ON(list_empty(&fc->unused_list)); req = list_entry(fc->unused_list.next, struct fuse_req, list); list_del_init(&req->list); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fuse_request_init(req); req->preallocated = 1; req->in.h.uid = current->fsuid; @@ -124,7 +124,7 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc) return do_get_request(fc); } -/* Must be called with fuse_lock held */ +/* Must be called with fc->lock held */ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) { if (req->preallocated) { @@ -143,9 +143,9 @@ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fuse_putback_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } } @@ -155,15 +155,15 @@ static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) fuse_putback_request(fc, req); } -void fuse_release_background(struct fuse_req *req) +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); iput(req->inode2); if (req->file) fput(req->file); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); list_del(&req->bg_entry); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } /* @@ -182,7 +182,7 @@ void fuse_release_background(struct fuse_req *req) * interrupted and put in the background, it will return with an error * and hence never be reset and reused. * - * Called with fuse_lock, unlocks it + * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) { @@ -191,14 +191,14 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (!req->background) { wake_up(&req->waitq); fuse_put_request_locked(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } else { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); down_read(&fc->sbput_sem); if (fc->mounted) - fuse_release_background(req); + fuse_release_background(fc, req); up_read(&fc->sbput_sem); if (end) end(fc, req); @@ -248,16 +248,16 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) get_file(req->file); } -/* Called with fuse_lock held. Releases, and then reacquires it. 
*/ +/* Called with fc->lock held. Releases, and then reacquires it. */ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) { sigset_t oldset; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); block_sigs(&oldset); wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); restore_sigs(&oldset); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->state == FUSE_REQ_FINISHED && !req->interrupted) return; @@ -271,9 +271,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) locked state, there mustn't be any filesystem operation (e.g. page fault), since that could lead to deadlock */ - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } if (req->state == FUSE_REQ_PENDING) { list_del(&req->list); @@ -324,7 +324,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) void request_send(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; else if (fc->conn_error) @@ -337,15 +337,15 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) request_wait_answer(fc, req); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (fc->connected) { queue_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } else { req->out.h.error = -ENOTCONN; request_end(fc, req); @@ -361,9 +361,9 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) void request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); background_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); request_send_nowait(fc, req); } @@ -372,16 +372,16 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) * anything that could cause a page-fault. If the request was already * interrupted bail out. */ -static int lock_request(struct fuse_req *req) +static int lock_request(struct fuse_conn *fc, struct fuse_req *req) { int err = 0; if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->interrupted) err = -ENOENT; else req->locked = 1; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return err; } @@ -391,18 +391,19 @@ static int lock_request(struct fuse_req *req) * requester thread is currently waiting for it to be unlocked, so * wake it up. 
*/ -static void unlock_request(struct fuse_req *req) +static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) { if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (req->interrupted) wake_up(&req->waitq); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } } struct fuse_copy_state { + struct fuse_conn *fc; int write; struct fuse_req *req; const struct iovec *iov; @@ -415,11 +416,12 @@ struct fuse_copy_state { unsigned len; }; -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct fuse_req *req, const struct iovec *iov, - unsigned long nr_segs) +static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, + int write, struct fuse_req *req, + const struct iovec *iov, unsigned long nr_segs) { memset(cs, 0, sizeof(*cs)); + cs->fc = fc; cs->write = write; cs->req = req; cs->iov = iov; @@ -449,7 +451,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) unsigned long offset; int err; - unlock_request(cs->req); + unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); if (!cs->seglen) { BUG_ON(!cs->nr_segs); @@ -472,7 +474,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->seglen -= cs->len; cs->addr += cs->len; - return lock_request(cs->req); + return lock_request(cs->fc, cs->req); } /* Do as much copy to/from userspace buffer as we can */ @@ -584,9 +586,9 @@ static void request_wait(struct fuse_conn *fc) if (signal_pending(current)) break; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); schedule(); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } set_current_state(TASK_RUNNING); remove_wait_queue(&fc->waitq, &wait); @@ -614,7 +616,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, return -EPERM; restart: - spin_lock(&fuse_lock); + spin_lock(&fc->lock); err = -EAGAIN; if ((file->f_flags & O_NONBLOCK) && fc->connected && list_empty(&fc->pending)) @@ -643,14 +645,14 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, request_end(fc, req); goto restart; } - spin_unlock(&fuse_lock); - fuse_copy_init(&cs, 1, req, iov, nr_segs); + spin_unlock(&fc->lock); + fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); if (!err) err = fuse_copy_args(&cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err && req->interrupted) err = -ENOENT; @@ -665,12 +667,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, else { req->state = FUSE_REQ_SENT; list_move_tail(&req->list, &fc->processing); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return reqsize; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); return err; } @@ -739,7 +741,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, if (!fc) return -ENODEV; - fuse_copy_init(&cs, 0, NULL, iov, nr_segs); + fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; @@ -751,7 +753,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, oh.len != nbytes) goto err_finish; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); err = -ENOENT; if (!fc->connected) goto err_unlock; @@ -762,9 +764,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, goto err_unlock; if (req->interrupted) { - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + 
spin_lock(&fc->lock); request_end(fc, req); return -ENOENT; } @@ -772,12 +774,12 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, req->out.h = oh; req->locked = 1; cs.req = req; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err = copy_out_args(&cs, &req->out, nbytes); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err) { if (req->interrupted) @@ -789,7 +791,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, return err ? err : nbytes; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err_finish: fuse_copy_finish(&cs); return err; @@ -813,12 +815,12 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) poll_wait(file, &fc->waitq, wait); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (!fc->connected) mask = POLLERR; else if (!list_empty(&fc->pending)) mask |= POLLIN | POLLRDNORM; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); return mask; } @@ -826,7 +828,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) /* * Abort all requests on the given list (pending or processing) * - * This function releases and reacquires fuse_lock + * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) { @@ -835,7 +837,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; request_end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } @@ -866,10 +868,10 @@ static void end_io_requests(struct fuse_conn *fc) req->end = NULL; /* The end function will consume this reference */ __fuse_get_request(req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } } @@ -896,7 +898,7 @@ static void end_io_requests(struct fuse_conn *fc) */ void fuse_abort_conn(struct fuse_conn *fc) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (fc->connected) { fc->connected = 0; end_io_requests(fc); @@ -905,18 +907,18 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static int fuse_dev_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = fuse_get_conn(file); if (fc) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fc->connected = 0; end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e5cb46b7843..6ed812fd620 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -144,7 +144,7 @@ struct fuse_req { /* * The following bitfields are either set once before the * request is queued or setting/clearing them is protected by - * fuse_lock + * fuse_conn->lock */ /** True if the request has reply */ @@ -213,6 +213,9 @@ struct fuse_req { * unmounted. 
*/ struct fuse_conn { + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + /** The user id for this mount */ uid_t user_id; @@ -351,21 +354,6 @@ static inline u64 get_node_id(struct inode *inode) /** Device operations */ extern const struct file_operations fuse_dev_operations; -/** - * This is the single global spinlock which protects FUSE's structures - * - * The following data is protected by this lock: - * - * - the private_data field of the device file - * - the s_fs_info field of the super block - * - unused_list, pending, processing lists in fuse_conn - * - background list in fuse_conn - * - the unique request ID counter reqctr in fuse_conn - * - the sb (super_block) field in fuse_conn - * - the file (device file) field in fuse_conn - */ -extern spinlock_t fuse_lock; - /** * Get a filled in inode */ @@ -490,7 +478,7 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** * Release inodes and file associated with background request */ -void fuse_release_background(struct fuse_req *req); +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 620579a6910..cc58debeabd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -22,7 +22,6 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); -spinlock_t fuse_lock; static kmem_cache_t *fuse_inode_cachep; static struct subsystem connections_subsys; @@ -207,13 +206,14 @@ static void fuse_put_super(struct super_block *sb) down_write(&fc->sbput_sem); while (!list_empty(&fc->background)) - fuse_release_background(list_entry(fc->background.next, + fuse_release_background(fc, + list_entry(fc->background.next, struct fuse_req, bg_entry)); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fc->mounted = 0; fc->connected = 0; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); up_write(&fc->sbput_sem); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -388,6 +388,7 @@ static struct fuse_conn *new_conn(void) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { int i; + spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); @@ -734,7 +735,6 @@ static int __init fuse_init(void) printk("fuse init (API version %i.%i)\n", FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); - spin_lock_init(&fuse_lock); res = fuse_fs_init(); if (res) goto err; -- cgit v1.2.3 From a87046d822f2d982d25b24c4a644d34f22d4888a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:56 -0700 Subject: [PATCH] fuse: consolidate device errors Return consistent error values for the case when the opened device file has no mount associated yet. 
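The convention being settled on is that any operation on a /dev/fuse descriptor with no connection attached yet fails the same way. A tiny sketch of the idea with hypothetical names:

#include <linux/fs.h>

struct demo_conn { int connected; };

/* NULL until a mount attaches a connection to this open file */
static struct demo_conn *demo_get_conn(struct file *file)
{
	return file->private_data;
}

static ssize_t demo_write(struct file *file, const char __user *buf,
			  size_t count, loff_t *ppos)
{
	struct demo_conn *dc = demo_get_conn(file);

	if (!dc)
		return -EPERM;	/* same errno as every other entry point */
	return count;
}

static int demo_fasync_handler(int fd, struct file *file, int on)
{
	struct demo_conn *dc = demo_get_conn(file);

	if (!dc)
		return -EPERM;	/* previously -ENODEV here; now uniform */
	return 0;
}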
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 63d2cf43b5e..6b8843d4ad8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -739,7 +739,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, struct fuse_copy_state cs; struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return -EPERM; fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) @@ -930,7 +930,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return -EPERM; /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, &fc->fasync); -- cgit v1.2.3 From ce1d5a491f0ee50560416a73faa5e4ddbab074bd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:58 -0700 Subject: [PATCH] fuse: clean up request accounting FUSE allocated most requests from a fixed size pool filled at mount time. However in some cases (release/forget) non-pool requests were used. File locking operations aren't well served by the request pool, since they may block indefinetly thus exhausting the pool. This patch removes the request pool and always allocates requests on demand. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 73 +++++----------------------------- fs/fuse/dir.c | 118 +++++++++++++++++++++++++++---------------------------- fs/fuse/file.c | 48 +++++++++++----------- fs/fuse/fuse_i.h | 26 +++--------- fs/fuse/inode.c | 54 +++++++------------------ 5 files changed, 111 insertions(+), 208 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6b8843d4ad8..4dc104c0e95 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -72,10 +72,8 @@ static void restore_sigs(sigset_t *oldset) */ void fuse_reset_request(struct fuse_req *req) { - int preallocated = req->preallocated; BUG_ON(atomic_read(&req->count) != 1); fuse_request_init(req); - req->preallocated = preallocated; } static void __fuse_get_request(struct fuse_req *req) @@ -90,71 +88,28 @@ static void __fuse_put_request(struct fuse_req *req) atomic_dec(&req->count); } -static struct fuse_req *do_get_request(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc) { - struct fuse_req *req; + struct fuse_req *req = fuse_request_alloc(); + if (!req) + return ERR_PTR(-ENOMEM); - spin_lock(&fc->lock); - BUG_ON(list_empty(&fc->unused_list)); - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del_init(&req->list); - spin_unlock(&fc->lock); + atomic_inc(&fc->num_waiting); fuse_request_init(req); - req->preallocated = 1; req->in.h.uid = current->fsuid; req->in.h.gid = current->fsgid; req->in.h.pid = current->pid; return req; } -/* This can return NULL, but only in case it's interrupted by a SIGKILL */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc) -{ - int intr; - sigset_t oldset; - - atomic_inc(&fc->num_waiting); - block_sigs(&oldset); - intr = down_interruptible(&fc->outstanding_sem); - restore_sigs(&oldset); - if (intr) { - atomic_dec(&fc->num_waiting); - return NULL; - } - return do_get_request(fc); -} - -/* Must be called with fc->lock held */ -static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) -{ - if (req->preallocated) { - atomic_dec(&fc->num_waiting); - 
list_add(&req->list, &fc->unused_list); - } else - fuse_request_free(req); - - /* If we are in debt decrease that first */ - if (fc->outstanding_debt) - fc->outstanding_debt--; - else - up(&fc->outstanding_sem); -} - void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - spin_lock(&fc->lock); - fuse_putback_request(fc, req); - spin_unlock(&fc->lock); + atomic_dec(&fc->num_waiting); + fuse_request_free(req); } } -static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) -{ - if (atomic_dec_and_test(&req->count)) - fuse_putback_request(fc, req); -} - void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); @@ -189,9 +144,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) list_del(&req->list); req->state = FUSE_REQ_FINISHED; if (!req->background) { - wake_up(&req->waitq); - fuse_put_request_locked(fc, req); spin_unlock(&fc->lock); + wake_up(&req->waitq); + fuse_put_request(fc, req); } else { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; @@ -302,16 +257,6 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) req->in.h.unique = fc->reqctr; req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); - if (!req->preallocated) { - /* If request is not preallocated (either FORGET or - RELEASE), then still decrease outstanding_sem, so - user can't open infinite number of files while not - processing the RELEASE requests. However for - efficiency do it without blocking, so if down() - would block, just increase the debt instead */ - if (down_trylock(&fc->outstanding_sem)) - fc->outstanding_debt++; - } list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; wake_up(&fc->waitq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 256355b8025..8d7546e832e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -117,8 +117,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 0; fc = get_fuse_conn(inode); - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + if (IS_ERR(req)) return 0; fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); @@ -188,9 +188,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, if (entry->d_name.len > FUSE_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - req = fuse_get_request(fc); - if (!req) - return ERR_PTR(-EINTR); + req = fuse_get_req(fc); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); @@ -244,15 +244,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct file *file; int flags = nd->intent.open.flags - 1; - err = -ENOSYS; if (fc->no_create) - goto out; + return -ENOSYS; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) - goto out; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + err = -ENOMEM; ff = fuse_file_alloc(); if (!ff) goto out_put_request; @@ -314,7 +313,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, fuse_file_free(ff); out_put_request: fuse_put_request(fc, req); - out: return err; } @@ -375,9 +373,9 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if 
(IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -407,9 +405,9 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -427,9 +425,9 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_SYMLINK; req->in.numargs = 2; @@ -444,9 +442,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_UNLINK; req->in.h.nodeid = get_node_id(dir); @@ -476,9 +474,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); @@ -504,9 +502,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.newdir = get_node_id(newdir); @@ -553,9 +551,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); @@ -583,9 +581,9 @@ int fuse_do_getattr(struct inode *inode) int err; struct fuse_attr_out arg; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_GETATTR; req->in.h.nodeid = get_node_id(inode); @@ -673,9 +671,9 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask; @@ -780,9 +778,9 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); page = alloc_page(GFP_KERNEL); if (!page) { @@ -809,11 +807,11 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = 
fuse_get_request(fc); + struct fuse_req *req = fuse_get_req(fc); char *link; - if (!req) - return ERR_PTR(-EINTR); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); link = (char *) __get_free_page(GFP_KERNEL); if (!link) { @@ -933,9 +931,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) } } - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); iattr_to_fattr(attr, &inarg); @@ -995,9 +993,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name, if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1035,9 +1033,9 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1085,9 +1083,9 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1131,9 +1129,9 @@ static int fuse_removexattr(struct dentry *entry, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_REMOVEXATTR; req->in.h.nodeid = get_node_id(inode); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3ac39c0288d..e4f041a11bb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -22,9 +22,9 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, struct fuse_req *req; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -184,9 +184,9 @@ static int fuse_flush(struct file *file) if (fc->no_flush) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -223,9 +223,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -297,9 +297,9 @@ static int fuse_readpage(struct file *file, struct page *page) if (is_bad_inode(inode)) goto out; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) goto out; req->out.page_zeroing = 1; @@ -368,10 +368,10 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_send_readpages(req, data->file, inode); - data->req = req = fuse_get_request(fc); - if (!req) { + data->req = req = fuse_get_req(fc); + if (IS_ERR(req)) { unlock_page(page); - return 
-EINTR; + return PTR_ERR(req); } } req->pages[req->num_pages] = page; @@ -392,9 +392,9 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_request(fc); - if (!data.req) - return -EINTR; + data.req = fuse_get_req(fc); + if (IS_ERR(data.req)) + return PTR_ERR(data.req); err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { @@ -455,9 +455,9 @@ static int fuse_commit_write(struct file *file, struct page *page, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->num_pages = 1; req->pages[0] = page; @@ -532,9 +532,9 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); while (count) { size_t nres; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6ed812fd620..242e69cb125 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -18,9 +18,6 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 -/** If more requests are outstanding, then the operation will block */ -#define FUSE_MAX_OUTSTANDING 10 - /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -131,8 +128,8 @@ struct fuse_conn; * A request to the client */ struct fuse_req { - /** This can be on either unused_list, pending processing or - io lists in fuse_conn */ + /** This can be on either pending processing or io lists in + fuse_conn */ struct list_head list; /** Entry on the background list */ @@ -150,9 +147,6 @@ struct fuse_req { /** True if the request has reply */ unsigned isreply:1; - /** The request is preallocated */ - unsigned preallocated:1; - /** The request was interrupted */ unsigned interrupted:1; @@ -247,19 +241,9 @@ struct fuse_conn { interrupted request) */ struct list_head background; - /** Controls the maximum number of outstanding requests */ - struct semaphore outstanding_sem; - - /** This counts the number of outstanding requests if - outstanding_sem would go negative */ - unsigned outstanding_debt; - /** RW semaphore for exclusion with fuse_put_super() */ struct rw_semaphore sbput_sem; - /** The list of unused requests */ - struct list_head unused_list; - /** The next unique request id */ u64 reqctr; @@ -452,11 +436,11 @@ void fuse_reset_request(struct fuse_req *req); /** * Reserve a preallocated request */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc); /** - * Decrement reference count of a request. If count goes to zero put - * on unused list (preallocated) or free request (not preallocated). + * Decrement reference count of a request. If count goes to zero free + * the request. 
*/ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cc58debeabd..824ebbc428e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -243,9 +243,9 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&outarg, 0, sizeof(outarg)); req->in.numargs = 0; @@ -370,15 +370,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) static void fuse_conn_release(struct kobject *kobj) { - struct fuse_conn *fc = get_fuse_conn_kobj(kobj); - - while (!list_empty(&fc->unused_list)) { - struct fuse_req *req; - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del(&req->list); - fuse_request_free(req); - } - kfree(fc); + kfree(get_fuse_conn_kobj(kobj)); } static struct fuse_conn *new_conn(void) @@ -387,27 +379,16 @@ static struct fuse_conn *new_conn(void) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { - int i; spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->unused_list); INIT_LIST_HEAD(&fc->background); - sema_init(&fc->outstanding_sem, 1); /* One for INIT */ init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); - for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) { - struct fuse_req *req = fuse_request_alloc(); - if (!req) { - kobject_put(&fc->kobj); - return NULL; - } - list_add(&req->list, &fc->unused_list); - } fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; fc->reqctr = 0; @@ -438,7 +419,6 @@ static struct super_operations fuse_super_operations = { static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - int i; struct fuse_init_out *arg = &req->misc.init_out; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) @@ -457,22 +437,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } - - /* After INIT reply is received other requests can go - out. So do (FUSE_MAX_OUTSTANDING - 1) number of - up()s on outstanding_sem. 
The last up() is done in - fuse_putback_request() */ - for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) - up(&fc->outstanding_sem); - fuse_put_request(fc, req); } -static void fuse_send_init(struct fuse_conn *fc) +static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) { - /* This is called from fuse_read_super() so there's guaranteed - to be exactly one request available */ - struct fuse_req *req = fuse_get_request(fc); struct fuse_init_in *arg = &req->misc.init_in; arg->major = FUSE_KERNEL_VERSION; @@ -508,6 +477,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) struct fuse_mount_data d; struct file *file; struct dentry *root_dentry; + struct fuse_req *init_req; int err; if (!parse_fuse_opt((char *) data, &d)) @@ -554,13 +524,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err; } + init_req = fuse_request_alloc(); + if (!init_req) + goto err_put_root; + err = kobject_set_name(&fc->kobj, "%llu", conn_id()); if (err) - goto err_put_root; + goto err_free_req; err = kobject_add(&fc->kobj); if (err) - goto err_put_root; + goto err_free_req; sb->s_root = root_dentry; fc->mounted = 1; @@ -574,10 +548,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) */ fput(file); - fuse_send_init(fc); + fuse_send_init(fc, init_req); return 0; + err_free_req: + fuse_request_free(init_req); err_put_root: dput(root_dentry); err: -- cgit v1.2.3 From 08a53cdce62d37d918530bbbf726cc01b21dc3d1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:59 -0700 Subject: [PATCH] fuse: account background requests The previous patch removed limiting the number of outstanding requests. This patch adds a much simpler limiting, that is also compatible with file locking operations. A task may have at most one synchronous request allocated. So these requests need not be otherwise limited. However the number of background requests (release, forget, asynchronous reads, interrupted requests) can grow indefinitely. This can be used by a malicous user to cause FUSE to allocate arbitrary amounts of unswappable kernel memory, denying service. For this reason add a limit for the number of background requests, and block allocations of new requests until the number goes bellow the limit. Also use this mechanism to block all requests until the INIT reply is received. 
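The throttling described above is simple enough to model outside the kernel. The sketch below is a userspace analogue only, not the patch itself: a pthread mutex and condition variable stand in for fc->lock and fc->blocked_waitq, the signal handling of the real wait_event_interruptible() is left out, and main() exists only so the model compiles and runs.

/* Userspace model of the background-request throttle this patch introduces.
 * Assumption: pthread primitives replace the kernel spinlock and waitqueue. */
#include <pthread.h>
#include <stdio.h>

#define FUSE_MAX_BACKGROUND 10

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t blocked_waitq = PTHREAD_COND_INITIALIZER;
static unsigned int num_background;
static int blocked = 1;			/* stays set until the INIT reply */

/* like fuse_get_req(): a synchronous caller sleeps while the connection is
 * blocked (the real code also handles signals) */
static void get_req_wait(void)
{
	pthread_mutex_lock(&lock);
	while (blocked)
		pthread_cond_wait(&blocked_waitq, &lock);
	pthread_mutex_unlock(&lock);
}

/* like background_request(): hitting the limit blocks new allocations */
static void background_request(void)
{
	pthread_mutex_lock(&lock);
	if (++num_background == FUSE_MAX_BACKGROUND)
		blocked = 1;
	pthread_mutex_unlock(&lock);
}

/* like fuse_release_background(): dropping below the limit wakes waiters */
static void release_background(void)
{
	pthread_mutex_lock(&lock);
	if (num_background-- == FUSE_MAX_BACKGROUND) {
		blocked = 0;
		pthread_cond_broadcast(&blocked_waitq);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	blocked = 0;			/* pretend the INIT reply arrived */
	background_request();
	get_req_wait();
	release_background();
	printf("background requests outstanding: %u\n", num_background);
	return 0;
}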
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 24 ++++++++++++++++++++---- fs/fuse/fuse_i.h | 14 ++++++++++++++ fs/fuse/inode.c | 4 ++++ 3 files changed, 38 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4dc104c0e95..6c740f86066 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -90,7 +90,17 @@ static void __fuse_put_request(struct fuse_req *req) struct fuse_req *fuse_get_req(struct fuse_conn *fc) { - struct fuse_req *req = fuse_request_alloc(); + struct fuse_req *req; + sigset_t oldset; + int err; + + block_sigs(&oldset); + err = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); + restore_sigs(&oldset); + if (err) + return ERR_PTR(-EINTR); + + req = fuse_request_alloc(); if (!req) return ERR_PTR(-ENOMEM); @@ -118,6 +128,11 @@ void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) fput(req->file); spin_lock(&fc->lock); list_del(&req->bg_entry); + if (fc->num_background == FUSE_MAX_BACKGROUND) { + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); + } + fc->num_background--; spin_unlock(&fc->lock); } @@ -195,6 +210,9 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) { req->background = 1; list_add(&req->bg_entry, &fc->background); + fc->num_background++; + if (fc->num_background == FUSE_MAX_BACKGROUND) + fc->blocked = 1; if (req->inode) req->inode = igrab(req->inode); if (req->inode2) @@ -288,6 +306,7 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { spin_lock(&fc->lock); + background_request(fc, req); if (fc->connected) { queue_request(fc, req); spin_unlock(&fc->lock); @@ -306,9 +325,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) void request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fc->lock); - background_request(fc, req); - spin_unlock(&fc->lock); request_send_nowait(fc, req); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 242e69cb125..19c7185a754 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -18,6 +18,9 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 +/** Maximum number of outstanding background requests */ +#define FUSE_MAX_BACKGROUND 10 + /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -241,6 +244,17 @@ struct fuse_conn { interrupted request) */ struct list_head background; + /** Number of requests currently in the background */ + unsigned num_background; + + /** Flag indicating if connection is blocked. 
This will be + the case before the INIT reply is received, and if there + are too many outstading backgrounds requests */ + int blocked; + + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; + /** RW semaphore for exclusion with fuse_put_super() */ struct rw_semaphore sbput_sem; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 824ebbc428e..fd34037b058 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -381,6 +381,7 @@ static struct fuse_conn *new_conn(void) if (fc) { spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); @@ -392,6 +393,7 @@ static struct fuse_conn *new_conn(void) fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; fc->reqctr = 0; + fc->blocked = 1; } return fc; } @@ -438,6 +440,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } fuse_put_request(fc, req); + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); } static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) -- cgit v1.2.3 From 7775f4c85dcbd1175f21b2fbb7221c79ec70b722 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:20 -0700 Subject: [PATCH] knfsd: Correct reserved reply space for read requests. NFSd makes sure there is enough space to hold the maximum possible reply before accepting a request. The units for this maximum is (4byte) words. However in three places, particularly for read request, the number given is a number of bytes. This means too much space is reserved which is slightly wasteful. This is the sort of patch that could uncover a deeper bug, and it is not critical, so it would be best for it to spend a while in -mm before going in to mainline. 
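To see how much is over-reserved, assume for illustration an NFSSVC_MAXBLKSIZE of 32KB; the snippet below only works the arithmetic and is not taken from the kernel headers.

/* Reserved reply space is counted in 4-byte XDR words, not bytes.
 * The 32KB value is illustrative, not the kernel's actual define. */
#include <stdio.h>

#define NFSSVC_MAXBLKSIZE (32 * 1024)	/* bytes */

int main(void)
{
	unsigned int old_resv = NFSSVC_MAXBLKSIZE;	/* bytes mistakenly used as a word count */
	unsigned int new_resv = NFSSVC_MAXBLKSIZE / 4;	/* correct word count */

	printf("old reservation: %u words = %u bytes\n", old_resv, old_resv * 4);
	printf("new reservation: %u words = %u bytes\n", new_resv, new_resv * 4);
	return 0;
}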
(akpm: target 2.6.17-rc2, 2.6.16.3 (approx)) Discovered-by: "Eivind Sarto" Signed-off-by: Neil Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs3proc.c | 2 +- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfsproc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 6d2dfed1de0..f61142afea4 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -682,7 +682,7 @@ static struct svc_procedure nfsd_procedures3[22] = { PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), - PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE), + PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6d63f1d9e5f..ca8a4c410de 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -975,7 +975,7 @@ struct nfsd4_voidargs { int dummy; }; */ static struct svc_procedure nfsd_procedures4[2] = { PROC(null, void, void, void, RC_NOCACHE, 1), - PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE) + PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) }; struct svc_version nfsd_version4 = { diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 3e6b75cd90f..06cd0db0f32 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -553,7 +553,7 @@ static struct svc_procedure nfsd_procedures2[18] = { PROC(none, void, void, none, RC_NOCACHE, ST), PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), - PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE), + PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4), PROC(none, void, void, none, RC_NOCACHE, ST), PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), -- cgit v1.2.3 From d5b9026a670fdb404e6e2e2e0a1b447e9ea9c1f6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:22 -0700 Subject: [PATCH] knfsd: locks: flag NFSv4-owned locks Use the fl_lmops field to identify which locks are ours, instead of trying to look them up in our private hash. This is safer and more efficient. Earlier versions of this patch used a lock flag instead, but Trond pointed out that adding a new flag for each lock manager wasn't going to scale well, and suggested this approach instead; a separate patch converts lockd to using fl_lmops in the same way. In the NFSv4 case this looks like a bit of a hack, since the NFSv4 server isn't currently actually defining a lock_manager_operations struct, so we end up defining one *just* to serve as a cookie to identify our locks. But it works, and we actually do expect to start using the lock_manager_operations at some point anyway. Signed-off-by: Marc Eshel Signed-off-by: Andy Adamson Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 47ec112b266..ffedce08b4c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2495,36 +2495,27 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = OFFSET_MAX; } -static int -nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval) -{ - struct nfs4_stateowner *local = NULL; - int status = 0; - - if (hashval >= LOCK_HASH_SIZE) - goto out; - list_for_each_entry(local, &lock_ownerid_hashtbl[hashval], so_idhash) { - if (local == sop) { - status = 1; - goto out; - } - } -out: - return status; -} - +/* Hack!: For now, we're defining this just so we can use a pointer to it + * as a unique cookie to identify our (NFSv4's) posix locks. */ +struct lock_manager_operations nfsd_posix_mng_ops = { +}; static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop = (struct nfs4_stateowner *) fl->fl_owner; - unsigned int hval = lockownerid_hashval(sop->so_id); + struct nfs4_stateowner *sop; + unsigned int hval; - deny->ld_sop = NULL; - if (nfs4_verify_lock_stateowner(sop, hval)) { + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + sop = (struct nfs4_stateowner *) fl->fl_owner; + hval = lockownerid_hashval(sop->so_id); kref_get(&sop->so_ref); deny->ld_sop = sop; deny->ld_clientid = sop->so_client->cl_clientid; + } else { + deny->ld_sop = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; deny->ld_length = ~(u64)0; @@ -2736,6 +2727,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lock->lk_offset; if ((lock->lk_length == ~(u64)0) || @@ -2841,6 +2833,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lockt->lt_offset; if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) @@ -2900,6 +2893,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = locku->lu_offset; if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) -- cgit v1.2.3 From e465a77f943f51df1a169426df879340bd0db3f3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Apr 2006 22:55:23 -0700 Subject: [PATCH] fs/nfsd/nfs4state.c: make a struct static Signed-off-by: Adrian Bunk Cc: Marc Eshel Cc: Andy Adamson Cc: J. 
Bruce Fields Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ffedce08b4c..a8c2122a481 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2497,7 +2497,7 @@ nfs4_transform_lock_offset(struct file_lock *lock) /* Hack!: For now, we're defining this just so we can use a pointer to it * as a unique cookie to identify our (NFSv4's) posix locks. */ -struct lock_manager_operations nfsd_posix_mng_ops = { +static struct lock_manager_operations nfsd_posix_mng_ops = { }; static inline void -- cgit v1.2.3 From 249920527f9e6e5c305538bbf1ea882ee7dc1c06 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:24 -0700 Subject: [PATCH] knfsd: nfsd4: Wrong error handling in nfs4acl this fixes coverity id #3. Coverity detected dead code, since the == -1 comparison only returns 0 or 1 to error. Therefore the if ( error < 0 ) statement was always false. Seems that this was an if( error = nfs4... ) statement some time ago, which got broken during cleanup. Signed-off-by: Eric Sesterhenn Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 7391f4aabed..63818a51c05 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -790,7 +790,7 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl) continue; error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who) == -1; + ace->access_mask, ace->whotype, ace->who); if (error < 0) goto out; -- cgit v1.2.3 From b905b7b0a054d2ab3e0c9304def998546c93f6b5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:25 -0700 Subject: [PATCH] knfsd: nfsd4: better nfs4acl errors We're returning -1 in a few places in the NFSv4<->POSIX acl translation code where we could return a reasonable error. Also allows some minor simplification elsewhere. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4acl.c | 6 +++--- fs/nfsd/nfs4xdr.c | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 63818a51c05..edb107e61b9 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -710,9 +710,9 @@ calculate_posix_ace_count(struct nfs4_acl *n4acl) /* Also, the remaining entries are for named users and * groups, and come in threes (mask, allow, deny): */ if (n4acl->naces < 7) - return -1; + return -EINVAL; if ((n4acl->naces - 7) % 3) - return -1; + return -EINVAL; return 4 + (n4acl->naces - 7)/3; } } @@ -866,7 +866,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, struct nfs4_ace *ace; if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL) - return -1; + return -ENOMEM; ace->type = type; ace->flag = flag; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 03857fd8112..845f25251d8 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -299,11 +299,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia buf, dummy32, &ace.who); if (status) goto out_nfserr; - if (nfs4_acl_add_ace(*acl, ace.type, ace.flag, - ace.access_mask, ace.whotype, ace.who) != 0) { - status = -ENOMEM; + status = nfs4_acl_add_ace(*acl, ace.type, ace.flag, + ace.access_mask, ace.whotype, ace.who); + if (status) goto out_nfserr; - } } } else *acl = NULL; -- cgit v1.2.3 From b5872b0dcc0501035d5ae53c60f8cbbb3798da8a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:26 -0700 Subject: [PATCH] knfsd: nfsd4: fix acl xattr length return We should be using the length from the second vfs_getxattr, in case it changed. (Note: there's still a small race here; we could end up returning -ENOMEM if the length increased between the first and second call. I don't know whether it's worth spending a lot of effort to fix that.) This makes XFS ACLs usable on NFS exports, which they currently aren't, since XFS appears to be returning a too-large value for vfs_getxattr() when it's passed a NULL buffer. So there's probably an XFS bug here too, though since getxattr with a NULL buffer is usually used to decide how much memory to allocate, it may be a fairly harmless bug in most cases. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/vfs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 31018333dc3..6aa92d0e687 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -371,7 +371,6 @@ out_nfserr: static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) { ssize_t buflen; - int error; buflen = vfs_getxattr(dentry, key, NULL, 0); if (buflen <= 0) @@ -381,10 +380,7 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) if (!*buf) return -ENOMEM; - error = vfs_getxattr(dentry, key, *buf, buflen); - if (error < 0) - return error; - return buflen; + return vfs_getxattr(dentry, key, *buf, buflen); } #endif -- cgit v1.2.3 From cd15654963cf7e4dd938a403de3ec5bcd09f8350 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:27 -0700 Subject: [PATCH] knfsd: nfsd: oops exporting nonexistent directory Export a directory that does not exist: exportfs -orw,fsid=0,insecure,no_subtree_check client:/home/NFS4 Try to mount from client with nfs4. 
Mount hangs (I'm not sure why - that's another issue). While client is hung, back on server mkdir /home/NFS4. The server panics in dput. I traced the problem back to svc_export_parse() calling path_release() even though path_lookup() failed (it happens to fill in the nameidata structure with a negative dentry - so the test after out: succeeds). After patching and recreating the problem, the client mount still takes some time before finally exiting with a message "couldn't read superblock". Here is a simple patch to resolve this issue: Signed-off-by: Frank Filz Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c340be0a3f5..4e0578121d9 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -422,7 +422,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; err = path_lookup(buf, 0, &nd); - if (err) goto out; + if (err) goto out_no_path; exp.h.flags = 0; exp.ex_client = dom; @@ -475,6 +475,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) out: if (nd.dentry) path_release(&nd); + out_no_path: if (dom) auth_domain_put(dom); kfree(buf); -- cgit v1.2.3 From 54cceebb679a8d10fa382422aa2035cdc65fe7ce Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:30 -0700 Subject: [PATCH] knfsd: nfsd: nfsd_setuser doesn't really need to modify rqstp->rq_cred. In addition to setting the process's filesystem ids, nfsd_setuser also modifies the value of the rq_cred which stores the ids that originally came from the rpc call, for example to reflect root squashing. There's no real reason to do that--the only case where rqstp->rq_cred is actually used later on is in the NFSv4 SETCLIENTID/SETCLIENTID_CONFIRM operations, and there the results are the opposite of what we want--those two operations don't deal with the filesystem at all, they only record the credentials used with the rpc call for later reference (so that we may require the same credentials be used on later operations), and the credentials shouldn't vary just because there was or wasn't a previous operation in the compound that referred to some export. This fixes a bug which caused mounts from Solaris clients to fail. Signed-off-by: Andy Adamson Signed-off-by: J.
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/auth.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index cfe9ce88161..6e92b0fe532 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -14,46 +14,46 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { - struct svc_cred *cred = &rqstp->rq_cred; + struct svc_cred cred = rqstp->rq_cred; int i; int ret; if (exp->ex_flags & NFSEXP_ALLSQUASH) { - cred->cr_uid = exp->ex_anon_uid; - cred->cr_gid = exp->ex_anon_gid; - put_group_info(cred->cr_group_info); - cred->cr_group_info = groups_alloc(0); + cred.cr_uid = exp->ex_anon_uid; + cred.cr_gid = exp->ex_anon_gid; + cred.cr_group_info = groups_alloc(0); } else if (exp->ex_flags & NFSEXP_ROOTSQUASH) { struct group_info *gi; - if (!cred->cr_uid) - cred->cr_uid = exp->ex_anon_uid; - if (!cred->cr_gid) - cred->cr_gid = exp->ex_anon_gid; - gi = groups_alloc(cred->cr_group_info->ngroups); + if (!cred.cr_uid) + cred.cr_uid = exp->ex_anon_uid; + if (!cred.cr_gid) + cred.cr_gid = exp->ex_anon_gid; + gi = groups_alloc(cred.cr_group_info->ngroups); if (gi) - for (i = 0; i < cred->cr_group_info->ngroups; i++) { - if (!GROUP_AT(cred->cr_group_info, i)) + for (i = 0; i < cred.cr_group_info->ngroups; i++) { + if (!GROUP_AT(cred.cr_group_info, i)) GROUP_AT(gi, i) = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(cred->cr_group_info, i); + GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i); } - put_group_info(cred->cr_group_info); - cred->cr_group_info = gi; - } + cred.cr_group_info = gi; + } else + get_group_info(cred.cr_group_info); - if (cred->cr_uid != (uid_t) -1) - current->fsuid = cred->cr_uid; + if (cred.cr_uid != (uid_t) -1) + current->fsuid = cred.cr_uid; else current->fsuid = exp->ex_anon_uid; - if (cred->cr_gid != (gid_t) -1) - current->fsgid = cred->cr_gid; + if (cred.cr_gid != (gid_t) -1) + current->fsgid = cred.cr_gid; else current->fsgid = exp->ex_anon_gid; - if (!cred->cr_group_info) + if (!cred.cr_group_info) return -ENOMEM; - ret = set_current_groups(cred->cr_group_info); - if ((cred->cr_uid)) { + ret = set_current_groups(cred.cr_group_info); + put_group_info(cred.cr_group_info); + if ((cred.cr_uid)) { cap_t(current->cap_effective) &= ~CAP_NFSD_MASK; } else { cap_t(current->cap_effective) |= (CAP_NFSD_MASK & -- cgit v1.2.3 From f0e2993e9e73e8f38b05a89c98b9db94fec2199d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:31 -0700 Subject: [PATCH] knfsd: nfsd4: remove nfsd_setuser from putrootfh Since nfsd_setuser() is already called from any operation that uses the current filehandle (because it's called from fh_verify), there's no reason to call it from putrootfh. Signed-off-by: Andy Adamson Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4proc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ca8a4c410de..b0e095ea0c0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -288,8 +288,6 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh) fh_put(current_fh); status = exp_pseudoroot(rqstp->rq_client, current_fh, &rqstp->rq_chandle); - if (!status) - status = nfserrno(nfsd_setuser(rqstp, current_fh->fh_export)); return status; } -- cgit v1.2.3 From 6ed6decccf544970664757464cfb67e081775e6a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:32 -0700 Subject: [PATCH] knfsd: nfsd4: fix corruption of returned data when using 64k pages In v4 we grab an extra page just for the padding of returned data. The formula that the rpc server uses to allocate pages for the response doesn't take into account this extra page. Instead of adjusting those formulae, we adopt the same solution as v2 and v3, and put the "tail" data in the same page as the "head" data. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4xdr.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 845f25251d8..f2710cfc0bc 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2084,27 +2084,20 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read WRITE32(eof); WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; - + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; resp->xbuf->page_len = maxcount; - /* read zero bytes -> don't set up tail */ - if(!maxcount) - return 0; - - /* set up page for remaining responses */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2141,21 +2134,20 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - - resp->xbuf->page_len = maxcount; if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += 
maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } -- cgit v1.2.3 From bb6e8a9f4005237401a45f1ea43db060ea5f9725 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:33 -0700 Subject: [PATCH] knfsd: nfsd4: fix corruption on readdir encoding with 64k pages Fix corruption on readdir encoding with 64k pages. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4xdr.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f2710cfc0bc..de3998f15f1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2157,7 +2157,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re { int maxcount; loff_t offset; - u32 *page, *savep; + u32 *page, *savep, *tailbase; ENCODE_HEAD; if (nfserr) @@ -2173,6 +2173,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re WRITE32(0); ADJUST_ARGS(); resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + tailbase = p; maxcount = PAGE_SIZE; if (maxcount > readdir->rd_maxcount) @@ -2217,14 +2218,12 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re *p++ = htonl(readdir->common.err == nfserr_eof); resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - /* allocate a page for the tail */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = tailbase; resp->xbuf->tail[0].iov_len = 0; resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; + resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; return 0; err_no_verf: -- cgit v1.2.3 From 5e8d5c29482dc56de5971ddc99c6e7f69e4d16f6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:37 -0700 Subject: [PATCH] knfsd: nfsd4: fix laundromat shutdown race We need to make sure the laundromat work doesn't reschedule itself just when we try to cancel it. Also, we shouldn't be waiting for it to finish running while holding the state lock, as that's a potential deadlock. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a8c2122a481..01ff544dc1f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3238,8 +3238,6 @@ __nfs4_state_shutdown(void) } cancel_delayed_work(&laundromat_work); - flush_workqueue(laundry_wq); - destroy_workqueue(laundry_wq); nfsd4_shutdown_recdir(); nfs4_init = 0; } @@ -3247,6 +3245,8 @@ __nfs4_state_shutdown(void) void nfs4_state_shutdown(void) { + cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); + destroy_workqueue(laundry_wq); nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); -- cgit v1.2.3 From 541e0e09814594e907e18fb8d9fc9b746aa4b18a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:38 -0700 Subject: [PATCH] knfsd: nfsd4: nfsd4_probe_callback cleanup Some obvious cleanup. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4callback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c872bd07fc1..dbaf3f93f32 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -441,8 +441,9 @@ nfsd4_probe_callback(struct nfs4_client *clp) goto out_clnt; } - /* the task holds a reference to the nfs4_client struct */ cb->cb_client = clnt; + + /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); msg.rpc_cred = nfsd4_lookupcred(clp,0); @@ -460,13 +461,12 @@ nfsd4_probe_callback(struct nfs4_client *clp) out_rpciod: atomic_dec(&clp->cl_count); rpciod_down(); + cb->cb_client = NULL; out_clnt: rpc_shutdown_client(clnt); - goto out_err; out_err: dprintk("NFSD: warning: no callback path to client %.*s\n", (int)clp->cl_name.len, clp->cl_name.data); - cb->cb_client = NULL; } static void -- cgit v1.2.3 From 4e2fd495b520b51e4ba83340f13520b7f07e3743 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:39 -0700 Subject: [PATCH] knfsd: nfsd4: add missing rpciod_down() We should be shutting down rpciod for the callback channel when we shut down the server. Also note that we do rpciod_up() and create the callback client *before* setting cb_set--the cb_set only determines whether the initial null was succesful. So cb_set is not a reliable determiner of whether we need to clean up, only cb_client is. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 01ff544dc1f..e97c58aafde 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -329,23 +329,30 @@ put_nfs4_client(struct nfs4_client *clp) free_client(clp); } +static void +shutdown_callback_client(struct nfs4_client *clp) +{ + struct rpc_clnt *clnt = clp->cl_callback.cb_client; + + /* shutdown rpc client, ending any outstanding recall rpcs */ + if (clnt) { + clp->cl_callback.cb_client = NULL; + rpc_shutdown_client(clnt); + rpciod_down(); + } +} + static void expire_client(struct nfs4_client *clp) { struct nfs4_stateowner *sop; struct nfs4_delegation *dp; - struct nfs4_callback *cb = &clp->cl_callback; - struct rpc_clnt *clnt = clp->cl_callback.cb_client; struct list_head reaplist; dprintk("NFSD: expire_client cl_count %d\n", atomic_read(&clp->cl_count)); - /* shutdown rpc client, ending any outstanding recall rpcs */ - if (atomic_read(&cb->cb_set) == 1 && clnt) { - rpc_shutdown_client(clnt); - clnt = clp->cl_callback.cb_client = NULL; - } + shutdown_callback_client(clp); INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); -- cgit v1.2.3 From ef0f3390ebedac78bff1936bbb26606bca83e891 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:41 -0700 Subject: [PATCH] knfsd: nfsd4: limit number of delegations handed out. It's very easy for the server to DOS itself by just giving out too many delegations. For now we just solve the problem with a dumb hard limit. Eventually we'll want a smarter policy. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 74 +++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e97c58aafde..1e2a89aaf89 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -147,6 +147,42 @@ get_nfs4_file(struct nfs4_file *fi) kref_get(&fi->fi_ref); } +static int num_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for nfs4_stateowner */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +#define ownerid_hashval(id) \ + ((id) & OWNER_HASH_MASK) +#define ownerstr_hashval(clientid, ownername) \ + (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) + +static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) +#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) +/* hash table for (open)nfs4_stateid */ +#define STATEID_HASH_BITS 10 +#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) +#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) + +#define file_hashval(x) \ + hash_ptr(x, FILE_HASH_BITS) +#define stateid_hashval(owner_id, file_id) \ + (((owner_id) + (file_id)) & STATEID_HASH_MASK) + +static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -155,9 +191,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; dprintk("NFSD alloc_init_deleg\n"); + if (num_delegations > STATEID_HASH_SIZE * 4) + return NULL; dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; + num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); @@ -192,6 +231,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp) dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); kmem_cache_free(deleg_slab, dp); + num_delegations--; } } @@ -943,40 +983,6 @@ out: return status; } -/* - * Open owner state (share locks) - */ - -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) - -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) - -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; - -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) -/* hash table for (open)nfs4_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) - -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) - -static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct 
list_head stateid_hashtbl[STATEID_HASH_SIZE]; - /* OPEN Share state helper functions */ static inline struct nfs4_file * alloc_init_file(struct inode *ino) -- cgit v1.2.3 From 358dd55aa3a77fbbae482b83d96733d9ad441d05 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:42 -0700 Subject: [PATCH] knfsd: nfsd4: grant delegations more frequently Keep unused openowners around for at least one lease period, to avoid the need for as many open confirmations and to allow handing out more delegations. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1e2a89aaf89..96c7578cbe1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1199,8 +1199,7 @@ move_to_close_lru(struct nfs4_stateowner *sop) { dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); - unhash_stateowner(sop); - list_add_tail(&sop->so_close_lru, &close_lru); + list_move_tail(&sop->so_close_lru, &close_lru); sop->so_time = get_seconds(); } @@ -1929,8 +1928,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", sop->so_id); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + release_stateowner(sop); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -3218,15 +3216,8 @@ __nfs4_state_shutdown(void) int i; struct nfs4_client *clp = NULL; struct nfs4_delegation *dp = NULL; - struct nfs4_stateowner *sop = NULL; struct list_head *pos, *next, reaplist; - list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); - } - for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&conf_id_hashtbl[i])) { clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); -- cgit v1.2.3 From cbb7e577e732f576b9f399bc2600bdc0626c68dc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 14:57:50 +0200 Subject: [PATCH] splice: pass offset around for ->splice_read() and ->splice_write() We need not use ->f_pos as the offset for the file input/output. If the user passed an offset pointer in through sys_splice(), just use that and leave ->f_pos alone. 
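The pattern is the same one userspace programs use when choosing between pread() and read(): an explicit offset is consumed from a local copy and the file position is left alone, otherwise the implicit position is used and advanced. The sketch below is a hypothetical helper illustrating that choice, not kernel code.

/* Userspace analogue of the do_splice() offset handling: an explicit offset
 * is used via a local copy (file position untouched), otherwise the implicit
 * file offset is used and advanced.  read_at() and the use of /etc/hosts are
 * illustrative assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static ssize_t read_at(int fd, void *buf, size_t len, const off_t *off_in)
{
	if (off_in)
		return pread(fd, buf, len, *off_in);	/* like a caller-supplied offset */
	return read(fd, buf, len);			/* like falling back to &file->f_pos */
}

int main(void)
{
	char buf[32];
	off_t off = 0;
	int fd = open("/etc/hosts", O_RDONLY);

	if (fd < 0)
		return 1;
	printf("explicit offset: %zd bytes\n", read_at(fd, buf, sizeof(buf), &off));
	printf("implicit f_pos:  %zd bytes\n", read_at(fd, buf, sizeof(buf), NULL));
	close(fd);
	return 0;
}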
Signed-off-by: Jens Axboe --- fs/splice.c | 86 ++++++++++++++++++++++---------------------- fs/xfs/linux-2.6/xfs_file.c | 12 ++++--- fs/xfs/linux-2.6/xfs_lrw.c | 14 ++++---- fs/xfs/linux-2.6/xfs_lrw.h | 4 +-- fs/xfs/linux-2.6/xfs_vnode.h | 12 +++---- 5 files changed, 68 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index e50a460239d..5d3eda64703 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -231,8 +231,9 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, } static int -__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; @@ -241,8 +242,8 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, pgoff_t index; int i, error; - index = in->f_pos >> PAGE_CACHE_SHIFT; - offset = in->f_pos & ~PAGE_CACHE_MASK; + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) @@ -348,8 +349,9 @@ fill_it: * * Will read pages from given file and fill them into a pipe. */ -ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t spliced; int ret; @@ -358,12 +360,12 @@ ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, spliced = 0; while (len) { - ret = __generic_file_splice_read(in, pipe, len, flags); + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); if (ret <= 0) break; - in->f_pos += ret; + *ppos += ret; len -= ret; spliced += ret; @@ -561,7 +563,7 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. */ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags, + loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { int ret, do_wakeup, err; @@ -573,7 +575,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, sd.total_len = len; sd.flags = flags; sd.file = out; - sd.pos = out->f_pos; + sd.pos = *ppos; if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); @@ -656,9 +658,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - out->f_pos = sd.pos; return ret; - } /** @@ -674,12 +674,12 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, */ ssize_t generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; ssize_t ret; - ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); + ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); /* * If file or inode is SYNC and we actually wrote some data, sync it. 
@@ -715,9 +715,9 @@ EXPORT_SYMBOL(generic_file_splice_write); * */ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage); + return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -726,9 +726,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { - loff_t pos; int ret; if (unlikely(!out->f_op || !out->f_op->splice_write)) @@ -737,22 +736,21 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; - pos = out->f_pos; - - ret = rw_verify_area(WRITE, out, &pos, len); + ret = rw_verify_area(WRITE, out, ppos, len); if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, len, flags); + return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { - loff_t pos, isize, left; + loff_t isize, left; int ret; if (unlikely(!in->f_op || !in->f_op->splice_read)) @@ -761,28 +759,27 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; - pos = in->f_pos; - - ret = rw_verify_area(READ, in, &pos, len); + ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; isize = i_size_read(in->f_mapping->host); - if (unlikely(in->f_pos >= isize)) + if (unlikely(*ppos >= isize)) return 0; - left = isize - in->f_pos; + left = isize - *ppos; if (unlikely(left < len)) len = left; - return in->f_op->splice_read(in, pipe, len, flags); + return in->f_op->splice_read(in, ppos, pipe, len, flags); } -long do_splice_direct(struct file *in, struct file *out, size_t len, - unsigned int flags) +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) { struct pipe_inode_info *pipe; long ret, bytes; + loff_t out_off; umode_t i_mode; int i; @@ -820,6 +817,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, */ ret = 0; bytes = 0; + out_off = 0; while (len) { size_t read_len, max_read_len; @@ -829,7 +827,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, */ max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); - ret = do_splice_to(in, pipe, max_read_len, flags); + ret = do_splice_to(in, ppos, pipe, max_read_len, flags); if (unlikely(ret < 0)) goto out_release; @@ -840,7 +838,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ - ret = do_splice_from(pipe, out, read_len, + ret = do_splice_from(pipe, out, &out_off, read_len, flags & ~SPLICE_F_NONBLOCK); if (unlikely(ret < 0)) goto out_release; @@ -898,6 +896,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, size_t len, unsigned int flags) { struct pipe_inode_info *pipe; + loff_t offset, *off; pipe = in->f_dentry->d_inode->i_pipe; if (pipe) 
{ @@ -906,12 +905,13 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_out) { if (out->f_op->llseek == no_llseek) return -EINVAL; - if (copy_from_user(&out->f_pos, off_out, - sizeof(loff_t))) + if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - } + off = &offset; + } else + off = &out->f_pos; - return do_splice_from(pipe, out, len, flags); + return do_splice_from(pipe, out, off, len, flags); } pipe = out->f_dentry->d_inode->i_pipe; @@ -921,11 +921,13 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_in) { if (in->f_op->llseek == no_llseek) return -EINVAL; - if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - } + off = &offset; + } else + off = &in->f_pos; - return do_splice_to(in, pipe, len, flags); + return do_splice_to(in, off, pipe, len, flags); } return -EINVAL; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 269721af02f..c847416f6d1 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -252,6 +252,7 @@ xfs_file_sendfile_invis( STATIC ssize_t xfs_file_splice_read( struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -259,13 +260,14 @@ xfs_file_splice_read( vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_READ(vp, infilp, pipe, len, flags, 0, NULL, rval); + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval); return rval; } STATIC ssize_t xfs_file_splice_read_invis( struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -273,7 +275,7 @@ xfs_file_splice_read_invis( vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_READ(vp, infilp, pipe, len, flags, IO_INVIS, NULL, rval); + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval); return rval; } @@ -281,13 +283,14 @@ STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, 0, NULL, rval); + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval); return rval; } @@ -295,13 +298,14 @@ STATIC ssize_t xfs_file_splice_write_invis( struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, IO_INVIS, NULL, rval); + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval); return rval; } diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 74a52937f20..67efe330898 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -338,6 +338,7 @@ ssize_t xfs_splice_read( bhv_desc_t *bdp, struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t count, int flags, @@ -360,7 +361,7 @@ xfs_splice_read( int error; error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), - infilp->f_pos, count, + *ppos, count, FILP_DELAY_FLAG(infilp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -368,8 +369,8 @@ xfs_splice_read( } } xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore, - pipe, count, infilp->f_pos, ioflags); - ret = generic_file_splice_read(infilp, pipe, count, flags); + pipe, count, *ppos, ioflags); + 
ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -382,6 +383,7 @@ xfs_splice_write( bhv_desc_t *bdp, struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t count, int flags, int ioflags, @@ -403,7 +405,7 @@ xfs_splice_write( int error; error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp), - outfilp->f_pos, count, + *ppos, count, FILP_DELAY_FLAG(outfilp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -411,8 +413,8 @@ xfs_splice_write( } } xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, - pipe, count, outfilp->f_pos, ioflags); - ret = generic_file_splice_write(pipe, outfilp, count, flags); + pipe, count, *ppos, ioflags); + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); if (ret > 0) XFS_STATS_ADD(xs_write_bytes, ret); diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index 55c689a86ad..8f453995235 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -93,11 +93,11 @@ extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); -extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, +extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, struct pipe_inode_info *, size_t, int, int, struct cred *); extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, - struct file *, size_t, int, int, + struct file *, loff_t *, size_t, int, int, struct cred *); #endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 88b09f18628..2a8e16c2235 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -173,11 +173,11 @@ typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); -typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, +typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, struct pipe_inode_info *, size_t, int, int, struct cred *); typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, - struct file *, size_t, int, int, + struct file *, loff_t *, size_t, int, int, struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, int, unsigned int, void __user *); @@ -284,10 +284,10 @@ typedef struct vnodeops { rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) #define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) -#define VOP_SPLICE_READ(vp,f,pipe,cnt,fl,iofl,cr,rv) \ - rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) -#define VOP_SPLICE_WRITE(vp,f,pipe,cnt,fl,iofl,cr,rv) \ - rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) #define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) #define VOP_OPEN(vp, cr, rv) \ -- cgit v1.2.3 From 70524490ee2ea1bbf6cee6c106597b3ac25a3fc2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 15:51:17 +0200 
Subject: [PATCH] splice: add support for sys_tee() Basically an in-kernel implementation of tee, which uses splice and the pipe buffers as an intelligent way to pass data around by reference. Where the user space tee consumes the input and produces a stdout and file output, this syscall merely duplicates the data inside a pipe to another pipe. No data is copied, the output just grabs a reference to the input pipe data. Signed-off-by: Jens Axboe --- fs/pipe.c | 7 +++ fs/splice.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index e984beb93a0..7fefb10db8d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -131,12 +131,19 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, return 0; } +static void anon_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + static struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = anon_pipe_buf_map, .unmap = anon_pipe_buf_unmap, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, + .get = anon_pipe_buf_get, }; static ssize_t diff --git a/fs/splice.c b/fs/splice.c index 5d3eda64703..8d57e89924a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -125,12 +125,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, kunmap(buf->page); } +static void page_cache_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + static struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = page_cache_pipe_buf_map, .unmap = page_cache_pipe_buf_unmap, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, + .get = page_cache_pipe_buf_get, }; /* @@ -963,3 +970,182 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, return error; } + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, do_wakeup = 0, i; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by inode address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + if (ipipe->inode < opipe->inode) { + mutex_lock(&ipipe->inode->i_mutex); + mutex_lock(&opipe->inode->i_mutex); + } else { + mutex_lock(&opipe->inode->i_mutex); + mutex_lock(&ipipe->inode->i_mutex); + } + + for (i = 0;; i++) { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + if (ipipe->nrbufs - i) { + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + + /* + * If we have room, fill this buffer + */ + if (opipe->nrbufs < PIPE_BUFFERS) { + int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + do_wakeup = 1; + ret += obuf->len; + len -= obuf->len; + + if (!len) + break; + if (opipe->nrbufs < PIPE_BUFFERS) + continue; + } + + /* + * We have input available, but no output room. + * If we already copied data, return that. 
+ */ + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + opipe->waiting_writers++; + pipe_wait(opipe); + opipe->waiting_writers--; + continue; + } + + /* + * No input buffers, do the usual checks for available + * writers and blocking and wait if necessary + */ + if (!ipipe->writers) + break; + if (!ipipe->waiting_writers) { + if (ret) + break; + } + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (waitqueue_active(&ipipe->wait)) + wake_up_interruptible_sync(&ipipe->wait); + kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + + pipe_wait(ipipe); + } + + mutex_unlock(&ipipe->inode->i_mutex); + mutex_unlock(&opipe->inode->i_mutex); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + + /* + * Link ipipe to the two output pipes, consuming as we go along. + */ + if (ipipe && opipe) + return link_pipe(ipipe, opipe, len, flags); + + return -EINVAL; +} + +asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +{ + struct file *in; + int error, fput_in; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fdin, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + int fput_out; + struct file *out = fget_light(fdout, &fput_out); + + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_tee(in, out, len, flags); + fput_light(out, fput_out); + } + } + fput_light(in, fput_in); + } + + return error; +} -- cgit v1.2.3 From 73ce8355c243a434524a34c05cc417dd0467996e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:14:26 +0200 Subject: [fuse] fix deadlock between fuse_put_super() and request_end() A deadlock was possible, when the last reference to the superblock was held due to a background request containing a file reference. Releasing the file would release the vfsmount which in turn would release the superblock. Since sbput_sem is held during the fput() and fuse_put_super() tries to acquire this same semaphore, a deadlock results. The chosen soltuion is to get rid of sbput_sem, and instead use the spinlock to ensure the referenced inodes/file are released only once. Since the actual release may sleep, defer these outside the locked region, but using local variables instead of the structure members. This is a much more rubust solution. 
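The ordering described above, detach anything that needs a sleeping release while still holding the spinlock, then call fput()/iput() only after unlocking, is the heart of the fix. A minimal sketch of that pattern follows; it is illustrative only, not the fuse code itself, and "struct req" with its fields is a hypothetical stand-in:

#include <linux/fs.h>
#include <linux/spinlock.h>

/* Illustrative only: 'struct req' and its fields are hypothetical stand-ins
 * for a structure whose file/inode references must be dropped exactly once. */
struct req {
	spinlock_t lock;
	struct file *file;
	struct inode *inode;
};

static void req_finish(struct req *r)
{
	struct file *file;
	struct inode *inode;

	spin_lock(&r->lock);
	/* Detach under the lock so no concurrent path can release twice. */
	file = r->file;
	inode = r->inode;
	r->file = NULL;
	r->inode = NULL;
	spin_unlock(&r->lock);

	/* The releases may sleep, so they happen outside the spinlock. */
	if (file)
		fput(file);
	iput(inode);
}
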
Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 28 ++++++++++++++++------------ fs/fuse/fuse_i.h | 12 +++--------- fs/fuse/inode.c | 27 +++++++++++++++++---------- 3 files changed, 36 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6c740f86066..d4efb6223e2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -120,20 +120,14 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) } } -void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) +void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req) { - iput(req->inode); - iput(req->inode2); - if (req->file) - fput(req->file); - spin_lock(&fc->lock); - list_del(&req->bg_entry); + list_del_init(&req->bg_entry); if (fc->num_background == FUSE_MAX_BACKGROUND) { fc->blocked = 0; wake_up_all(&fc->blocked_waitq); } fc->num_background--; - spin_unlock(&fc->lock); } /* @@ -163,17 +157,27 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); fuse_put_request(fc, req); } else { + struct inode *inode = req->inode; + struct inode *inode2 = req->inode2; + struct file *file = req->file; void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; + req->inode = NULL; + req->inode2 = NULL; + req->file = NULL; + if (!list_empty(&req->bg_entry)) + fuse_remove_background(fc, req); spin_unlock(&fc->lock); - down_read(&fc->sbput_sem); - if (fc->mounted) - fuse_release_background(fc, req); - up_read(&fc->sbput_sem); + if (end) end(fc, req); else fuse_put_request(fc, req); + + if (file) + fput(file); + iput(inode); + iput(inode2); } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 19c7185a754..ee9b8304251 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -255,15 +255,9 @@ struct fuse_conn { /** waitq for blocked connection */ wait_queue_head_t blocked_waitq; - /** RW semaphore for exclusion with fuse_put_super() */ - struct rw_semaphore sbput_sem; - /** The next unique request id */ u64 reqctr; - /** Mount is active */ - unsigned mounted; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -474,11 +468,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** - * Release inodes and file associated with background request + * Remove request from the the background list */ -void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); +void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req); -/* Abort all requests */ +/** Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); /** diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd34037b058..43a6fc0db8a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -204,17 +204,26 @@ static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); - down_write(&fc->sbput_sem); - while (!list_empty(&fc->background)) - fuse_release_background(fc, - list_entry(fc->background.next, - struct fuse_req, bg_entry)); - spin_lock(&fc->lock); - fc->mounted = 0; fc->connected = 0; + while (!list_empty(&fc->background)) { + struct fuse_req *req = list_entry(fc->background.next, + struct fuse_req, bg_entry); + struct inode *inode = req->inode; + struct inode *inode2 = req->inode2; + + /* File would hold a reference to vfsmount */ + BUG_ON(req->file); + req->inode = NULL; + req->inode2 = NULL; + fuse_remove_background(fc, req); + + 
spin_unlock(&fc->lock); + iput(inode); + iput(inode2); + spin_lock(&fc->lock); + } spin_unlock(&fc->lock); - up_write(&fc->sbput_sem); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); @@ -386,7 +395,6 @@ static struct fuse_conn *new_conn(void) INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); INIT_LIST_HEAD(&fc->background); - init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); @@ -541,7 +549,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_free_req; sb->s_root = root_dentry; - fc->mounted = 1; fc->connected = 1; kobject_get(&fc->kobj); file->private_data = fc; -- cgit v1.2.3 From 9bc5dddad1294955e70eeb87325ba1505fb5fe2e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:16:09 +0200 Subject: [fuse] Fix accounting the number of waiting requests Properly accounting the number of waiting requests was forgotten in "clean up request accounting" patch. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 25 +++++++++++++++++++------ fs/fuse/fuse_i.h | 3 +++ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d4efb6223e2..8538b298a6b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -92,30 +92,39 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) { struct fuse_req *req; sigset_t oldset; + int intr; int err; + atomic_inc(&fc->num_waiting); block_sigs(&oldset); - err = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); + intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); restore_sigs(&oldset); - if (err) - return ERR_PTR(-EINTR); + err = -EINTR; + if (intr) + goto out; req = fuse_request_alloc(); + err = -ENOMEM; if (!req) - return ERR_PTR(-ENOMEM); + goto out; - atomic_inc(&fc->num_waiting); fuse_request_init(req); req->in.h.uid = current->fsuid; req->in.h.gid = current->fsgid; req->in.h.pid = current->pid; + req->waiting = 1; return req; + + out: + atomic_dec(&fc->num_waiting); + return ERR_PTR(err); } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - atomic_dec(&fc->num_waiting); + if (req->waiting) + atomic_dec(&fc->num_waiting); fuse_request_free(req); } } @@ -281,6 +290,10 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; + if (!req->waiting) { + req->waiting = 1; + atomic_inc(&fc->num_waiting); + } wake_up(&fc->waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ee9b8304251..59661c481d9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -159,6 +159,9 @@ struct fuse_req { /** Data is being copied to/from the request */ unsigned locked:1; + /** Request is counted as "waiting" */ + unsigned waiting:1; + /** State of the request */ enum fuse_req_state state; -- cgit v1.2.3 From 4858cae4f0904681eab58a16891c22397618a2a2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:16:38 +0200 Subject: [fuse] Don't init request twice Request is already initialized in fuse_request_alloc() so no need to do it again in fuse_get_req(). 
Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8538b298a6b..cc750c68fe7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -108,7 +108,6 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) if (!req) goto out; - fuse_request_init(req); req->in.h.uid = current->fsuid; req->in.h.gid = current->fsgid; req->in.h.pid = current->pid; -- cgit v1.2.3 From 56cf34ff0795692327234963dcdcc2cdeec2bb3d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:16:51 +0200 Subject: [fuse] Direct I/O should not use fuse_reset_request It's cleaner to allocate a new request, otherwise the uid/gid/pid fields of the request won't be filled in. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e4f041a11bb..fc342cf7c2c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -565,8 +565,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, buf += nres; if (nres != nbytes) break; - if (count) - fuse_reset_request(req); + if (count) { + fuse_put_request(fc, req); + req = fuse_get_req(fc); + if (IS_ERR(req)) + break; + } } fuse_put_request(fc, req); if (res > 0) { -- cgit v1.2.3 From c06511d12d720b23c8dffff23004f0a888698f20 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 14 Apr 2006 04:05:55 -0600 Subject: [PATCH] de_thread: Don't change our parents and ptrace flags. This is two distinct changes. - Not changing our real parents. - Not changing our ptrace parents. Not changing our real parents is trivially correct because both tasks have the same real parents as they are part of a thread group. Now that we demote the leader to a thread there is no longer any reason to change it's parentage. Not changing our ptrace parents is a user visible change if someone looks hard enough. I don't think user space applications will care or even notice. In the practical and I think common case a debugger will have attached to all of the threads using the same ptrace flags. From my quick skim of strace and gdb that appears to be the case. Which if true means debuggers will not notice a change. Before this point we have already generated a ptrace event in do_exit that reports the leaders pid has died so de_thread is visible to a debugger. Which means attempting to hide this case by copying flags around appears excessive. By not doing anything it avoids all of the weird locking issues between de_thread and ptrace attach, and removes one case from consideration for fixing the ptrace locking. This only addresses Oleg's first concern with ptrace_attach, that of the problems caused by reparenting. Oleg's second concern is essentially a race between ptrace_attach and release_task that causes an oops when we get to force_sig_specific. There is nothing special about de_thread with respect to that race. Signed-off-by: Eric W. 
Biederman Signed-off-by: Linus Torvalds --- fs/exec.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 3234a0c32d5..4121bb55973 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -665,9 +665,7 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct task_struct *parent; struct dentry *proc_dentry1, *proc_dentry2; - unsigned long ptrace; /* * Wait for the thread group leader to be a zombie. @@ -704,22 +702,6 @@ static int de_thread(struct task_struct *tsk) * two threads with a switched PID, and release * the former thread group leader: */ - ptrace = leader->ptrace; - parent = leader->parent; - if (unlikely(ptrace) && unlikely(parent == current)) { - /* - * Joker was ptracing his own group leader, - * and now he wants to be his own parent! - * We can't have that. - */ - ptrace = 0; - } - - ptrace_unlink(current); - ptrace_unlink(leader); - remove_parent(current); - remove_parent(leader); - /* Become a process group leader with the old leader's pid. * Note: The old leader also uses thispid until release_task @@ -732,8 +714,6 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_SID, current->signal->session); list_add_tail(¤t->tasks, &init_task.tasks); - current->parent = current->real_parent = leader->real_parent; - leader->parent = leader->real_parent = child_reaper; current->group_leader = current; leader->group_leader = current; @@ -742,13 +722,6 @@ static int de_thread(struct task_struct *tsk) detach_pid(leader, PIDTYPE_SID); list_del_init(&leader->tasks); - add_parent(current); - add_parent(leader); - if (ptrace) { - current->ptrace = ptrace; - __ptrace_link(current, parent); - } - current->exit_signal = SIGCHLD; BUG_ON(leader->exit_state != EXIT_ZOMBIE); -- cgit v1.2.3 From 4508a7a734b111b8b7e39986237d84acb1168dd0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 20 Mar 2006 17:53:53 +1100 Subject: [PATCH] sysfs: Allow sysfs attribute files to be pollable It works like this: Open the file Read all the contents. Call poll requesting POLLERR or POLLPRI (so select/exceptfds works) When poll returns, close the file and go to top of loop. or lseek to start of file and go back to the 'read'. Events are signaled by an object manager calling sysfs_notify(kobj, dir, attr); If the dir is non-NULL, it is used to find a subdirectory which contains the attribute (presumably created by sysfs_create_group). This has a cost of one int per attribute, one wait_queuehead per kobject, one int per open file. The name "sysfs_notify" may be confused with the inotify functionality. Maybe it would be nice to support inotify for sysfs attributes as well? 
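A rough user-space sketch of the open/read/poll cycle described above may help; the md sync_action attribute (made pollable by this patch, as noted below) is used purely as an example of a notifying attribute, and error handling is kept minimal:

/* Sketch of the open/read/poll loop for a pollable sysfs attribute. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];

	for (;;) {
		int fd = open("/sys/block/md0/md/sync_action", O_RDONLY);
		if (fd < 0)
			return 1;

		ssize_t n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("sync_action: %s", buf);
		}

		/* Wait for a change; POLLERR|POLLPRI is what sysfs_poll()
		 * reports once sysfs_notify() has been called. */
		struct pollfd pfd = { .fd = fd, .events = POLLERR | POLLPRI };
		poll(&pfd, 1, -1);

		/* Close and re-open (or lseek to 0) before reading the new value. */
		close(fd);
	}
}

Any attribute whose owner calls sysfs_notify() can be consumed this way; attributes that never notify will simply sit in poll() until a timeout, which is why the comment below recommends one when in doubt.
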
This patch also uses sysfs_notify to allow /sys/block/md*/md/sync_action to be pollable Signed-off-by: Neil Brown Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 1 + fs/sysfs/file.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/sysfs.h | 1 + 3 files changed, 78 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 6cfdc9a8777..610b5bdbe75 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, memset(sd, 0, sizeof(*sd)); atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_event, 0); INIT_LIST_HEAD(&sd->s_children); list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f1cb1ddde51..cf3786625bf 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,7 @@ struct sysfs_buffer { struct sysfs_ops * ops; struct semaphore sem; int needs_read_fill; + int event; }; @@ -72,6 +74,7 @@ struct sysfs_buffer { */ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { + struct sysfs_dirent * sd = dentry->d_fsdata; struct attribute * attr = to_attr(dentry); struct kobject * kobj = to_kobj(dentry->d_parent); struct sysfs_ops * ops = buffer->ops; @@ -83,6 +86,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer if (!buffer->page) return -ENOMEM; + buffer->event = atomic_read(&sd->s_event); count = ops->show(kobj,attr,buffer->page); buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)PAGE_SIZE); @@ -348,12 +352,84 @@ static int sysfs_release(struct inode * inode, struct file * filp) return 0; } +/* Sysfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, as simply seeking and reading + * again will not get new data, or reset the state of 'poll'. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Nether 'poll' or 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. 
+ */ +static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +{ + struct sysfs_buffer * buffer = filp->private_data; + struct kobject * kobj = to_kobj(filp->f_dentry->d_parent); + struct sysfs_dirent * sd = filp->f_dentry->d_fsdata; + int res = 0; + + poll_wait(filp, &kobj->poll, wait); + + if (buffer->event != atomic_read(&sd->s_event)) { + res = POLLERR|POLLPRI; + buffer->needs_read_fill = 1; + } + + return res; +} + + +static struct dentry *step_down(struct dentry *dir, const char * name) +{ + struct dentry * de; + + if (dir == NULL || dir->d_inode == NULL) + return NULL; + + mutex_lock(&dir->d_inode->i_mutex); + de = lookup_one_len(name, dir, strlen(name)); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + if (IS_ERR(de)) + return NULL; + if (de->d_inode == NULL) { + dput(de); + return NULL; + } + return de; +} + +void sysfs_notify(struct kobject * k, char *dir, char *attr) +{ + struct dentry *de = k->dentry; + if (de) + dget(de); + if (de && dir) + de = step_down(de, dir); + if (de && attr) + de = step_down(de, attr); + if (de) { + struct sysfs_dirent * sd = de->d_fsdata; + if (sd) + atomic_inc(&sd->s_event); + wake_up_interruptible(&k->poll); + dput(de); + } +} +EXPORT_SYMBOL_GPL(sysfs_notify); + const struct file_operations sysfs_file_operations = { .read = sysfs_read_file, .write = sysfs_write_file, .llseek = generic_file_llseek, .open = sysfs_open_file, .release = sysfs_release, + .poll = sysfs_poll, }; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 32958a7c50e..3651ffb5ec0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,6 +11,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, extern int sysfs_add_file(struct dentry *, const struct attribute *, int); extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); +extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); extern void sysfs_remove_subdir(struct dentry *); -- cgit v1.2.3 From d4d7e5dffc4844ef51fe11f497bd774c04413a00 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 24 Mar 2006 20:45:35 +0100 Subject: [PATCH] BLOCK: delay all uevents until partition table is scanned [BLOCK] delay all uevents until partition table is scanned Here we delay the annoucement of all block device events until the disk's partition table is scanned and all partition devices are already created and sysfs is populated. We have a bunch of old bugs for removable storage handling where we probe successfully for a filesystem on the raw disk, but at the same time the kernel recognizes a partition table and creates partition devices. Currently there is no sane way to tell if partitions will show up or not at the time the disk device is announced to userspace. With the delayed events we can simply skip any probe for a filesystem on the raw disk when we find already present partitions. 
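As a hedged illustration of what the delayed events make possible, a user-space prober can now check sysfs for already-created partition devices before deciding to probe the raw disk. The helper below is purely illustrative and assumes the conventional /sys/block layout, where partitions appear as subdirectories named after the disk plus a number:

/* Sketch: has the kernel already announced partitions for this disk? */
#include <dirent.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool disk_has_partitions(const char *disk)	/* e.g. "sda" */
{
	char path[256];
	snprintf(path, sizeof(path), "/sys/block/%s", disk);

	DIR *dir = opendir(path);
	if (!dir)
		return false;

	bool found = false;
	struct dirent *de;
	while ((de = readdir(dir)) != NULL) {
		/* partition devices show up as e.g. "sda1", "sda2", ... */
		if (strncmp(de->d_name, disk, strlen(disk)) == 0 &&
		    de->d_name[strlen(disk)] != '\0') {
			found = true;
			break;
		}
	}
	closedir(dir);
	return found;
}
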
Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- fs/partitions/check.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index af0cb4b9e78..f3b6af07172 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -331,7 +331,9 @@ void delete_partition(struct gendisk *disk, int part) devfs_remove("%s/part%d", disk->devfs_name, part); if (p->holder_dir) kobject_unregister(p->holder_dir); - kobject_unregister(&p->kobj); + kobject_uevent(&p->kobj, KOBJ_REMOVE); + kobject_del(&p->kobj); + kobject_put(&p->kobj); } void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) @@ -357,7 +359,10 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part); p->kobj.parent = &disk->kobj; p->kobj.ktype = &ktype_part; - kobject_register(&p->kobj); + kobject_init(&p->kobj); + kobject_add(&p->kobj); + if (!disk->part_uevent_suppress) + kobject_uevent(&p->kobj, KOBJ_ADD); partition_sysfs_add_subdir(p); disk->part[part-1] = p; } @@ -395,6 +400,8 @@ void register_disk(struct gendisk *disk) { struct block_device *bdev; char *s; + int i; + struct hd_struct *p; int err; strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN); @@ -406,13 +413,12 @@ void register_disk(struct gendisk *disk) return; disk_sysfs_symlinks(disk); disk_sysfs_add_subdirs(disk); - kobject_uevent(&disk->kobj, KOBJ_ADD); /* No minors to use for partitions */ if (disk->minors == 1) { if (disk->devfs_name[0] != '\0') devfs_add_disk(disk); - return; + goto exit; } /* always add handle for the whole disk */ @@ -420,16 +426,32 @@ void register_disk(struct gendisk *disk) /* No such device (e.g., media were just removed) */ if (!get_capacity(disk)) - return; + goto exit; bdev = bdget_disk(disk, 0); if (!bdev) - return; + goto exit; + /* scan partition table, but suppress uevents */ bdev->bd_invalidated = 1; - if (blkdev_get(bdev, FMODE_READ, 0) < 0) - return; + disk->part_uevent_suppress = 1; + err = blkdev_get(bdev, FMODE_READ, 0); + disk->part_uevent_suppress = 0; + if (err < 0) + goto exit; blkdev_put(bdev); + +exit: + /* announce disk after possible partitions are already created */ + kobject_uevent(&disk->kobj, KOBJ_ADD); + + /* announce possible partitions */ + for (i = 1; i < disk->minors; i++) { + p = disk->part[i-1]; + if (!p || !p->nr_sects) + continue; + kobject_uevent(&p->kobj, KOBJ_ADD); + } } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) -- cgit v1.2.3 From 2436f039d26a91e5404974ee0cb789b17db46168 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 10 Apr 2006 00:17:20 -0700 Subject: [PATCH] Fix block device symlink name As noted further on the this file, some block devices have a / in their name, so fix the "block:..." symlink name the same as the /sys/block name. 
Signed-off-by: Stephen Rothwell Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/partitions/check.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f3b6af07172..45ae7dd3c65 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -372,6 +372,7 @@ static char *make_block_name(struct gendisk *disk) char *name; static char *block_str = "block:"; int size; + char *s; size = strlen(block_str) + strlen(disk->disk_name) + 1; name = kmalloc(size, GFP_KERNEL); @@ -379,6 +380,10 @@ static char *make_block_name(struct gendisk *disk) return NULL; strcpy(name, block_str); strcat(name, disk->disk_name); + /* ewww... some of these buggers have / in name... */ + s = strchr(name, '/'); + if (s) + *s = '!'; return name; } -- cgit v1.2.3 From 75616cf9854b83eb83a968b1338ae0ee11c9673c Mon Sep 17 00:00:00 2001 From: "Ananiev, Leonid I" Date: Mon, 10 Apr 2006 22:54:38 -0700 Subject: [PATCH] ext3: Fix missed mutex unlock Missed unlock_super()call is added in error condition code path. Signed-off-by: Leonid Ananiev Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ext3/resize.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 14f5f6ea3e7..c5ffa852396 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -767,6 +767,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (input->group != sbi->s_groups_count) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_journal; } -- cgit v1.2.3 From 0a489cb3b6a7b277030cdbc97c2c65905db94536 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Apr 2006 13:02:48 -0700 Subject: x86: don't allow tail-calls in sys_ftruncate[64]() Gcc thinks it owns the incoming argument stack, but that's not true for "asmlinkage" functions, and it corrupts the caller-set-up argument stack when it pushes the third argument onto the stack. Which can result in %ebx getting corrupted in user space. Now, normally nobody sane would ever notice, since libc will save and restore %ebx anyway over the system call, but it's still wrong. I'd much rather have "asmlinkage" tell gcc directly that it doesn't own the stack, but no such attribute exists, so we're stuck with our hacky manual "prevent_tail_call()" macro once more (we've had the same issue before with sys_waitpid() and sys_wait4()). Thanks to Hans-Werner Hilse for reporting the issue and testing the fix. 
Signed-off-by: Linus Torvalds --- fs/open.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index c32c89d6d8d..8279c65d3be 100644 --- a/fs/open.c +++ b/fs/open.c @@ -331,7 +331,9 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { - return do_sys_ftruncate(fd, length, 1); + long ret = do_sys_ftruncate(fd, length, 1); + prevent_tail_call(ret); + return ret; } /* LFS versions of truncate are only needed on 32 bit machines */ @@ -343,7 +345,9 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { - return do_sys_ftruncate(fd, length, 0); + long ret = do_sys_ftruncate(fd, length, 0); + prevent_tail_call(ret); + return ret; } #endif -- cgit v1.2.3 From 385910f2b275a636238f70844f1b6da9fda6f2da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Apr 2006 13:22:59 -0700 Subject: x86: be careful about tailcall breakage for sys_open[at] too Came up through a quick grep for other cases similar to the ftruncate() one in commit 0a489cb3b6a7b277030cdbc97c2c65905db94536. Also, add a comment, so that people who read the code understand why we do what looks like a no-op. (Again, this won't actually matter to any sane user, since libc will save and restore the register gcc stomps on, but it's still wrong to stomp on it) Signed-off-by: Linus Torvalds --- fs/open.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 8279c65d3be..53ec28c3677 100644 --- a/fs/open.c +++ b/fs/open.c @@ -332,6 +332,7 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { long ret = do_sys_ftruncate(fd, length, 1); + /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); return ret; } @@ -346,6 +347,7 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { long ret = do_sys_ftruncate(fd, length, 0); + /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); return ret; } @@ -1097,20 +1099,30 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) asmlinkage long sys_open(const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(AT_FDCWD, filename, flags, mode); + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_open); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); + ret = do_sys_open(dfd, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_openat); -- cgit v1.2.3 From 91ad66ef4469cb631ec0ccd131b07f16770773f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:55:10 +0200 Subject: [PATCH] splice: close i_size truncate races on read We need to check i_size after doing a blocking readpage. 
Signed-off-by: Jens Axboe --- fs/splice.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 8d57e89924a..7e858557472 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -145,8 +145,8 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, - int nr_pages, unsigned long offset, - unsigned long len, unsigned int flags) + int nr_pages, unsigned long len, + unsigned int offset, unsigned int flags) { int ret, do_wakeup, i; @@ -243,14 +243,16 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int offset, nr_pages; + unsigned int loff, offset, nr_pages; struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index; + pgoff_t index, end_index; + loff_t isize; + size_t bytes; int i, error; index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + loff = offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) @@ -268,6 +270,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * Now fill in the holes: */ error = 0; + bytes = 0; for (i = 0; i < nr_pages; i++, index++) { find_page: /* @@ -336,13 +339,41 @@ readpage: goto find_page; break; } + + /* + * i_size must be checked after ->readpage(). + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + break; + } + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); + if (bytes + loff > isize) { + page_cache_release(page); + break; + } + /* + * force quit after adding this page + */ + nr_pages = i; + } } fill_it: pages[i] = page; + bytes += PAGE_CACHE_SIZE - loff; + loff = 0; } if (i) - return move_to_pipe(pipe, pages, i, offset, len, flags); + return move_to_pipe(pipe, pages, i, bytes, offset, flags); return error; } -- cgit v1.2.3 From c4f895cbe1e95aab633207fb19c650b7c984c01a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:56:12 +0200 Subject: [PATCH] splice: cleanup the SPLICE_F_NONBLOCK handling - generic_file_splice_read() more readable and correct - Don't bail on page allocation with NONBLOCK set, just don't allow direct blocking on IO (eg lock_page). 
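From user space, the visible effect of this cleanup is roughly that SPLICE_F_NONBLOCK means "kick readahead but do not wait for in-flight page I/O", surfacing as EAGAIN. A small hedged sketch follows; note that a libc of this vintage may have no splice() wrapper, in which case syscall(2) with __NR_splice is needed instead, and the helper name here is made up:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

/* Sketch: try to move up to 64k from file_fd into a pipe without blocking
 * on page I/O. Assumes a libc splice() wrapper is available. */
static ssize_t try_splice(int file_fd, loff_t *off, int pipe_wr_fd)
{
	ssize_t n = splice(file_fd, off, pipe_wr_fd, NULL, 65536,
			   SPLICE_F_NONBLOCK);
	if (n < 0 && errno == EAGAIN)
		return 0;	/* nothing resident yet; try again later */
	return n;
}
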
Signed-off-by: Jens Axboe --- fs/splice.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 7e858557472..78cd264340f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -278,14 +278,6 @@ find_page: */ page = find_get_page(mapping, index); if (!page) { - /* - * If in nonblock mode then dont block on - * readpage (we've kicked readahead so there - * will be asynchronous progress): - */ - if (flags & SPLICE_F_NONBLOCK) - break; - /* * page didn't exist, allocate one */ @@ -307,6 +299,13 @@ find_page: * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { + /* + * If in nonblock mode then dont block on waiting + * for an in-flight io page + */ + if (flags & SPLICE_F_NONBLOCK) + break; + lock_page(page); /* @@ -400,17 +399,20 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, while (len) { ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret <= 0) + if (ret < 0) break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } *ppos += ret; len -= ret; spliced += ret; - - if (!(flags & SPLICE_F_NONBLOCK)) - continue; - ret = -EAGAIN; - break; } if (spliced) -- cgit v1.2.3 From 2a27250e6cf47ca1ea3bea0a55e4b7889c097627 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:56:40 +0200 Subject: [PATCH] tee: link_pipe() must be careful when dropping one of the pipe locks We need to ensure that we only drop a lock that is ordered last, to avoid ABBA deadlocks with competing processes. Signed-off-by: Jens Axboe --- fs/splice.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 78cd264340f..4f5e6b09fb2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1012,7 +1012,9 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret = 0, do_wakeup = 0, i; + int ret, do_wakeup, i, ipipe_first; + + ret = do_wakeup = ipipe_first = 0; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1020,6 +1022,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, * could deadlock (one doing tee from A -> B, the other from B -> A). */ if (ipipe->inode < opipe->inode) { + ipipe_first = 1; mutex_lock(&ipipe->inode->i_mutex); mutex_lock(&opipe->inode->i_mutex); } else { @@ -1068,9 +1071,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * We have input available, but no output room. - * If we already copied data, return that. + * If we already copied data, return that. If we + * need to drop the opipe lock, it must be ordered + * last to avoid deadlocks. */ - if (flags & SPLICE_F_NONBLOCK) { + if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { if (!ret) ret = -EAGAIN; break; @@ -1104,7 +1109,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (ret) break; } - if (flags & SPLICE_F_NONBLOCK) { + /* + * pipe_wait() drops the ipipe mutex. To avoid deadlocks + * with another process, we can only safely do that if + * the ipipe lock is ordered last. + */ + if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { if (!ret) ret = -EAGAIN; break; -- cgit v1.2.3 From a4514ebd8e12c63c09ab02be518db545bd1d24af Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:57:05 +0200 Subject: [PATCH] splice: offset fixes - We need to adjust *ppos for writes as well. 
- Copy back modified offset value if one was passed in, similar to what sendfile does. Signed-off-by: Jens Axboe --- fs/splice.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 4f5e6b09fb2..27d6408ff49 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -720,22 +720,26 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); - - /* - * If file or inode is SYNC and we actually wrote some data, sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) - && ret > 0) { + if (ret > 0) { struct inode *inode = mapping->host; - int err; - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); + *ppos += ret; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); - if (err) - ret = err; + if (err) + ret = err; + } } return ret; @@ -937,6 +941,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *pipe; loff_t offset, *off; + long ret; pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { @@ -951,7 +956,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &out->f_pos; - return do_splice_from(pipe, out, off, len, flags); + ret = do_splice_from(pipe, out, off, len, flags); + + if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } pipe = out->f_dentry->d_inode->i_pipe; @@ -967,7 +977,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &in->f_pos; - return do_splice_to(in, off, pipe, len, flags); + ret = do_splice_to(in, off, pipe, len, flags); + + if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } return -EINVAL; -- cgit v1.2.3 From 9e0267c26e237f84f608a68e579bf4eb89dad819 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:57:31 +0200 Subject: [PATCH] splice: fixup writeout path after ->map changes Since ->map() no longer locks the page, we need to adjust the handling of those pages (and stealing) a little. This now passes full regressions again. 
Signed-off-by: Jens Axboe --- fs/splice.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 27d6408ff49..22fac87e90b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -50,7 +50,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, struct page *page = buf->page; struct address_space *mapping = page_mapping(page); - WARN_ON(!PageLocked(page)); + lock_page(page); + WARN_ON(!PageUptodate(page)); /* @@ -65,8 +66,10 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, if (PagePrivate(page)) try_to_release_page(page, mapping_gfp_mask(mapping)); - if (!remove_mapping(mapping, page)) + if (!remove_mapping(mapping, page)) { + unlock_page(page); return 1; + } buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; return 0; @@ -507,14 +510,12 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if (sd->flags & SPLICE_F_MOVE) { /* * If steal succeeds, buf->page is now pruned from the vm - * side (LRU and page cache) and we can reuse it. + * side (LRU and page cache) and we can reuse it. The page + * will also be looked on successful return. */ if (buf->ops->steal(info, buf)) goto find_page; - /* - * this will also set the page locked - */ page = buf->page; if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; @@ -523,15 +524,27 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, lru_cache_add(page); } else { find_page: - ret = -ENOMEM; - page = find_or_create_page(mapping, index, gfp_mask); - if (!page) - goto out_nomem; + page = find_lock_page(mapping, index); + if (!page) { + ret = -ENOMEM; + page = page_cache_alloc_cold(mapping); + if (unlikely(!page)) + goto out_nomem; + + /* + * This will also lock the page + */ + ret = add_to_page_cache_lru(page, mapping, index, + gfp_mask); + if (unlikely(ret)) + goto out; + } /* - * If the page is uptodate, it is also locked. If it isn't - * uptodate, we can mark it uptodate if we are filling the - * full page. Otherwise we need to read it in first... + * We get here with the page locked. If the page is also + * uptodate, we don't need to do more. If it isn't, we + * may need to bring it in if we are not going to overwrite + * the full page. */ if (!PageUptodate(page)) { if (sd->len < PAGE_CACHE_SIZE) { @@ -553,10 +566,8 @@ find_page: ret = -EIO; goto out; } - } else { - WARN_ON(!PageLocked(page)); + } else SetPageUptodate(page); - } } } @@ -585,10 +596,10 @@ find_page: mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) page_cache_release(page); - unlock_page(page); - } + + unlock_page(page); out_nomem: buf->ops->unmap(info, buf); return ret; -- cgit v1.2.3 From 5e85d4abe3f43bb5362f384bab0e20ef082ce0b5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 18 Apr 2006 22:20:16 -0700 Subject: [PATCH] task: Make task list manipulations RCU safe While we can currently walk through thread groups, process groups, and sessions with just the rcu_read_lock, this opens the door to walking the entire task list. We already have all of the other RCU guarantees so there is no cost in doing this, this should be enough so that proc can stop taking the tasklist lock during readdir. prev_task was killed because it has no users, and using it will miss new tasks when doing an rcu traversal. Signed-off-by: Eric W. 
Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 4121bb55973..3a79d97ac23 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -712,7 +712,7 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail(¤t->tasks, &init_task.tasks); + list_add_tail_rcu(¤t->tasks, &init_task.tasks); current->group_leader = current; leader->group_leader = current; -- cgit v1.2.3 From dda27d1a55e185b0c5fd184b86ac26c66846f095 Mon Sep 17 00:00:00 2001 From: Arthur Othieno Date: Tue, 18 Apr 2006 22:20:57 -0700 Subject: [PATCH] hugetlbfs: add Kconfig help text In kernel bugzilla #6248 (http://bugzilla.kernel.org/show_bug.cgi?id=6248), Adrian Bunk notes that CONFIG_HUGETLBFS is missing Kconfig help text. Signed-off-by: Arthur Othieno Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 2524629dc83..f9b5842c8d2 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -842,6 +842,12 @@ config TMPFS config HUGETLBFS bool "HugeTLB file system support" depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN + help + hugetlbfs is a filesystem backing for HugeTLB pages, based on + ramfs. For architectures that support it, say Y here and read + for details. + + If unsure, say N. config HUGETLB_PAGE def_bool HUGETLBFS -- cgit v1.2.3 From ca99c1da080345e227cfb083c330a184d42e27f3 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 18 Apr 2006 22:21:46 -0700 Subject: [PATCH] Fix file lookup without ref There are places in the kernel where we look up files in fd tables and access the file structure without holding refereces to the file. So, we need special care to avoid the race between looking up files in the fd table and tearing down of the file in another CPU. Otherwise, one might see a NULL f_dentry or such torn down version of the file. This patch fixes those special places where such a race may happen. Signed-off-by: Dipankar Sarma Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 9 +++++++-- fs/proc/base.c | 21 +++++++++++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index dda83d6cd48..efad798824d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2230,7 +2230,12 @@ void steal_locks(fl_owner_t from) lock_kernel(); j = 0; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structures, so + * we need to acquire ->file_lock. + */ + spin_lock(&files->file_lock); fdt = files_fdtable(files); for (;;) { unsigned long set; @@ -2248,7 +2253,7 @@ void steal_locks(fl_owner_t from) set >>= 1; } } - rcu_read_unlock(); + spin_unlock(&files->file_lock); unlock_kernel(); } EXPORT_SYMBOL(steal_locks); diff --git a/fs/proc/base.c b/fs/proc/base.c index a3a3eecef68..6cc77dc3f3f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -297,16 +297,20 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm files = get_files_struct(task); if (files) { - rcu_read_lock(); + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
+ */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { *mnt = mntget(file->f_vfsmnt); *dentry = dget(file->f_dentry); - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); return 0; } - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); } return -ENOENT; @@ -1523,7 +1527,12 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, if (!files) goto out_unlock; inode->i_mode = S_IFLNK; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (!file) goto out_unlock2; @@ -1531,7 +1540,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, inode->i_mode |= S_IRUSR | S_IXUSR; if (file->f_mode & 2) inode->i_mode |= S_IWUSR | S_IXUSR; - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1541,7 +1550,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, return NULL; out_unlock2: - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); -- cgit v1.2.3 From 95cf959b245832ad49bb333bf88f9805244b225d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Apr 2006 13:14:06 -0400 Subject: VFS: Fix another open intent Oops If the call to nfs_intent_set_file() fails to open a file in nfs4_proc_create(), we should return an error. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 47ece1dd3c6..d86c0db7b1e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1218,7 +1218,7 @@ out: return status; } -static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) +static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) { struct file *filp; @@ -1227,8 +1227,10 @@ static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, st struct nfs_open_context *ctx; ctx = (struct nfs_open_context *)filp->private_data; ctx->state = state; - } else - nfs4_close_state(state, nd->intent.open.flags); + return 0; + } + nfs4_close_state(state, nd->intent.open.flags); + return PTR_ERR(filp); } struct dentry * @@ -1835,7 +1837,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_setattr_update_inode(state->inode, sattr); } if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN)) - nfs4_intent_set_file(nd, dentry, state); + status = nfs4_intent_set_file(nd, dentry, state); else nfs4_close_state(state, flags); out: -- cgit v1.2.3 From e99170ff3b799a9fd43d538932a9231fac1de9d4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Apr 2006 13:21:42 -0400 Subject: NFS,SUNRPC: Fix compiler warnings if CONFIG_PROC_FS & CONFIG_SYSCTL are unset Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 8 +++----- fs/nfs/file.c | 5 ++--- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0f583cb16dd..3c72b0c0728 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -112,10 +112,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode */ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, 
unsigned long nr_segs) { - struct dentry *dentry = iocb->ki_filp->f_dentry; - dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", - dentry->d_name.name, (long long) pos, nr_segs); + iocb->ki_filp->f_dentry->d_name.name, + (long long) pos, nr_segs); return -EINVAL; } @@ -468,7 +467,6 @@ static const struct rpc_call_ops nfs_commit_direct_ops = { static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) { struct nfs_write_data *data = dreq->commit_data; - struct rpc_task *task = &data->task; data->inode = dreq->inode; data->cred = dreq->ctx->cred; @@ -489,7 +487,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ dreq->commit_data = NULL; - dprintk("NFS: %5u initiated commit call\n", task->tk_pid); + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); lock_kernel(); rpc_execute(&data->task); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f1df2c8d925..fade02c15e6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -534,10 +534,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode * inode = filp->f_mapping->host; - dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - inode->i_sb->s_id, inode->i_ino, + filp->f_dentry->d_inode->i_sb->s_id, + filp->f_dentry->d_inode->i_ino, fl->fl_type, fl->fl_flags); /* -- cgit v1.2.3 From ec535ce154f2eaad3d97f2f20a76a6d8bdac33e5 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 18 Apr 2006 13:21:50 -0400 Subject: NFS: make 2 functions static Signed-off-by: Adrian Bunk Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/lockd/svclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index d2b66bad7d5..3ef739120df 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -650,7 +650,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) svc_wake_up(block->b_daemon); } -void nlmsvc_grant_release(void *data) +static void nlmsvc_grant_release(void *data) { struct nlm_rqst *call = data; -- cgit v1.2.3 From b9d9506d944865876e67281a4e4269d823ce5381 Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Wed, 19 Apr 2006 13:06:20 -0400 Subject: NFS: nfs_show_stats; for_each_possible_cpu(), not NR_CPUS Convert a for-loop that explicitly references "NR_CPUS" into the potentially more efficient for_each_possible_cpu() construct. Signed-off-by: John Hawkes Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2f7656b911b..d0b991a9232 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -700,12 +700,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) /* * Display superblock I/O counters */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { struct nfs_iostats *stats; - if (!cpu_possible(cpu)) - continue; - preempt_disable(); stats = per_cpu_ptr(nfss->io_stats, cpu); -- cgit v1.2.3 From 7451c4f0ee53e36fd74168af8df75b28fd04a2aa Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 19 Apr 2006 13:06:37 -0400 Subject: NFS: remove needless check in nfs_opendir() Local variable res was initialized to 0 - no check needed here. 
Signed-off-by: Carsten Otte Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a23f3489416..cae74dd4c7f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -128,15 +128,14 @@ struct inode_operations nfs4_dir_inode_operations = { static int nfs_opendir(struct inode *inode, struct file *filp) { - int res = 0; + int res; dfprintk(VFS, "NFS: opendir(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); lock_kernel(); /* Call generic open code in order to cache credentials */ - if (!res) - res = nfs_open(inode, filp); + res = nfs_open(inode, filp); unlock_kernel(); return res; } -- cgit v1.2.3 From 82aa5d6183667aa2a5f3c61e390934b0273d2ad7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Apr 2006 13:05:48 +0200 Subject: [PATCH] splice: fix smaller sized splice reads Signed-off-by: Jens Axboe --- fs/splice.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 22fac87e90b..0559e7577a0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -275,6 +275,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = 0; bytes = 0; for (i = 0; i < nr_pages; i++, index++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min(len, PAGE_CACHE_SIZE - loff); find_page: /* * lookup the page for this index @@ -366,11 +375,13 @@ readpage: * force quit after adding this page */ nr_pages = i; + this_len = min(this_len, loff); } } fill_it: pages[i] = page; - bytes += PAGE_CACHE_SIZE - loff; + bytes += this_len; + len -= this_len; loff = 0; } -- cgit v1.2.3
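The tee(2)/splice(2) interface added and refined in the patches above can be exercised from user space along these lines. This is a sketch only: it assumes a kernel carrying these patches and a libc with tee()/splice() wrappers (on older systems raw syscall(2) calls are needed), and it must be run between two pipes, e.g. producer | ./pipetee | consumer.

/* Sketch: duplicate stdin's pipe contents to stdout with tee(2), then
 * drain the input into a file with splice(2). No data is copied by tee();
 * only page references are taken. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>

int main(void)
{
	int log_fd = open("tee.out", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (log_fd < 0)
		return 1;

	for (;;) {
		/* Duplicate whatever sits in the stdin pipe to the stdout pipe. */
		ssize_t copied = tee(STDIN_FILENO, STDOUT_FILENO, INT_MAX, 0);
		if (copied < 0)
			return 1;
		if (copied == 0)
			break;	/* writer closed the input pipe */

		/* Now consume the duplicated bytes from stdin into the file. */
		while (copied > 0) {
			ssize_t n = splice(STDIN_FILENO, NULL, log_fd, NULL,
					   copied, SPLICE_F_MOVE);
			if (n <= 0)
				return 1;
			copied -= n;
		}
	}
	return 0;
}
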