From bcc7b445eff295664a3a3ab14e742b3c9d88e6e3 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 30 Aug 2007 17:21:38 +1000
Subject: [XFS] fix filestreams on 32-bit boxes

xfs_filestream_mount() sets up an mru cache with:
  err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
  (xfs_mru_cache_free_func_t)xfs_fstrm_free_func);
but that cast is causing problems...
  typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*);
but:
  void xfs_fstrm_free_func( xfs_ino_t ino, fstrm_item_t *item)
so on a 32-bit box, it's casting (32, 32) args into (64, 32) and I assume
it's getting garbage for *item, which subsequently causes an explosion.
With this change the filestreams xfsqa tests don't oops on my 32-bit box.

SGI-PV: 967795
SGI-Modid: xfs-linux-melb:xfs-kern:29510a

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/xfs_filestream.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 16f8e175167..36d8f6aa11a 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -350,9 +350,10 @@ _xfs_filestream_update_ag(
 /* xfs_fstrm_free_func(): callback for freeing cached stream items. */
 void
 xfs_fstrm_free_func(
-	xfs_ino_t	ino,
-	fstrm_item_t	*item)
+	unsigned long	ino,
+	void		*data)
 {
+	fstrm_item_t	*item  = (fstrm_item_t *)data;
 	xfs_inode_t	*ip = item->ip;
 	int ref;
 
@@ -438,7 +439,7 @@ xfs_filestream_mount(
 	grp_count = 10;
 
 	err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
-	                     (xfs_mru_cache_free_func_t)xfs_fstrm_free_func);
+	                     xfs_fstrm_free_func);
 
 	return err;
 }
-- 
cgit v1.2.3


From 1bc5858d0d40e07697b5eda47ed8628b8a934235 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 19 Sep 2007 15:27:30 +1000
Subject: [XFS] fix valid but harmless sparse warning

The new xlog_recover_do_reg_buffer checks call be16_to_cpu on di_gen which
is a 32bit value so sparse rightly complains. Fortunately the warning is
harmless because we don't care for the value, but only whether it's
non-NULL. Due to that fact we can simply kill the endian swaps on this and
the previous di_mode check entirely.

SGI-PV: 969656
SGI-Modid: xfs-linux-melb:xfs-kern:29709a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index dacb19739cc..7174991f4be 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1920,9 +1920,9 @@ xlog_recover_do_reg_buffer(
 				stale_buf = 1;
 				break;
 			}
-			if (be16_to_cpu(dip->di_core.di_mode))
+			if (dip->di_core.di_mode)
 				mode_count++;
-			if (be16_to_cpu(dip->di_core.di_gen))
+			if (dip->di_core.di_gen)
 				gen_count++;
 		}
 
-- 
cgit v1.2.3


From b8fceee17a310f189188599a8fa5e9beaff57eb0 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Thu, 20 Sep 2007 12:40:16 -0700
Subject: signalfd simplification

This simplifies signalfd code, by avoiding it to remain attached to the
sighand during its lifetime.

In this way, the signalfd remain attached to the sighand only during
poll(2) (and select and epoll) and read(2).  This also allows to remove
all the custom "tsk == current" checks in kernel/signal.c, since
dequeue_signal() will only be called by "current".

I think this is also what Ben was suggesting time ago.

The external effect of this, is that a thread can extract only its own
private signals and the group ones.  I think this is an acceptable
behaviour, in that those are the signals the thread would be able to
fetch w/out signalfd.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c     |   3 -
 fs/signalfd.c | 190 +++++++++-------------------------------------------------
 2 files changed, 29 insertions(+), 164 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index c21a8cc0627..073b0b8c6d0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -50,7 +50,6 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
-#include <linux/signalfd.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -784,7 +783,6 @@ static int de_thread(struct task_struct *tsk)
 	 * and we can just re-use it all.
 	 */
 	if (atomic_read(&oldsighand->count) <= 1) {
-		signalfd_detach(tsk);
 		exit_itimers(sig);
 		return 0;
 	}
@@ -923,7 +921,6 @@ static int de_thread(struct task_struct *tsk)
 	sig->flags = 0;
 
 no_thread_group:
-	signalfd_detach(tsk);
 	exit_itimers(sig);
 	if (leader)
 		release_task(leader);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index a8e293d3003..aefb0be0794 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -11,8 +11,10 @@
  *      Now using anonymous inode source.
  *      Thanks to Oleg Nesterov for useful code review and suggestions.
  *      More comments and suggestions from Arnd Bergmann.
- * Sat May 19, 2007: Davi E. M. Arnaut <davi@haxent.com.br>
+ *  Sat May 19, 2007: Davi E. M. Arnaut <davi@haxent.com.br>
  *      Retrieve multiple signals with one read() call
+ *  Sun Jul 15, 2007: Davide Libenzi <davidel@xmailserver.org>
+ *      Attach to the sighand only during read() and poll().
  */
 
 #include <linux/file.h>
@@ -27,102 +29,12 @@
 #include <linux/signalfd.h>
 
 struct signalfd_ctx {
-	struct list_head lnk;
-	wait_queue_head_t wqh;
 	sigset_t sigmask;
-	struct task_struct *tsk;
 };
 
-struct signalfd_lockctx {
-	struct task_struct *tsk;
-	unsigned long flags;
-};
-
-/*
- * Tries to acquire the sighand lock. We do not increment the sighand
- * use count, and we do not even pin the task struct, so we need to
- * do it inside an RCU read lock, and we must be prepared for the
- * ctx->tsk going to NULL (in signalfd_deliver()), and for the sighand
- * being detached. We return 0 if the sighand has been detached, or
- * 1 if we were able to pin the sighand lock.
- */
-static int signalfd_lock(struct signalfd_ctx *ctx, struct signalfd_lockctx *lk)
-{
-	struct sighand_struct *sighand = NULL;
-
-	rcu_read_lock();
-	lk->tsk = rcu_dereference(ctx->tsk);
-	if (likely(lk->tsk != NULL))
-		sighand = lock_task_sighand(lk->tsk, &lk->flags);
-	rcu_read_unlock();
-
-	if (!sighand)
-		return 0;
-
-	if (!ctx->tsk) {
-		unlock_task_sighand(lk->tsk, &lk->flags);
-		return 0;
-	}
-
-	if (lk->tsk->tgid == current->tgid)
-		lk->tsk = current;
-
-	return 1;
-}
-
-static void signalfd_unlock(struct signalfd_lockctx *lk)
-{
-	unlock_task_sighand(lk->tsk, &lk->flags);
-}
-
-/*
- * This must be called with the sighand lock held.
- */
-void signalfd_deliver(struct task_struct *tsk, int sig)
-{
-	struct sighand_struct *sighand = tsk->sighand;
-	struct signalfd_ctx *ctx, *tmp;
-
-	BUG_ON(!sig);
-	list_for_each_entry_safe(ctx, tmp, &sighand->signalfd_list, lnk) {
-		/*
-		 * We use a negative signal value as a way to broadcast that the
-		 * sighand has been orphaned, so that we can notify all the
-		 * listeners about this. Remember the ctx->sigmask is inverted,
-		 * so if the user is interested in a signal, that corresponding
-		 * bit will be zero.
-		 */
-		if (sig < 0) {
-			if (ctx->tsk == tsk) {
-				ctx->tsk = NULL;
-				list_del_init(&ctx->lnk);
-				wake_up(&ctx->wqh);
-			}
-		} else {
-			if (!sigismember(&ctx->sigmask, sig))
-				wake_up(&ctx->wqh);
-		}
-	}
-}
-
-static void signalfd_cleanup(struct signalfd_ctx *ctx)
-{
-	struct signalfd_lockctx lk;
-
-	/*
-	 * This is tricky. If the sighand is gone, we do not need to remove
-	 * context from the list, the list itself won't be there anymore.
-	 */
-	if (signalfd_lock(ctx, &lk)) {
-		list_del(&ctx->lnk);
-		signalfd_unlock(&lk);
-	}
-	kfree(ctx);
-}
-
 static int signalfd_release(struct inode *inode, struct file *file)
 {
-	signalfd_cleanup(file->private_data);
+	kfree(file->private_data);
 	return 0;
 }
 
@@ -130,23 +42,15 @@ static unsigned int signalfd_poll(struct file *file, poll_table *wait)
 {
 	struct signalfd_ctx *ctx = file->private_data;
 	unsigned int events = 0;
-	struct signalfd_lockctx lk;
 
-	poll_wait(file, &ctx->wqh, wait);
+	poll_wait(file, &current->sighand->signalfd_wqh, wait);
 
-	/*
-	 * Let the caller get a POLLIN in this case, ala socket recv() when
-	 * the peer disconnects.
-	 */
-	if (signalfd_lock(ctx, &lk)) {
-		if ((lk.tsk == current &&
-		     next_signal(&lk.tsk->pending, &ctx->sigmask) > 0) ||
-		    next_signal(&lk.tsk->signal->shared_pending,
-				&ctx->sigmask) > 0)
-			events |= POLLIN;
-		signalfd_unlock(&lk);
-	} else
+	spin_lock_irq(&current->sighand->siglock);
+	if (next_signal(&current->pending, &ctx->sigmask) ||
+	    next_signal(&current->signal->shared_pending,
+			&ctx->sigmask))
 		events |= POLLIN;
+	spin_unlock_irq(&current->sighand->siglock);
 
 	return events;
 }
@@ -219,59 +123,46 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
 				int nonblock)
 {
 	ssize_t ret;
-	struct signalfd_lockctx lk;
 	DECLARE_WAITQUEUE(wait, current);
 
-	if (!signalfd_lock(ctx, &lk))
-		return 0;
-
-	ret = dequeue_signal(lk.tsk, &ctx->sigmask, info);
+	spin_lock_irq(&current->sighand->siglock);
+	ret = dequeue_signal(current, &ctx->sigmask, info);
 	switch (ret) {
 	case 0:
 		if (!nonblock)
 			break;
 		ret = -EAGAIN;
 	default:
-		signalfd_unlock(&lk);
+		spin_unlock_irq(&current->sighand->siglock);
 		return ret;
 	}
 
-	add_wait_queue(&ctx->wqh, &wait);
+	add_wait_queue(&current->sighand->signalfd_wqh, &wait);
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		ret = dequeue_signal(lk.tsk, &ctx->sigmask, info);
-		signalfd_unlock(&lk);
+		ret = dequeue_signal(current, &ctx->sigmask, info);
 		if (ret != 0)
 			break;
 		if (signal_pending(current)) {
 			ret = -ERESTARTSYS;
 			break;
 		}
+		spin_unlock_irq(&current->sighand->siglock);
 		schedule();
-		ret = signalfd_lock(ctx, &lk);
-		if (unlikely(!ret)) {
-			/*
-			 * Let the caller read zero byte, ala socket
-			 * recv() when the peer disconnect. This test
-			 * must be done before doing a dequeue_signal(),
-			 * because if the sighand has been orphaned,
-			 * the dequeue_signal() call is going to crash
-			 * because ->sighand will be long gone.
-			 */
-			 break;
-		}
+		spin_lock_irq(&current->sighand->siglock);
 	}
+	spin_unlock_irq(&current->sighand->siglock);
 
-	remove_wait_queue(&ctx->wqh, &wait);
+	remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
 	__set_current_state(TASK_RUNNING);
 
 	return ret;
 }
 
 /*
- * Returns either the size of a "struct signalfd_siginfo", or zero if the
- * sighand we are attached to, has been orphaned. The "count" parameter
- * must be at least the size of a "struct signalfd_siginfo".
+ * Returns a multiple of the size of a "struct signalfd_siginfo", or a negative
+ * error code. The "count" parameter must be at least the size of a
+ * "struct signalfd_siginfo".
  */
 static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
 			     loff_t *ppos)
@@ -287,7 +178,6 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
 		return -EINVAL;
 
 	siginfo = (struct signalfd_siginfo __user *) buf;
-
 	do {
 		ret = signalfd_dequeue(ctx, &info, nonblock);
 		if (unlikely(ret <= 0))
@@ -300,7 +190,7 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
 		nonblock = 1;
 	} while (--count);
 
-	return total ? total : ret;
+	return total ? total: ret;
 }
 
 static const struct file_operations signalfd_fops = {
@@ -309,20 +199,13 @@ static const struct file_operations signalfd_fops = {
 	.read		= signalfd_read,
 };
 
-/*
- * Create a file descriptor that is associated with our signal
- * state. We can pass it around to others if we want to, but
- * it will always be _our_ signal state.
- */
 asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
 {
 	int error;
 	sigset_t sigmask;
 	struct signalfd_ctx *ctx;
-	struct sighand_struct *sighand;
 	struct file *file;
 	struct inode *inode;
-	struct signalfd_lockctx lk;
 
 	if (sizemask != sizeof(sigset_t) ||
 	    copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
@@ -335,17 +218,7 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
 		if (!ctx)
 			return -ENOMEM;
 
-		init_waitqueue_head(&ctx->wqh);
 		ctx->sigmask = sigmask;
-		ctx->tsk = current->group_leader;
-
-		sighand = current->sighand;
-		/*
-		 * Add this fd to the list of signal listeners.
-		 */
-		spin_lock_irq(&sighand->siglock);
-		list_add_tail(&ctx->lnk, &sighand->signalfd_list);
-		spin_unlock_irq(&sighand->siglock);
 
 		/*
 		 * When we call this, the initialization must be complete, since
@@ -364,23 +237,18 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
 			fput(file);
 			return -EINVAL;
 		}
-		/*
-		 * We need to be prepared of the fact that the sighand this fd
-		 * is attached to, has been detched. In that case signalfd_lock()
-		 * will return 0, and we'll just skip setting the new mask.
-		 */
-		if (signalfd_lock(ctx, &lk)) {
-			ctx->sigmask = sigmask;
-			signalfd_unlock(&lk);
-		}
-		wake_up(&ctx->wqh);
+		spin_lock_irq(&current->sighand->siglock);
+		ctx->sigmask = sigmask;
+		spin_unlock_irq(&current->sighand->siglock);
+
+		wake_up(&current->sighand->signalfd_wqh);
 		fput(file);
 	}
 
 	return ufd;
 
 err_fdalloc:
-	signalfd_cleanup(ctx);
+	kfree(ctx);
 	return error;
 }
 
-- 
cgit v1.2.3


From 415cb800375cc4e89fb5a6a454e484bd4adbffb4 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Sun, 16 Sep 2007 20:10:16 -0700
Subject: ocfs2: Allow smaller allocations during large writes

The ocfs2 write code loops through a page much like the block code, except
that ocfs2 allocation units can be any size, including larger than page
size. Typically it's equal to or larger than page size - most kernels run 4k
pages, the minimum ocfs2 allocation (cluster) size.

Some changes introduced during 2.6.23 changed the way writes to pages are
handled, and inadvertantly broke support for > 4k page size. Instead of just
writing one cluster at a time, we now handle the whole page in one pass.

This means that multiple (small) seperate allocations might happen in the
same pass. The allocation code howver typically optimizes by getting the
maximum which was reserved. This triggered a BUG_ON in the extend code where
it'd ask for a single bit (for one part of a > 4k page) and get back more
than it asked for.

Fix this by providing a variant of the high level allocation function which
allows the caller to specify a maximum. The traditional function remains and
just calls the new one with a maximum determined from the initial
reservation.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c       |  4 ++--
 fs/ocfs2/localalloc.c |  4 +---
 fs/ocfs2/localalloc.h |  2 +-
 fs/ocfs2/suballoc.c   | 29 +++++++++++++++++++++--------
 fs/ocfs2/suballoc.h   | 11 +++++++++++
 5 files changed, 36 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7e34e66159c..f3bc3658e7a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -491,8 +491,8 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
-				      &bit_off, &num_bits);
+	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+					clusters_to_add, &bit_off, &num_bits);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 545f7892cdf..de984d27257 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -524,13 +524,12 @@ bail:
 int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 				 handle_t *handle,
 				 struct ocfs2_alloc_context *ac,
-				 u32 min_bits,
+				 u32 bits_wanted,
 				 u32 *bit_off,
 				 u32 *num_bits)
 {
 	int status, start;
 	struct inode *local_alloc_inode;
-	u32 bits_wanted;
 	void *bitmap;
 	struct ocfs2_dinode *alloc;
 	struct ocfs2_local_alloc *la;
@@ -538,7 +537,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	mlog_entry_void();
 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
 
-	bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
 	local_alloc_inode = ac->ac_inode;
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 	la = OCFS2_LOCAL_ALLOC(alloc);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 385a10152f9..3f76631e110 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -48,7 +48,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 				 handle_t *handle,
 				 struct ocfs2_alloc_context *ac,
-				 u32 min_bits,
+				 u32 bits_wanted,
 				 u32 *bit_off,
 				 u32 *num_bits);
 
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d9c5c9fcb30..8f09f5235e3 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1486,21 +1486,21 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
  * contig. allocation, set to '1' to indicate we can deal with extents
  * of any size.
  */
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
-			 handle_t *handle,
-			 struct ocfs2_alloc_context *ac,
-			 u32 min_clusters,
-			 u32 *cluster_start,
-			 u32 *num_clusters)
+int __ocfs2_claim_clusters(struct ocfs2_super *osb,
+			   handle_t *handle,
+			   struct ocfs2_alloc_context *ac,
+			   u32 min_clusters,
+			   u32 max_clusters,
+			   u32 *cluster_start,
+			   u32 *num_clusters)
 {
 	int status;
-	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+	unsigned int bits_wanted = max_clusters;
 	u64 bg_blkno = 0;
 	u16 bg_bit_off;
 
 	mlog_entry_void();
 
-	BUG_ON(!ac);
 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
 
 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
@@ -1557,6 +1557,19 @@ bail:
 	return status;
 }
 
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 handle_t *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters)
+{
+	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+
+	return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
+				      bits_wanted, cluster_start, num_clusters);
+}
+
 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 					       struct inode *alloc_inode,
 					       struct ocfs2_group_desc *bg,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index f212dc01a84..cafe9370309 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -85,6 +85,17 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
 			 u32 min_clusters,
 			 u32 *cluster_start,
 			 u32 *num_clusters);
+/*
+ * Use this variant of ocfs2_claim_clusters to specify a maxiumum
+ * number of clusters smaller than the allocation reserved.
+ */
+int __ocfs2_claim_clusters(struct ocfs2_super *osb,
+			   handle_t *handle,
+			   struct ocfs2_alloc_context *ac,
+			   u32 min_clusters,
+			   u32 max_clusters,
+			   u32 *cluster_start,
+			   u32 *num_clusters);
 
 int ocfs2_free_suballoc_bits(handle_t *handle,
 			     struct inode *alloc_inode,
-- 
cgit v1.2.3


From db56246c6980e376b02d2da568d119da71f82fb9 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 17 Sep 2007 09:06:29 -0700
Subject: ocfs2: Fix pos/len passed to ocfs2_write_cluster

This was broken for file systems whose cluster size is greater than page
size. Pos needs to be incremented as we loop through the descriptors, and
len needs to be capped to the size of a single cluster.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 50cd8a20901..fa43810e597 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1211,18 +1211,33 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 				       loff_t pos, unsigned len)
 {
 	int ret, i;
+	loff_t cluster_off;
+	unsigned int local_len = len;
 	struct ocfs2_write_cluster_desc *desc;
+	struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
 
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
 
+		/*
+		 * We have to make sure that the total write passed in
+		 * doesn't extend past a single cluster.
+		 */
+		local_len = len;
+		cluster_off = pos & (osb->s_clustersize - 1);
+		if ((cluster_off + local_len) > osb->s_clustersize)
+			local_len = osb->s_clustersize - cluster_off;
+
 		ret = ocfs2_write_cluster(mapping, desc->c_phys,
 					  desc->c_unwritten, data_ac, meta_ac,
-					  wc, desc->c_cpos, pos, len);
+					  wc, desc->c_cpos, pos, local_len);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
+
+		len -= local_len;
+		pos += local_len;
 	}
 
 	ret = 0;
-- 
cgit v1.2.3


From 5c26a7b70f89c36e8d9acc95cb896c3cd205fc8d Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 18 Sep 2007 17:49:29 -0700
Subject: ocfs2: Don't double set write parameters

The target page offsets were being incorrectly set a second time in
ocfs2_prepare_page_for_write(), which was causing problems on a 16k page
size kernel. Additionally, ocfs2_write_failure() was incorrectly using those
parameters instead of the parameters for the individual page being cleaned
up.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index fa43810e597..f37f25c931f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -930,18 +930,11 @@ static void ocfs2_write_failure(struct inode *inode,
 				loff_t user_pos, unsigned user_len)
 {
 	int i;
-	unsigned from, to;
+	unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
+		to = user_pos + user_len;
 	struct page *tmppage;
 
-	ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
-
-	if (wc->w_large_pages) {
-		from = wc->w_target_from;
-		to = wc->w_target_to;
-	} else {
-		from = 0;
-		to = PAGE_CACHE_SIZE;
-	}
+	ocfs2_zero_new_buffers(wc->w_target_page, from, to);
 
 	for(i = 0; i < wc->w_num_pages; i++) {
 		tmppage = wc->w_pages[i];
@@ -991,9 +984,6 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
 			map_from = cluster_start;
 			map_to = cluster_end;
 		}
-
-		wc->w_target_from = map_from;
-		wc->w_target_to = map_to;
 	} else {
 		/*
 		 * If we haven't allocated the new page yet, we
-- 
cgit v1.2.3


From 813d974c53a2b353566a86bb127625b403696dae Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 20 Sep 2007 10:59:48 -0700
Subject: ocfs2: Pack vote message and response structures

The ocfs2_vote_msg and ocfs2_response_msg structs needed to be
packed to ensure similar sizeofs in 32-bit and 64-bit arches. Without this,
we had inadvertantly broken 32/64 bit cross mounts.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/vote.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index 66a13ee63d4..c05358538f2 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -66,7 +66,7 @@ struct ocfs2_vote_msg
 {
 	struct ocfs2_msg_hdr v_hdr;
 	__be32 v_reserved1;
-};
+} __attribute__ ((packed));
 
 /* Responses are given these values to maintain backwards
  * compatibility with older ocfs2 versions */
@@ -78,7 +78,7 @@ struct ocfs2_response_msg
 {
 	struct ocfs2_msg_hdr r_hdr;
 	__be32 r_response;
-};
+} __attribute__ ((packed));
 
 struct ocfs2_vote_work {
 	struct list_head   w_list;
-- 
cgit v1.2.3


From d59952d532ed8fc93ccb98186f73d9ce5cfcb93d Mon Sep 17 00:00:00 2001
From: Jean Tourrilhes <jt@hpl.hp.com>
Date: Thu, 20 Sep 2007 11:08:18 -0700
Subject: [PATCH] WE : Add missing auth compat-ioctl

Johannes just found that we are missing a compat-ioctl
declaration. The fix is trivial. As previous patches for compat-ioctl,
this should also go to stable.

More info :
	http://marc.info/?l=linux-wireless&m=119029667902588&w=2

Signed-off-by: Jean Tourrilhes <jt@hpl.hp.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 fs/compat_ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5a5b7116cef..37310b0e810 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -3190,6 +3190,8 @@ COMPATIBLE_IOCTL(SIOCSIWRETRY)
 COMPATIBLE_IOCTL(SIOCGIWRETRY)
 COMPATIBLE_IOCTL(SIOCSIWPOWER)
 COMPATIBLE_IOCTL(SIOCGIWPOWER)
+COMPATIBLE_IOCTL(SIOCSIWAUTH)
+COMPATIBLE_IOCTL(SIOCGIWAUTH)
 /* hiddev */
 COMPATIBLE_IOCTL(HIDIOCGVERSION)
 COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
-- 
cgit v1.2.3


From f9b7cba1b8a74c10b0771ca28d4c554aeb9803fc Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Mon, 24 Sep 2007 21:24:39 -0700
Subject: ufs: fix sun state

Different types of ufs hold state in different places, to hide complexity
of this, there is ufs_get_fs_state, it returns state according to
"UFS_SB(sb)->s_flags", but during mount ufs_get_fs_state is called, before
setting s_flags, this cause message for ufs types like sun ufs: "fs need
fsck", and remount in readonly state.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ufs/super.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 73402c5eeb8..38eb0b7a1f3 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -894,7 +894,7 @@ magic_found:
 		goto again;
 	}
 
-
+	sbi->s_flags = flags;/*after that line some functions use s_flags*/
 	ufs_print_super_stuff(sb, usb1, usb2, usb3);
 
 	/*
@@ -1025,8 +1025,6 @@ magic_found:
 	    UFS_MOUNT_UFSTYPE_44BSD)
 		uspi->s_maxsymlinklen =
 		    fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
-	
-	sbi->s_flags = flags;
 
 	inode = iget(sb, UFS_ROOTINO);
 	if (!inode || is_bad_inode(inode))
-- 
cgit v1.2.3


From 255129d1e9ca0ed3d69d5517fae3e03d7ab4b806 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 25 Sep 2007 15:55:03 -0400
Subject: NLM: Fix a circular lock dependency in lockd

The problem is that the garbage collector for the 'host' structures
nlm_gc_hosts(), holds nlm_host_mutex while calling down to
nlmsvc_mark_resources, which, eventually takes the file->f_mutex.

We cannot therefore call nlmsvc_lookup_host() from within
nlmsvc_create_block, since the caller will already hold file->f_mutex, so
the attempt to grab nlm_host_mutex may deadlock.

Fix the problem by calling nlmsvc_lookup_host() outside the file->f_mutex.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/lockd/svclock.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index a21e4bc5444..d098c7af0d2 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -171,19 +171,14 @@ found:
  * GRANTED_RES message by cookie, without having to rely on the client's IP
  * address. --okir
  */
-static inline struct nlm_block *
-nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
-		struct nlm_lock *lock, struct nlm_cookie *cookie)
+static struct nlm_block *
+nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
+		    struct nlm_file *file, struct nlm_lock *lock,
+		    struct nlm_cookie *cookie)
 {
 	struct nlm_block	*block;
-	struct nlm_host		*host;
 	struct nlm_rqst		*call = NULL;
 
-	/* Create host handle for callback */
-	host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
-	if (host == NULL)
-		return NULL;
-
 	call = nlm_alloc_call(host);
 	if (call == NULL)
 		return NULL;
@@ -366,6 +361,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
 {
 	struct nlm_block	*block = NULL;
+	struct nlm_host		*host;
 	int			error;
 	__be32			ret;
 
@@ -377,6 +373,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 				(long long)lock->fl.fl_end,
 				wait);
 
+	/* Create host handle for callback */
+	host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
+	if (host == NULL)
+		return nlm_lck_denied_nolocks;
 
 	/* Lock file against concurrent access */
 	mutex_lock(&file->f_mutex);
@@ -385,7 +385,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	 */
 	block = nlmsvc_lookup_block(file, lock);
 	if (block == NULL) {
-		block = nlmsvc_create_block(rqstp, file, lock, cookie);
+		block = nlmsvc_create_block(rqstp, nlm_get_host(host), file,
+				lock, cookie);
 		ret = nlm_lck_denied_nolocks;
 		if (block == NULL)
 			goto out;
@@ -449,6 +450,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 out:
 	mutex_unlock(&file->f_mutex);
 	nlmsvc_release_block(block);
+	nlm_release_host(host);
 	dprintk("lockd: nlmsvc_lock returned %u\n", ret);
 	return ret;
 }
@@ -477,10 +479,15 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 
 	if (block == NULL) {
 		struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+		struct nlm_host	*host;
 
 		if (conf == NULL)
 			return nlm_granted;
-		block = nlmsvc_create_block(rqstp, file, lock, cookie);
+		/* Create host handle for callback */
+		host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
+		if (host == NULL)
+			return nlm_lck_denied_nolocks;
+		block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
 		if (block == NULL) {
 			kfree(conf);
 			return nlm_granted;
-- 
cgit v1.2.3


From 54af3bb543c071769141387a42deaaab5074da55 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Sep 2007 12:27:41 -0400
Subject: NFS: Fix an Oops in encode_lookup()

It doesn't look as if the NFS file name limit is being initialised correctly
in the struct nfs_server. Make sure that we limit whatever is being set in
nfs_probe_fsinfo() and nfs_init_server().

Also ensure that readdirplus and nfs4_path_walk respect our file name
limits.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/client.c  | 29 +++++++++++++++++++----------
 fs/nfs/dir.c     |  2 ++
 fs/nfs/getroot.c |  3 +++
 3 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a49f9feff77..a204484072f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -588,16 +588,6 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
 	server->namelen  = data->namlen;
 	/* Create a client RPC handle for the NFSv3 ACL management interface */
 	nfs_init_server_aclclient(server);
-	if (clp->cl_nfsversion == 3) {
-		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
-			server->namelen = NFS3_MAXNAMLEN;
-		if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
-			server->caps |= NFS_CAP_READDIRPLUS;
-	} else {
-		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
-			server->namelen = NFS2_MAXNAMLEN;
-	}
-
 	dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp);
 	return 0;
 
@@ -794,6 +784,16 @@ struct nfs_server *nfs_create_server(const struct nfs_mount_data *data,
 	error = nfs_probe_fsinfo(server, mntfh, &fattr);
 	if (error < 0)
 		goto error;
+	if (server->nfs_client->rpc_ops->version == 3) {
+		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
+			server->namelen = NFS3_MAXNAMLEN;
+		if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
+			server->caps |= NFS_CAP_READDIRPLUS;
+	} else {
+		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
+			server->namelen = NFS2_MAXNAMLEN;
+	}
+
 	if (!(fattr.valid & NFS_ATTR_FATTR)) {
 		error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
 		if (error < 0) {
@@ -984,6 +984,9 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
 	if (error < 0)
 		goto error;
 
+	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+		server->namelen = NFS4_MAXNAMLEN;
+
 	BUG_ON(!server->nfs_client);
 	BUG_ON(!server->nfs_client->rpc_ops);
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
@@ -1056,6 +1059,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	if (error < 0)
 		goto error;
 
+	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+		server->namelen = NFS4_MAXNAMLEN;
+
 	dprintk("Referral FSID: %llx:%llx\n",
 		(unsigned long long) server->fsid.major,
 		(unsigned long long) server->fsid.minor);
@@ -1115,6 +1121,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 	if (error < 0)
 		goto out_free_server;
 
+	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+		server->namelen = NFS4_MAXNAMLEN;
+
 	dprintk("Cloned FSID: %llx:%llx\n",
 		(unsigned long long) server->fsid.major,
 		(unsigned long long) server->fsid.minor);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ea97408e423..e4a04d16b8b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1162,6 +1162,8 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 	}
 	if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
 		return NULL;
+	if (name.len > NFS_SERVER(dir)->namelen)
+		return NULL;
 	/* Note: caller is already holding the dir->i_mutex! */
 	dentry = d_alloc(parent, &name);
 	if (dentry == NULL)
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index d1cbf0a0fbb..522e5ad4d8a 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -175,6 +175,9 @@ next_component:
 		path++;
 	name.len = path - (const char *) name.name;
 
+	if (name.len > NFS4_MAXNAMLEN)
+		return -ENAMETOOLONG;
+
 eat_dot_dir:
 	while (*path == '/')
 		path++;
-- 
cgit v1.2.3


From 564256c9e06d75e16d894a2cd30604bd6582cbba Mon Sep 17 00:00:00 2001
From: Tim Shimmin <tes@chook.melbourne.sgi.com>
Date: Mon, 1 Oct 2007 16:39:37 +1000
Subject: Revert "[XFS] Avoid replaying inode buffer initialisation log items
 if on-disk version is newer."

This reverts commit b394e43e995d08821588a22561c6a71a63b4ff27.

Lachlan McIlroy says:
    It tried to fix an issue where log replay is replaying an inode cluster
    initialisation transaction that should not be replayed because the inode
    cluster on disk is more up to date.  Since we don't log file sizes (we
    rely on inode flushing to get them to disk) then we can't just replay
    all the transations in the log and expect the inode to be completely
    restored.  We lose file size updates.  Unfortunately this fix is causing
    more (serious) problems than it is fixing.

SGI-PV: 969656
SGI-Modid: xfs-linux-melb:xfs-kern:29804a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/xfs_buf_item.h    |  5 -----
 fs/xfs/xfs_log_recover.c | 51 +++---------------------------------------------
 fs/xfs/xfs_trans_buf.c   |  1 -
 3 files changed, 3 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index fa25b7dcc6c..d7e13614306 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -52,11 +52,6 @@ typedef struct xfs_buf_log_format_t {
 #define	XFS_BLI_UDQUOT_BUF	0x4
 #define XFS_BLI_PDQUOT_BUF	0x8
 #define	XFS_BLI_GDQUOT_BUF	0x10
-/*
- * This flag indicates that the buffer contains newly allocated
- * inodes.
- */
-#define	XFS_BLI_INODE_NEW_BUF	0x20
 
 #define	XFS_BLI_CHUNK		128
 #define	XFS_BLI_SHIFT		7
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7174991f4be..8ae6e8e5f3d 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1874,7 +1874,6 @@ xlog_recover_do_inode_buffer(
 /*ARGSUSED*/
 STATIC void
 xlog_recover_do_reg_buffer(
-	xfs_mount_t		*mp,
 	xlog_recover_item_t	*item,
 	xfs_buf_t		*bp,
 	xfs_buf_log_format_t	*buf_f)
@@ -1885,50 +1884,6 @@ xlog_recover_do_reg_buffer(
 	unsigned int		*data_map = NULL;
 	unsigned int		map_size = 0;
 	int                     error;
-	int			stale_buf = 1;
-
-	/*
-	 * Scan through the on-disk inode buffer and attempt to
-	 * determine if it has been written to since it was logged.
-	 *
-	 * - If any of the magic numbers are incorrect then the buffer is stale
-	 * - If any of the modes are non-zero then the buffer is not stale
-	 * - If all of the modes are zero and at least one of the generation
-	 *   counts is non-zero then the buffer is stale
-	 *
-	 * If the end result is a stale buffer then the log buffer is replayed
-	 * otherwise it is skipped.
-	 *
-	 * This heuristic is not perfect.  It can be improved by scanning the
-	 * entire inode chunk for evidence that any of the inode clusters have
-	 * been updated.  To fix this problem completely we will need a major
-	 * architectural change to the logging system.
-	 */
-	if (buf_f->blf_flags & XFS_BLI_INODE_NEW_BUF) {
-		xfs_dinode_t    *dip;
-		int             inodes_per_buf;
-		int		mode_count = 0;
-		int		gen_count = 0;
-
-		stale_buf = 0;
-		inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
-		for (i = 0; i < inodes_per_buf; i++) {
-			dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-				i * mp->m_sb.sb_inodesize);
-			if (be16_to_cpu(dip->di_core.di_magic) !=
-					XFS_DINODE_MAGIC) {
-				stale_buf = 1;
-				break;
-			}
-			if (dip->di_core.di_mode)
-				mode_count++;
-			if (dip->di_core.di_gen)
-				gen_count++;
-		}
-
-		if (!mode_count && gen_count)
-			stale_buf = 1;
-	}
 
 	switch (buf_f->blf_type) {
 	case XFS_LI_BUF:
@@ -1962,7 +1917,7 @@ xlog_recover_do_reg_buffer(
 					       -1, 0, XFS_QMOPT_DOWARN,
 					       "dquot_buf_recover");
 		}
-		if (!error && stale_buf)
+		if (!error)
 			memcpy(xfs_buf_offset(bp,
 				(uint)bit << XFS_BLI_SHIFT),	/* dest */
 				item->ri_buf[i].i_addr,		/* source */
@@ -2134,7 +2089,7 @@ xlog_recover_do_dquot_buffer(
 	if (log->l_quotaoffs_flag & type)
 		return;
 
-	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+	xlog_recover_do_reg_buffer(item, bp, buf_f);
 }
 
 /*
@@ -2235,7 +2190,7 @@ xlog_recover_do_buffer_trans(
 		  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
 		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
 	} else {
-		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+		xlog_recover_do_reg_buffer(item, bp, buf_f);
 	}
 	if (error)
 		return XFS_ERROR(error);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 95fff6872a2..60b6b898022 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -966,7 +966,6 @@ xfs_trans_inode_alloc_buf(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
-	bip->bli_format.blf_flags |= XFS_BLI_INODE_NEW_BUF;
 }
 
 
-- 
cgit v1.2.3


From 75723957673bfa10c98b735259f891cc79cf0450 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Mon, 1 Oct 2007 13:17:28 -0700
Subject: Fix possible splice() mmap_sem deadlock

Nick Piggin points out that splice isn't being good about the mmap
semaphore: while two readers can nest inside each others, it does leave
a possible deadlock if a writer (ie a new mmap()) comes in during that
nesting.

Original "just move the locking" patch by Nick, replaced by one by me
based on an optimistic pagefault_disable().  And then Jens tested and
updated that patch.

Reported-by: Nick Piggin <npiggin@suse.de>
Tested-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/splice.c | 46 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index c010a72ca2d..e95a3622886 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1223,6 +1223,33 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 	return -EINVAL;
 }
 
+/*
+ * Do a copy-from-user while holding the mmap_semaphore for reading, in a
+ * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
+ * for writing) and page faulting on the user memory pointed to by src.
+ * This assumes that we will very rarely hit the partial != 0 path, or this
+ * will not be a win.
+ */
+static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
+{
+	int partial;
+
+	pagefault_disable();
+	partial = __copy_from_user_inatomic(dst, src, n);
+	pagefault_enable();
+
+	/*
+	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
+	 */
+	if (unlikely(partial)) {
+		up_read(&current->mm->mmap_sem);
+		partial = copy_from_user(dst, src, n);
+		down_read(&current->mm->mmap_sem);
+	}
+
+	return partial;
+}
+
 /*
  * Map an iov into an array of pages and offset/length tupples. With the
  * partial_page structure, we can map several non-contiguous ranges into
@@ -1236,31 +1263,26 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 {
 	int buffers = 0, error = 0;
 
-	/*
-	 * It's ok to take the mmap_sem for reading, even
-	 * across a "get_user()".
-	 */
 	down_read(&current->mm->mmap_sem);
 
 	while (nr_vecs) {
 		unsigned long off, npages;
+		struct iovec entry;
 		void __user *base;
 		size_t len;
 		int i;
 
-		/*
-		 * Get user address base and length for this iovec.
-		 */
-		error = get_user(base, &iov->iov_base);
-		if (unlikely(error))
-			break;
-		error = get_user(len, &iov->iov_len);
-		if (unlikely(error))
+		error = -EFAULT;
+		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
 			break;
 
+		base = entry.iov_base;
+		len = entry.iov_len;
+
 		/*
 		 * Sanity check this iovec. 0 read succeeds.
 		 */
+		error = 0;
 		if (unlikely(!len))
 			break;
 		error = -EFAULT;
-- 
cgit v1.2.3