From d3cf209476b72c83907a412b6708c5e498410aa7 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Tue, 8 May 2007 13:49:27 +1000
Subject: [XFS] propogate return codes from flush routines

This patch handles error return values in fs_flush_pages and
fs_flushinval_pages. It changes the prototype of fs_flushinval_pages so we
can propogate the errors and handle them at higher layers. I also modified
xfs_itruncate_start so that it could propogate the error further.

SGI-PV: 961990
SGI-Modid: xfs-linux-melb:xfs-kern:28231a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Stewart Smith <stewart@flamingspork.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/linux-2.6/xfs_fs_subr.c | 21 ++++++++++++++-------
 fs/xfs/linux-2.6/xfs_fs_subr.h |  2 +-
 fs/xfs/linux-2.6/xfs_lrw.c     | 12 +++++++++---
 fs/xfs/linux-2.6/xfs_vnode.h   |  2 +-
 4 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'fs/xfs/linux-2.6')

diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index dc0562828e7..2eb87cd082a 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -35,7 +35,7 @@ fs_tosspages(
 		truncate_inode_pages(ip->i_mapping, first);
 }
 
-void
+int
 fs_flushinval_pages(
 	bhv_desc_t	*bdp,
 	xfs_off_t	first,
@@ -44,13 +44,16 @@ fs_flushinval_pages(
 {
 	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
+	int		ret = 0;
 
 	if (VN_CACHED(vp)) {
 		if (VN_TRUNC(vp))
 			VUNTRUNCATE(vp);
-		filemap_write_and_wait(ip->i_mapping);
-		truncate_inode_pages(ip->i_mapping, first);
+		ret = filemap_write_and_wait(ip->i_mapping);
+		if (!ret)
+			truncate_inode_pages(ip->i_mapping, first);
 	}
+	return ret;
 }
 
 int
@@ -63,14 +66,18 @@ fs_flush_pages(
 {
 	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
+	int		ret = 0;
+	int		ret2;
 
 	if (VN_DIRTY(vp)) {
 		if (VN_TRUNC(vp))
 			VUNTRUNCATE(vp);
-		filemap_fdatawrite(ip->i_mapping);
+		ret = filemap_fdatawrite(ip->i_mapping);
 		if (flags & XFS_B_ASYNC)
-			return 0;
-		filemap_fdatawait(ip->i_mapping);
+			return ret;
+		ret2 = filemap_fdatawait(ip->i_mapping);
+		if (!ret)
+			ret = ret2;
 	}
-	return 0;
+	return ret;
 }
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
index aee9ccdd18f..c1b53118a30 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.h
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.h
@@ -23,7 +23,7 @@ extern int  fs_noerr(void);
 extern int  fs_nosys(void);
 extern void fs_noval(void);
 extern void fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-extern void fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+extern int  fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 extern int  fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
 
 #endif	/* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index ff8d64eba9f..8e46c9798fb 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -191,7 +191,7 @@ xfs_read(
 	struct file		*file = iocb->ki_filp;
 	struct inode		*inode = file->f_mapping->host;
 	size_t			size = 0;
-	ssize_t			ret;
+	ssize_t			ret = 0;
 	xfs_fsize_t		n;
 	xfs_inode_t		*ip;
 	xfs_mount_t		*mp;
@@ -263,9 +263,13 @@ xfs_read(
 
 	if (unlikely(ioflags & IO_ISDIRECT)) {
 		if (VN_CACHED(vp))
-			bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
+			ret = bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
 						 -1, FI_REMAPF_LOCKED);
 		mutex_unlock(&inode->i_mutex);
+		if (ret) {
+			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+			return ret;
+		}
 	}
 
 	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
@@ -814,8 +818,10 @@ retry:
 		if (need_flush) {
 			xfs_inval_cached_trace(io, pos, -1,
 					ctooff(offtoct(pos)), -1);
-			bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)),
+			error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)),
 					-1, FI_REMAPF_LOCKED);
+			if (error)
+				goto out_unlock_internal;
 		}
 
 		if (need_i_mutex) {
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index b76118cf489..d1b2d01843d 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -194,7 +194,7 @@ typedef	int	(*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
 typedef void	(*vop_link_removed_t)(bhv_desc_t *, bhv_vnode_t *, int);
 typedef void	(*vop_vnode_change_t)(bhv_desc_t *, bhv_vchange_t, __psint_t);
 typedef void	(*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-typedef void	(*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+typedef int	(*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 typedef int	(*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
 				uint64_t, int);
 typedef int	(*vop_iflush_t)(bhv_desc_t *, int);
-- 
cgit v1.2.3


From 2a32963130aec5e157b58ff7dfa3dfa1afdf7ca1 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Tue, 8 May 2007 13:49:39 +1000
Subject: [XFS] Fix race condition in xfs_write().

This change addresses a race in xfs_write() where, for direct I/O, the
flags need_i_mutex and need_flush are setup before the iolock is acquired.
The logic used to setup the flags may change between setting the flags and
acquiring the iolock resulting in these flags having incorrect values. For
example, if a file is not currently cached then need_i_mutex is set to
zero and then if the file is cached before the iolock is acquired we will
fail to do the flushinval before the direct write.

The flush (and also the call to xfs_zero_eof()) need to be done with the
iolock held exclusive so we need to acquire the iolock before checking for
cached data (or if the write begins after eof) to prevent this state from
changing. For direct I/O I've chosen to always acquire the iolock in
shared mode initially and if there is a need to promote it then drop it
and reacquire it.

There's also some other tidy-ups including removing the O_APPEND offset
adjustment since that work is done in generic_write_checks() (and we don't
use offset as an input parameter anywhere).

SGI-PV: 962170
SGI-Modid: xfs-linux-melb:xfs-kern:28319a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 61 ++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 29 deletions(-)

(limited to 'fs/xfs/linux-2.6')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 8e46c9798fb..80fe3123347 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -649,7 +649,7 @@ xfs_write(
 	bhv_vrwlock_t		locktype;
 	size_t			ocount = 0, count;
 	loff_t			pos;
-	int			need_i_mutex = 1, need_flush = 0;
+	int			need_i_mutex;
 
 	XFS_STATS_INC(xs_write_calls);
 
@@ -689,39 +689,20 @@ xfs_write(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (ioflags & IO_ISDIRECT) {
-		xfs_buftarg_t	*target =
-			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp;
-
-		if ((pos & target->bt_smask) || (count & target->bt_smask))
-			return XFS_ERROR(-EINVAL);
-
-		if (!VN_CACHED(vp) && pos < i_size_read(inode))
-			need_i_mutex = 0;
-
-		if (VN_CACHED(vp))
-			need_flush = 1;
-	}
-
 relock:
-	if (need_i_mutex) {
+	if (ioflags & IO_ISDIRECT) {
+		iolock = XFS_IOLOCK_SHARED;
+		locktype = VRWLOCK_WRITE_DIRECT;
+		need_i_mutex = 0;
+	} else {
 		iolock = XFS_IOLOCK_EXCL;
 		locktype = VRWLOCK_WRITE;
-
+		need_i_mutex = 1;
 		mutex_lock(&inode->i_mutex);
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
-		locktype = VRWLOCK_WRITE_DIRECT;
 	}
 
 	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
 
-	isize = i_size_read(inode);
-
-	if (file->f_flags & O_APPEND)
-		*offset = isize;
-
 start:
 	error = -generic_write_checks(file, &pos, &count,
 					S_ISBLK(inode->i_mode));
@@ -730,6 +711,29 @@ start:
 		goto out_unlock_mutex;
 	}
 
+	isize = i_size_read(inode);
+
+	if (ioflags & IO_ISDIRECT) {
+		xfs_buftarg_t	*target =
+			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp;
+
+		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
+			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+			return XFS_ERROR(-EINVAL);
+		}
+
+		if (!need_i_mutex && (VN_CACHED(vp) || pos > isize)) {
+			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+			iolock = XFS_IOLOCK_EXCL;
+			locktype = VRWLOCK_WRITE;
+			need_i_mutex = 1;
+			mutex_lock(&inode->i_mutex);
+			xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
+			goto start;
+		}
+	}
+
 	new_size = pos + count;
 	if (new_size > isize)
 		io->io_new_size = new_size;
@@ -761,7 +765,6 @@ start:
 		 * what allows the size to change in the first place.
 		 */
 		if ((file->f_flags & O_APPEND) && savedsize != isize) {
-			pos = isize = xip->i_d.di_size;
 			goto start;
 		}
 	}
@@ -815,7 +818,8 @@ retry:
 	current->backing_dev_info = mapping->backing_dev_info;
 
 	if ((ioflags & IO_ISDIRECT)) {
-		if (need_flush) {
+		if (VN_CACHED(vp)) {
+			WARN_ON(need_i_mutex == 0);
 			xfs_inval_cached_trace(io, pos, -1,
 					ctooff(offtoct(pos)), -1);
 			error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)),
@@ -849,7 +853,6 @@ retry:
 			pos += ret;
 			count -= ret;
 
-			need_i_mutex = 1;
 			ioflags &= ~IO_ISDIRECT;
 			xfs_iunlock(xip, iolock);
 			goto relock;
-- 
cgit v1.2.3


From ba87ea699ebd9dd577bf055ebc4a98200e337542 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Tue, 8 May 2007 13:49:46 +1000
Subject: [XFS] Fix to prevent the notorious 'NULL files' problem after a
 crash.

The problem that has been addressed is that of synchronising updates of
the file size with writes that extend a file. Without the fix the update
of a file's size, as a result of a write beyond eof, is independent of
when the cached data is flushed to disk. Often the file size update would
be written to the filesystem log before the data is flushed to disk. When
a system crashes between these two events and the filesystem log is
replayed on mount the file's size will be set but since the contents never
made it to disk the file is full of holes. If some of the cached data was
flushed to disk then it may just be a section of the file at the end that
has holes.

There are existing fixes to help alleviate this problem, particularly in
the case where a file has been truncated, that force cached data to be
flushed to disk when the file is closed. If the system crashes while the
file(s) are still open then this flushing will never occur.

The fix that we have implemented is to introduce a second file size,
called the in-memory file size, that represents the current file size as
viewed by the user. The existing file size, called the on-disk file size,
is the one that get's written to the filesystem log and we only update it
when it is safe to do so. When we write to a file beyond eof we only
update the in- memory file size in the write operation. Later when the I/O
operation, that flushes the cached data to disk completes, an I/O
completion routine will update the on-disk file size. The on-disk file
size will be updated to the maximum offset of the I/O or to the value of
the in-memory file size if the I/O includes eof.

SGI-PV: 958522
SGI-Modid: xfs-linux-melb:xfs-kern:28322a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 89 ++++++++++++++++++++++++++++++++++++++------
 fs/xfs/linux-2.6/xfs_lrw.c  | 91 +++++++++++++++++++++++++++------------------
 2 files changed, 132 insertions(+), 48 deletions(-)

(limited to 'fs/xfs/linux-2.6')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 143ffc851c9..4475588e973 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -140,10 +140,47 @@ xfs_destroy_ioend(
 	mempool_free(ioend, xfs_ioend_pool);
 }
 
+/*
+ * Update on-disk file size now that data has been written to disk.
+ * The current in-memory file size is i_size.  If a write is beyond
+ * eof io_new_size will be the intended file size until i_size is
+ * updated.  If this write does not extend all the way to the valid
+ * file size then restrict this update to the end of the write.
+ */
+STATIC void
+xfs_setfilesize(
+	xfs_ioend_t		*ioend)
+{
+	xfs_inode_t		*ip;
+	xfs_fsize_t		isize;
+	xfs_fsize_t		bsize;
+
+	ip = xfs_vtoi(ioend->io_vnode);
+
+	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
+	ASSERT(ioend->io_type != IOMAP_READ);
+
+	if (unlikely(ioend->io_error))
+		return;
+
+	bsize = ioend->io_offset + ioend->io_size;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	isize = MAX(ip->i_size, ip->i_iocore.io_new_size);
+	isize = MIN(isize, bsize);
+
+	if (ip->i_d.di_size < isize) {
+		ip->i_d.di_size = isize;
+		ip->i_update_core = 1;
+		ip->i_update_size = 1;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
 /*
  * Buffered IO write completion for delayed allocate extents.
- * TODO: Update ondisk isize now that we know the file data
- * has been flushed (i.e. the notorious "NULL file" problem).
  */
 STATIC void
 xfs_end_bio_delalloc(
@@ -152,6 +189,7 @@ xfs_end_bio_delalloc(
 	xfs_ioend_t		*ioend =
 		container_of(work, xfs_ioend_t, io_work);
 
+	xfs_setfilesize(ioend);
 	xfs_destroy_ioend(ioend);
 }
 
@@ -165,6 +203,7 @@ xfs_end_bio_written(
 	xfs_ioend_t		*ioend =
 		container_of(work, xfs_ioend_t, io_work);
 
+	xfs_setfilesize(ioend);
 	xfs_destroy_ioend(ioend);
 }
 
@@ -184,8 +223,23 @@ xfs_end_bio_unwritten(
 	xfs_off_t		offset = ioend->io_offset;
 	size_t			size = ioend->io_size;
 
-	if (likely(!ioend->io_error))
+	if (likely(!ioend->io_error)) {
 		bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL);
+		xfs_setfilesize(ioend);
+	}
+	xfs_destroy_ioend(ioend);
+}
+
+/*
+ * IO read completion for regular, written extents.
+ */
+STATIC void
+xfs_end_bio_read(
+	struct work_struct	*work)
+{
+	xfs_ioend_t		*ioend =
+		container_of(work, xfs_ioend_t, io_work);
+
 	xfs_destroy_ioend(ioend);
 }
 
@@ -224,6 +278,8 @@ xfs_alloc_ioend(
 		INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
 	else if (type == IOMAP_DELAY)
 		INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
+	else if (type == IOMAP_READ)
+		INIT_WORK(&ioend->io_work, xfs_end_bio_read);
 	else
 		INIT_WORK(&ioend->io_work, xfs_end_bio_written);
 
@@ -913,7 +969,7 @@ xfs_page_state_convert(
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
 	flags = -1;
-	type = 0;
+	type = IOMAP_READ;
 
 	/* TODO: cleanup count and page_dirty */
 
@@ -999,7 +1055,7 @@ xfs_page_state_convert(
 			 * That means it must already have extents allocated
 			 * underneath it. Map the extent by reading it.
 			 */
-			if (!iomap_valid || type != 0) {
+			if (!iomap_valid || type != IOMAP_READ) {
 				flags = BMAPI_READ;
 				size = xfs_probe_cluster(inode, page, bh,
 								head, 1);
@@ -1010,7 +1066,7 @@ xfs_page_state_convert(
 				iomap_valid = xfs_iomap_valid(&iomap, offset);
 			}
 
-			type = 0;
+			type = IOMAP_READ;
 			if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
 				ASSERT(buffer_mapped(bh));
 				if (iomap_valid)
@@ -1356,12 +1412,21 @@ xfs_end_io_direct(
 	 * completion handler in the future, in which case all this can
 	 * go away.
 	 */
-	if (private && size > 0) {
-		ioend->io_offset = offset;
-		ioend->io_size = size;
+	ioend->io_offset = offset;
+	ioend->io_size = size;
+	if (ioend->io_type == IOMAP_READ) {
+		xfs_finish_ioend(ioend);
+	} else if (private && size > 0) {
 		xfs_finish_ioend(ioend);
 	} else {
-		xfs_destroy_ioend(ioend);
+		/*
+		 * A direct I/O write ioend starts it's life in unwritten
+		 * state in case they map an unwritten extent.  This write
+		 * didn't map an unwritten extent so switch it's completion
+		 * handler.
+		 */
+		INIT_WORK(&ioend->io_work, xfs_end_bio_written);
+		xfs_finish_ioend(ioend);
 	}
 
 	/*
@@ -1392,15 +1457,15 @@ xfs_vm_direct_IO(
 	if (error)
 		return -error;
 
-	iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
-
 	if (rw == WRITE) {
+		iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
 		ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
 			iomap.iomap_target->bt_bdev,
 			iov, offset, nr_segs,
 			xfs_get_blocks_direct,
 			xfs_end_io_direct);
 	} else {
+		iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
 		ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 			iomap.iomap_target->bt_bdev,
 			iov, offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 80fe3123347..82ab792c7fc 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -224,7 +224,7 @@ xfs_read(
 				mp->m_rtdev_targp : mp->m_ddev_targp;
 		if ((*offset & target->bt_smask) ||
 		    (size & target->bt_smask)) {
-			if (*offset == ip->i_d.di_size) {
+			if (*offset == ip->i_size) {
 				return (0);
 			}
 			return -XFS_ERROR(EINVAL);
@@ -387,9 +387,10 @@ xfs_splice_write(
 {
 	xfs_inode_t		*ip = XFS_BHVTOI(bdp);
 	xfs_mount_t		*mp = ip->i_mount;
+	xfs_iocore_t		*io = &ip->i_iocore;
 	ssize_t			ret;
 	struct inode		*inode = outfilp->f_mapping->host;
-	xfs_fsize_t		isize;
+	xfs_fsize_t		isize, new_size;
 
 	XFS_STATS_INC(xs_write_calls);
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -410,6 +411,14 @@ xfs_splice_write(
 			return -error;
 		}
 	}
+
+	new_size = *ppos + count;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (new_size > ip->i_size)
+		io->io_new_size = new_size;
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
 	xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore,
 			   pipe, count, *ppos, ioflags);
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
@@ -420,14 +429,18 @@ xfs_splice_write(
 	if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
 		*ppos = isize;
 
-	if (*ppos > ip->i_d.di_size) {
+	if (*ppos > ip->i_size) {
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (*ppos > ip->i_d.di_size) {
-			ip->i_d.di_size = *ppos;
-			i_size_write(inode, *ppos);
-			ip->i_update_core = 1;
-			ip->i_update_size = 1;
-		}
+		if (*ppos > ip->i_size)
+			ip->i_size = *ppos;
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	if (io->io_new_size) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		io->io_new_size = 0;
+		if (ip->i_d.di_size > ip->i_size)
+			ip->i_d.di_size = ip->i_size;
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -711,8 +724,6 @@ start:
 		goto out_unlock_mutex;
 	}
 
-	isize = i_size_read(inode);
-
 	if (ioflags & IO_ISDIRECT) {
 		xfs_buftarg_t	*target =
 			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
@@ -723,7 +734,7 @@ start:
 			return XFS_ERROR(-EINVAL);
 		}
 
-		if (!need_i_mutex && (VN_CACHED(vp) || pos > isize)) {
+		if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
 			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 			iolock = XFS_IOLOCK_EXCL;
 			locktype = VRWLOCK_WRITE;
@@ -735,7 +746,7 @@ start:
 	}
 
 	new_size = pos + count;
-	if (new_size > isize)
+	if (new_size > xip->i_size)
 		io->io_new_size = new_size;
 
 	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
@@ -751,8 +762,7 @@ start:
 				      pos, count,
 				      dmflags, &locktype);
 		if (error) {
-			xfs_iunlock(xip, iolock);
-			goto out_unlock_mutex;
+			goto out_unlock_internal;
 		}
 		xfs_ilock(xip, XFS_ILOCK_EXCL);
 		eventsent = 1;
@@ -764,9 +774,8 @@ start:
 		 * event prevents another call to XFS_SEND_DATA, which is
 		 * what allows the size to change in the first place.
 		 */
-		if ((file->f_flags & O_APPEND) && savedsize != isize) {
+		if ((file->f_flags & O_APPEND) && savedsize != xip->i_size)
 			goto start;
-		}
 	}
 
 	if (likely(!(ioflags & IO_INVIS))) {
@@ -784,11 +793,11 @@ start:
 	 * to zero it out up to the new size.
 	 */
 
-	if (pos > isize) {
-		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize);
+	if (pos > xip->i_size) {
+		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, xip->i_size);
 		if (error) {
-			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-			goto out_unlock_mutex;
+			xfs_iunlock(xip, XFS_ILOCK_EXCL);
+			goto out_unlock_internal;
 		}
 	}
 	xfs_iunlock(xip, XFS_ILOCK_EXCL);
@@ -808,8 +817,7 @@ start:
 		if (likely(!error))
 			error = -remove_suid(file->f_path.dentry);
 		if (unlikely(error)) {
-			xfs_iunlock(xip, iolock);
-			goto out_unlock_mutex;
+			goto out_unlock_internal;
 		}
 	}
 
@@ -879,12 +887,12 @@ retry:
 		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
 				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
 				0, 0, 0); /* Delay flag intentionally  unused */
-		if (error)
-			goto out_nounlocks;
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
 		xfs_rwlock(bdp, locktype);
-		pos = xip->i_d.di_size;
+		if (error)
+			goto out_unlock_internal;
+		pos = xip->i_size;
 		ret = 0;
 		goto retry;
 	}
@@ -893,14 +901,10 @@ retry:
 	if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
 		*offset = isize;
 
-	if (*offset > xip->i_d.di_size) {
+	if (*offset > xip->i_size) {
 		xfs_ilock(xip, XFS_ILOCK_EXCL);
-		if (*offset > xip->i_d.di_size) {
-			xip->i_d.di_size = *offset;
-			i_size_write(inode, *offset);
-			xip->i_update_core = 1;
-			xip->i_update_size = 1;
-		}
+		if (*offset > xip->i_size)
+			xip->i_size = *offset;
 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
 	}
 
@@ -922,16 +926,31 @@ retry:
 
 		error = sync_page_range(inode, mapping, pos, ret);
 		if (!error)
-			error = ret;
-		return error;
+			error = -ret;
+		if (need_i_mutex)
+			mutex_lock(&inode->i_mutex);
+		xfs_rwlock(bdp, locktype);
 	}
 
  out_unlock_internal:
+	if (io->io_new_size) {
+		xfs_ilock(xip, XFS_ILOCK_EXCL);
+		io->io_new_size = 0;
+		/*
+		 * If this was a direct or synchronous I/O that failed (such
+		 * as ENOSPC) then part of the I/O may have been written to
+		 * disk before the error occured.  In this case the on-disk
+		 * file size may have been adjusted beyond the in-memory file
+		 * size and now needs to be truncated back.
+		 */
+		if (xip->i_d.di_size > xip->i_size)
+			xip->i_d.di_size = xip->i_size;
+		xfs_iunlock(xip, XFS_ILOCK_EXCL);
+	}
 	xfs_rwunlock(bdp, locktype);
  out_unlock_mutex:
 	if (need_i_mutex)
 		mutex_unlock(&inode->i_mutex);
- out_nounlocks:
 	return -error;
 }
 
-- 
cgit v1.2.3


From e6a0e9cdff79e1406e5653f759aaf9f59b7ce4c8 Mon Sep 17 00:00:00 2001
From: Tim Shimmin <tes@sgi.com>
Date: Tue, 8 May 2007 13:49:59 +1000
Subject: [XFS] Export via a function xfs_buftarg_list for use by kdb/xfsidbg.

SGI-PV: 963465
SGI-Modid: xfs-linux-melb:xfs-kern:28414a

Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 10 +++++++++-
 fs/xfs/linux-2.6/xfs_buf.h |  3 +++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs/xfs/linux-2.6')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 69e9e80735d..fe4f66a5af1 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1426,7 +1426,7 @@ xfs_free_bufhash(
 /*
  *	buftarg list for delwrite queue processing
  */
-LIST_HEAD(xfs_buftarg_list);
+static LIST_HEAD(xfs_buftarg_list);
 static DEFINE_SPINLOCK(xfs_buftarg_lock);
 
 STATIC void
@@ -1867,3 +1867,11 @@ xfs_buf_terminate(void)
 	ktrace_free(xfs_buf_trace_buf);
 #endif
 }
+
+#ifdef CONFIG_KDB_MODULES
+struct list_head *
+xfs_get_buftarg_list(void)
+{
+	return &xfs_buftarg_list;
+}
+#endif
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 9e8ef8fef39..b6241f6201a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -411,6 +411,9 @@ extern void xfs_free_buftarg(xfs_buftarg_t *, int);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
+#ifdef CONFIG_KDB_MODULES
+extern struct list_head *xfs_get_buftarg_list(void);
+#endif
 
 #define xfs_getsize_buftarg(buftarg)	block_size((buftarg)->bt_bdev)
 #define xfs_readonly_buftarg(buftarg)	bdev_read_only((buftarg)->bt_bdev)
-- 
cgit v1.2.3


From 71dfd5a396d11512aa6c8ed0d35b268bc084bb9b Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Tue, 8 May 2007 13:50:12 +1000
Subject: [XFS] Fix race in xfs_write() b/w dmapi callout and direct I/O
 checks.

In xfs_write() the iolock is dropped and reacquired in XFS_SEND_DATA()
which means that the file could change from not-cached to cached and we
need to redo the direct I/O checks. We should also redo the direct I/O
checks when the file size changes regardless if O_APPEND is set or not.

SGI-PV: 963483
SGI-Modid: xfs-linux-melb:xfs-kern:28440a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 53 +++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

(limited to 'fs/xfs/linux-2.6')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 82ab792c7fc..b2a1beb3388 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -724,34 +724,8 @@ start:
 		goto out_unlock_mutex;
 	}
 
-	if (ioflags & IO_ISDIRECT) {
-		xfs_buftarg_t	*target =
-			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp;
-
-		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
-			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-			return XFS_ERROR(-EINVAL);
-		}
-
-		if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
-			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-			iolock = XFS_IOLOCK_EXCL;
-			locktype = VRWLOCK_WRITE;
-			need_i_mutex = 1;
-			mutex_lock(&inode->i_mutex);
-			xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
-			goto start;
-		}
-	}
-
-	new_size = pos + count;
-	if (new_size > xip->i_size)
-		io->io_new_size = new_size;
-
 	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
 	    !(ioflags & IO_INVIS) && !eventsent)) {
-		loff_t		savedsize = pos;
 		int		dmflags = FILP_DELAY_FLAG(file);
 
 		if (need_i_mutex)
@@ -774,10 +748,35 @@ start:
 		 * event prevents another call to XFS_SEND_DATA, which is
 		 * what allows the size to change in the first place.
 		 */
-		if ((file->f_flags & O_APPEND) && savedsize != xip->i_size)
+		if ((file->f_flags & O_APPEND) && pos != xip->i_size)
 			goto start;
 	}
 
+	if (ioflags & IO_ISDIRECT) {
+		xfs_buftarg_t	*target =
+			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp;
+
+		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
+			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+			return XFS_ERROR(-EINVAL);
+		}
+
+		if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
+			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+			iolock = XFS_IOLOCK_EXCL;
+			locktype = VRWLOCK_WRITE;
+			need_i_mutex = 1;
+			mutex_lock(&inode->i_mutex);
+			xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
+			goto start;
+		}
+	}
+
+	new_size = pos + count;
+	if (new_size > xip->i_size)
+		io->io_new_size = new_size;
+
 	if (likely(!(ioflags & IO_INVIS))) {
 		file_update_time(file);
 		xfs_ichgtime_fast(xip, inode,
-- 
cgit v1.2.3


From f7c66ce3f70d8417de0cfb481ca4e5430382ec5d Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Tue, 8 May 2007 13:50:19 +1000
Subject: [XFS] Add lockdep support for XFS

SGI-PV: 963965
SGI-Modid: xfs-linux-melb:xfs-kern:28485a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/xfs/linux-2.6/mrlock.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs/xfs/linux-2.6')

diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h
index af168a1a98c..c110bb00266 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/linux-2.6/mrlock.h
@@ -43,6 +43,18 @@ static inline void mrupdate(mrlock_t *mrp)
 	mrp->mr_writer = 1;
 }
 
+static inline void mraccess_nested(mrlock_t *mrp, int subclass)
+{
+	down_read_nested(&mrp->mr_lock, subclass);
+}
+
+static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
+{
+	down_write_nested(&mrp->mr_lock, subclass);
+	mrp->mr_writer = 1;
+}
+
+
 static inline int mrtryaccess(mrlock_t *mrp)
 {
 	return down_read_trylock(&mrp->mr_lock);
-- 
cgit v1.2.3